# Domain Adaptation

In this notebook, we are going to perform domain adaptation on the distilbert model, using our dataset. Instead of just performing the regular fine-tuning, we are going to use the Masked Language Model (MLM) objective to train the model. 

The idea is to train the model in this way, and then see if it can perform better than the regular fine-tuning approach.

[Reference](https://towardsdatascience.com/fine-tuning-for-domain-adaptation-in-nlp-c47def356fd6)

In [1]:
import multiprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import warnings
import time

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from datasets import Dataset
from datasets import DatasetDict
from datasets import load_metric

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import TextClassificationPipeline
from transformers import pipeline
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import DataCollatorForLanguageModeling


2024-05-18 20:50:33.383656: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-18 20:50:33.642463: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 20:50:33.642644: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 20:50:33.678317: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-18 20:50:33.748706: I tensorflow/core/platform/cpu_feature_guar

In [2]:
df = pd.read_pickle("data/data_original.pkl")

down_sample_percentage = 0.5

df = df.sample(frac=down_sample_percentage/100, random_state=1)

print(df.shape)
df.head()

(2084, 2)


Unnamed: 0,text,emotions
98790,i wanted them to feel now i feel as though i a...,sadness
18398,i found myself feeling very sympathetic toward...,love
139349,ive been feeling pretty good today and tonight...,joy
13416,i can use to cover my ass when i feel inadequate,sadness
22297,i feel like ive pissed myself again,anger


In [3]:
dataset = Dataset.from_pandas(df)

dataset = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = dataset['train']
test_dataset = dataset['test']
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['text', 'emotions', '__index_level_0__'],
    num_rows: 1667
})
Dataset({
    features: ['text', 'emotions', '__index_level_0__'],
    num_rows: 417
})


In [4]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')

In [5]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, return_special_tokens_mask=True)

column_names = train_dataset.column_names

train_dataset = train_dataset.map(
    tokenize_function, 
    batched=True,
    num_proc=4, # multiprocessing.cpu_count() I am not sure if this is a good idea, since it might use all cores?
    remove_columns=column_names
)

test_dataset = test_dataset.map(
    tokenize_function, 
    batched=True,
    num_proc=4,
    remove_columns=column_names
)

Map (num_proc=4):   0%|          | 0/1667 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/417 [00:00<?, ? examples/s]

In [6]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="./domain-model",
    learning_rate=2e-5,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
trainer.save_model("./domain-model/distilbert-emotions")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/209 [00:00<?, ?it/s]

  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 2.7863354682922363, 'eval_runtime': 215.3701, 'eval_samples_per_second': 1.936, 'eval_steps_per_second': 0.246, 'epoch': 1.0}


There were missing keys in the checkpoint model loaded: ['vocab_projector.weight'].


{'train_runtime': 3337.4522, 'train_samples_per_second': 0.499, 'train_steps_per_second': 0.063, 'train_loss': 2.976363624682267, 'epoch': 1.0}


In [9]:
trainer.save_model("./domain-model/distilbert-emotions")

In [8]:
tokenizer.save_pretrained("./domain-model/distilbert-emotions")

('./domain-model/distilbert-emotions/tokenizer_config.json',
 './domain-model/distilbert-emotions/special_tokens_map.json',
 './domain-model/distilbert-emotions/vocab.txt',
 './domain-model/distilbert-emotions/added_tokens.json')

In [7]:
trainer.evaluate()

  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 2.8271596431732178,
 'eval_runtime': 218.0222,
 'eval_samples_per_second': 1.913,
 'eval_steps_per_second': 0.243,
 'epoch': 1.0}