#### Personal Background Information Model Fine-tuning
(helper_models) 

In [1]:
import pandas as pd

data = pd.read_csv('../datasets/raw/personal_background.tsv', sep='\t')
data["label"] = data["label"].replace({True: 'Yes', False: 'No'})
data

Unnamed: 0,label,text
0,No,You sound like an animal lover too. Any pets?
1,No,I do love animals. We currently have a dog and...
2,Yes,Same on the dog and cat. The dog was a stray ...
3,No,"We found our cat at the park one day, she was ..."
4,No,Awe! I bet she is sweet. What is her name? ...
...,...,...
7011,No,It's a lot of fun. I play quarterback.
7012,No,what's your favorite team? Mine is the Saints
7013,Yes,I liked watching the Saints win Superbowl XLIV...
7014,Yes,I think we may go back this year


In [2]:
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(data, test_size=0.20, shuffle=True, random_state=42)

In [3]:
label = train_df.label.unique()
# Create id2label and label2id dictionaries
id2label = {i: emotion for i, emotion in enumerate(label)}
label2id = {emotion: i for i, emotion in enumerate(label)}

train_text = train_df.text.values
train_label = [label2id[label] for label in train_df.label.values]

eval_text = valid_df.text.values
eval_label = [label2id[label] for label in valid_df.label.values]

In [4]:
def encodings(texts, tokenizer):
    encoded_data = tokenizer(
        [text for text in texts],
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=64
    )
    return encoded_data

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
train_encoding = encodings(train_text, tokenizer)
eval_encoding = encodings(eval_text, tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

train_dataset = CustomDataset(train_encoding, train_label)
eval_dataset = CustomDataset(eval_encoding, eval_label)

In [7]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")
recall = evaluate.load('recall')
precision = evaluate.load("precision")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc_score = accuracy.compute(predictions=predictions, references=labels)
    r_score = recall.compute(predictions=predictions, references=labels, average='macro')
    p_score = precision.compute(predictions=predictions, references=labels, average='macro')
    f1_score = f1.compute(predictions=predictions, references=labels, average='macro')

    metrics = {
        'accuracy':round(acc_score['accuracy'], 2),
        'precision':round(p_score['precision'], 2),
        'recall':round(r_score['recall'], 2),
        'f1':round(f1_score['f1'], 2)
    }
    return metrics


In [10]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, IntervalStrategy

batch_size = 64
training_args = TrainingArguments(
    output_dir="results-bg-10",
    learning_rate=2e-5,
    seed= 42,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps = 50,
    report_to="tensorboard",
    push_to_hub=False,
    save_total_limit=2,
    logging_dir='results-bg-10/logs',
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
)
early_stop = EarlyStoppingCallback(2, 1.0)
# early_stop = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stop]
)

trainer.train()
trainer.save_model(output_dir='results-bg-10/best-model')

  0%|          | 0/880 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                 
  6%|▌         | 50/880 [00:32<05:18,  2.61it/s]

{'eval_loss': 0.6151394248008728, 'eval_accuracy': 0.68, 'eval_precision': 0.68, 'eval_recall': 0.68, 'eval_f1': 0.68, 'eval_runtime': 3.3877, 'eval_samples_per_second': 414.435, 'eval_steps_per_second': 6.494, 'epoch': 0.57}


                                                 
 11%|█▏        | 100/880 [00:54<04:51,  2.67it/s]

{'eval_loss': 0.6062088012695312, 'eval_accuracy': 0.69, 'eval_precision': 0.69, 'eval_recall': 0.69, 'eval_f1': 0.69, 'eval_runtime': 3.0898, 'eval_samples_per_second': 454.4, 'eval_steps_per_second': 7.12, 'epoch': 1.14}


                                                 
 17%|█▋        | 150/880 [01:16<04:33,  2.67it/s]

{'eval_loss': 0.6003609299659729, 'eval_accuracy': 0.69, 'eval_precision': 0.69, 'eval_recall': 0.69, 'eval_f1': 0.69, 'eval_runtime': 4.0762, 'eval_samples_per_second': 344.441, 'eval_steps_per_second': 5.397, 'epoch': 1.7}


                                                 
 23%|██▎       | 200/880 [01:38<04:12,  2.69it/s]

{'eval_loss': 0.6344714164733887, 'eval_accuracy': 0.68, 'eval_precision': 0.69, 'eval_recall': 0.68, 'eval_f1': 0.68, 'eval_runtime': 2.9364, 'eval_samples_per_second': 478.129, 'eval_steps_per_second': 7.492, 'epoch': 2.27}


                                                 
 28%|██▊       | 250/880 [02:02<03:55,  2.67it/s]

{'eval_loss': 0.6115859746932983, 'eval_accuracy': 0.69, 'eval_precision': 0.69, 'eval_recall': 0.69, 'eval_f1': 0.69, 'eval_runtime': 5.9203, 'eval_samples_per_second': 237.151, 'eval_steps_per_second': 3.716, 'epoch': 2.84}


                                                 
 34%|███▍      | 300/880 [02:25<03:36,  2.68it/s]

{'eval_loss': 0.6319335699081421, 'eval_accuracy': 0.69, 'eval_precision': 0.69, 'eval_recall': 0.69, 'eval_f1': 0.69, 'eval_runtime': 3.3596, 'eval_samples_per_second': 417.908, 'eval_steps_per_second': 6.548, 'epoch': 3.41}


                                                 
 40%|███▉      | 350/880 [02:47<03:20,  2.64it/s]

{'eval_loss': 0.6404987573623657, 'eval_accuracy': 0.69, 'eval_precision': 0.69, 'eval_recall': 0.69, 'eval_f1': 0.69, 'eval_runtime': 3.0057, 'eval_samples_per_second': 467.105, 'eval_steps_per_second': 7.319, 'epoch': 3.98}


                                                 
 45%|████▌     | 400/880 [03:08<02:59,  2.68it/s]

{'eval_loss': 0.6731035113334656, 'eval_accuracy': 0.68, 'eval_precision': 0.68, 'eval_recall': 0.68, 'eval_f1': 0.68, 'eval_runtime': 2.9526, 'eval_samples_per_second': 475.516, 'eval_steps_per_second': 7.451, 'epoch': 4.55}


                                                 
 51%|█████     | 450/880 [03:30<02:43,  2.63it/s]

{'eval_loss': 0.7515609860420227, 'eval_accuracy': 0.66, 'eval_precision': 0.67, 'eval_recall': 0.66, 'eval_f1': 0.66, 'eval_runtime': 2.9635, 'eval_samples_per_second': 473.763, 'eval_steps_per_second': 7.424, 'epoch': 5.11}


 57%|█████▋    | 500/880 [03:49<02:27,  2.57it/s]

{'loss': 0.4998, 'learning_rate': 8.636363636363637e-06, 'epoch': 5.68}


                                                 
 57%|█████▋    | 500/880 [03:53<02:27,  2.57it/s]

{'eval_loss': 0.750919759273529, 'eval_accuracy': 0.67, 'eval_precision': 0.67, 'eval_recall': 0.67, 'eval_f1': 0.67, 'eval_runtime': 3.0534, 'eval_samples_per_second': 459.811, 'eval_steps_per_second': 7.205, 'epoch': 5.68}


                                                 
 62%|██████▎   | 550/880 [04:38<02:10,  2.52it/s]

{'eval_loss': 0.790925920009613, 'eval_accuracy': 0.67, 'eval_precision': 0.67, 'eval_recall': 0.67, 'eval_f1': 0.67, 'eval_runtime': 3.1059, 'eval_samples_per_second': 452.046, 'eval_steps_per_second': 7.083, 'epoch': 6.25}


                                                 
 68%|██████▊   | 600/880 [05:01<02:20,  1.99it/s]


{'eval_loss': 0.8316775560379028, 'eval_accuracy': 0.65, 'eval_precision': 0.65, 'eval_recall': 0.65, 'eval_f1': 0.65, 'eval_runtime': 3.0719, 'eval_samples_per_second': 457.048, 'eval_steps_per_second': 7.162, 'epoch': 6.82}
{'train_runtime': 301.854, 'train_samples_per_second': 185.918, 'train_steps_per_second': 2.915, 'train_loss': 0.46509509086608886, 'epoch': 6.82}


In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import logging

# Set the logging level to WARNING or higher to suppress INFO messages
logging.getLogger("transformers").setLevel(logging.WARNING)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained('results-bg-10/best-model')

new_text = "Both are excellent technology they are helpful in many ways. For the security purpose both are super."
inputs = tokenizer(new_text, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

probabilities = torch.softmax(logits, dim=1)
predicted_class_index = torch.argmax(probabilities, dim=1).item()

class_labels = ["Yes", "No"]

predicted_class = class_labels[predicted_class_index]
predicted_probabilities = probabilities[0].tolist()

print("Predicted Class:", predicted_class)
print("Class Probabilities:")
# for label, prob in zip(class_labels, predicted_probabilities):
#     print(f"{label}: {prob:.4f}")
sorted_probabilities = sorted(
    zip(class_labels, predicted_probabilities),
    key=lambda x: x[1],
    reverse=True
)

for label, prob in sorted_probabilities:
    print(f"{label}: {prob:.4f}")


Predicted Class: No
Class Probabilities:
No: 0.5286
Yes: 0.4714
