In [4]:
import torch
from transformers import AutoTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, load_dataset

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [2]:
# Colab-only: mount Google Drive at /content/drive so that the project folder
# referenced by later cells is reachable from the notebook's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Project root on the mounted Drive. Later cells append 'files/...' to this
# string, so the trailing slash is required.
file_path = '/content/drive/My Drive/ECE1508(F3)-RL-Project/'

In [5]:

# Base checkpoint shared by the tokenizer and the classifier
base_checkpoint = 'distilbert-base-uncased'

# Class index -> label name; the inverse mapping is derived from it below
class_names = {0: 'NEG', 1: 'POS'}

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# load pre-trained DistilBERT sequence classification model
model = DistilBertForSequenceClassification.from_pretrained(
    base_checkpoint,
    id2label=class_names,
    label2id={label: index for index, label in class_names.items()},
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Show the module tree; the names printed here (distilbert.transformer.layer,
# pre_classifier, classifier) are the ones used by the freezing cells below.
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [7]:
# Start from a fully frozen network; the cells that follow re-enable
# gradients only for the parts that should be fine-tuned.

for weight in model.parameters():
    weight.requires_grad_(False)

In [8]:
# Re-enable gradients for the classification head, i.e. the 'pre_classifier'
# and 'classifier' layers (in that order).

for head_module in (model.pre_classifier, model.classifier):
    for weight in head_module.parameters():
        weight.requires_grad_(True)


In [9]:
# Re-enable gradients for layer 5, the last of the six transformer blocks

last_block = model.distilbert.transformer.layer[-1]
for weight in last_block.parameters():
    weight.requires_grad_(True)



In [10]:
# Re-enable gradients for layer 4, the second-to-last transformer block

second_to_last_block = model.distilbert.transformer.layer[-2]
for weight in second_to_last_block.parameters():
    weight.requires_grad_(True)


In [11]:


# List every named parameter together with its status, so the freezing done
# in the previous cells can be checked by eye.
print("\nTrainable layers:")
for name, param in model.named_parameters():
    print(f"  Trainable: {name}" if param.requires_grad else f"  Frozen:    {name}")


Trainable layers:
  Frozen:    distilbert.embeddings.word_embeddings.weight
  Frozen:    distilbert.embeddings.position_embeddings.weight
  Frozen:    distilbert.embeddings.LayerNorm.weight
  Frozen:    distilbert.embeddings.LayerNorm.bias
  Frozen:    distilbert.transformer.layer.0.attention.q_lin.weight
  Frozen:    distilbert.transformer.layer.0.attention.q_lin.bias
  Frozen:    distilbert.transformer.layer.0.attention.k_lin.weight
  Frozen:    distilbert.transformer.layer.0.attention.k_lin.bias
  Frozen:    distilbert.transformer.layer.0.attention.v_lin.weight
  Frozen:    distilbert.transformer.layer.0.attention.v_lin.bias
  Frozen:    distilbert.transformer.layer.0.attention.out_lin.weight
  Frozen:    distilbert.transformer.layer.0.attention.out_lin.bias
  Frozen:    distilbert.transformer.layer.0.sa_layer_norm.weight
  Frozen:    distilbert.transformer.layer.0.sa_layer_norm.bias
  Frozen:    distilbert.transformer.layer.0.ffn.lin1.weight
  Frozen:    distilbert.transformer.lay

In [30]:
# load training dataset

# Both splits are pulled from the same Hugging Face dataset repository
sarcasm_repo = 'marcbishara/sarcasm-on-reddit'

train_dataset = load_dataset(sarcasm_repo, split='reward_train')
val_dataset = load_dataset(sarcasm_repo, split='reward_validation')


In [31]:
# Row count of the validation split as loaded, before the cleaning cell below
len(val_dataset)

30325

In [32]:
# Row count of the training split as loaded, before the cleaning cell below
len(train_dataset)

272922

In [33]:
# data cleaning (NEW)

# Metadata columns that are not used for classification and are dropped
columns_to_drop = ['author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc']


def _has_comment_pair(row):
    """Return True when both 'comment' and 'parent_comment' are neither None nor ""."""
    return all(
        row[field] is not None and row[field] != ""
        for field in ('comment', 'parent_comment')
    )


def _clean_split(split):
    """Drop the unused metadata columns, then the rows with a missing text field.

    Only columns that are still present are removed. The cell reassigns
    train_dataset / val_dataset, so without this guard a second run of the
    cell would fail inside remove_columns because the columns are already gone.
    """
    still_present = [name for name in columns_to_drop if name in split.column_names]
    if still_present:
        split = split.remove_columns(still_present)
    return split.filter(_has_comment_pair)


train_dataset = _clean_split(train_dataset)
val_dataset = _clean_split(val_dataset)


Filter:   0%|          | 0/272922 [00:00<?, ? examples/s]

Filter:   0%|          | 0/30325 [00:00<?, ? examples/s]

In [34]:
# Check the remaining columns and the row count of the cleaned training split
print(train_dataset)

Dataset({
    features: ['label', 'comment', 'parent_comment'],
    num_rows: 272902
})


In [35]:
# Check the remaining columns and the row count of the cleaned validation split
print(val_dataset)

Dataset({
    features: ['label', 'comment', 'parent_comment'],
    num_rows: 30324
})


In [73]:
# Tokenization helper for the datasets: 'parent_comment' and 'comment' are
# encoded together as one pair, separated by the [SEP] token.

def tokenize_function(example):
    """Encode (parent_comment, comment) as a sentence pair of exactly 128 tokens.

    Args:
        example: mapping with 'parent_comment' and 'comment' entries; it is
            called through Dataset.map(batched=True) in this notebook.

    Returns:
        The tokenizer output, truncated and padded to max_length=128, as
        PyTorch tensors.
    """
    # NOTE(review): return_tensors='pt' is probably redundant when this runs
    # under Dataset.map, which stores results in its own format - confirm
    # before removing, since direct callers would then get lists.
    encoding_options = dict(
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    first_segment = example['parent_comment']
    second_segment = example['comment']
    return tokenizer(first_segment, second_segment, **encoding_options)

In [74]:
# Apply tokenize_function to the cleaned training split, one batch at a time
train_tok = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/272902 [00:00<?, ? examples/s]

In [77]:
# Apply tokenize_function to the cleaned validation split, one batch at a time
val_tok = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/30324 [00:00<?, ? examples/s]

In [82]:
# fine-tune training of model

# Per-epoch checkpoints are written under the project folder on Drive
checkpoint_dir = f"{file_path}files/SarcasmClassifierModel_final"

# set training arguments
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    # optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    fp16=True,
    # batch size raised to 32 because the partially-frozen model uses less memory
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # evaluate and checkpoint once per epoch, then restore the best checkpoint
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # no upload to the Hugging Face Hub and no external experiment loggers
    push_to_hub=False,
    report_to='none',
)

# define compute metrics
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation pass.

    Args:
        pred: a (logits, labels) pair as handed over by the Trainer.

    Returns:
        dict with the keys 'Accuracy', 'F1', 'Precision' and 'Recall'.
    """
    raw_scores, gold = pred
    # predicted class = index of the largest logit along the last axis
    guesses = np.argmax(raw_scores, axis=-1)

    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        gold, guesses, average='macro'
    )
    return dict(
        Accuracy=accuracy_score(gold, guesses),
        F1=macro_f1,
        Precision=macro_precision,
        Recall=macro_recall,
    )


# Wire the partially-frozen model, the tokenized splits and the metric
# function into a Trainer. No tokenizer or data collator is passed here.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_tok,
    train_dataset=train_tok,
)

In [83]:
# Run the fine-tuning loop
train_output = trainer.train()

# Score the model on the validation split and display the metrics dict
eval_metrics = trainer.evaluate()
eval_metrics



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5344,0.518586,0.737996,0.737327,0.741325,0.738463
2,0.4898,0.50919,0.749538,0.7495,0.749539,0.749487
3,0.4496,0.512484,0.753595,0.753575,0.753877,0.75372


{'eval_loss': 0.5091903209686279,
 'eval_Accuracy': 0.7495383194829178,
 'eval_F1': 0.7494996010165715,
 'eval_Precision': 0.7495387401612217,
 'eval_Recall': 0.7494867645525416,
 'eval_runtime': 11.9868,
 'eval_samples_per_second': 2529.791,
 'eval_steps_per_second': 79.087,
 'epoch': 3.0}

In [84]:
# Store the fine-tuned model and the tokenizer in the same Drive folder
model_save_path = f"{file_path}files/TrainedSarcasmClassifierModel_final"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel_final/tokenizer_config.json',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel_final/special_tokens_map.json',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel_final/vocab.txt',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel_final/added_tokens.json',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel_final/tokenizer.json')

In [86]:

# Smoke test: classify one (parent comment, reply) pair with the fine-tuned model
text1 = "Time is the best teacher"
text2 = "Unfortunately it kills all its students"

# Encode the pair with the same truncation limit that was used for training
inputs = tokenizer(text1, text2, return_tensors='pt', truncation=True, max_length=128)

# Move inputs to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Inference only: switch dropout off explicitly instead of relying on the mode
# left behind by earlier cells, and skip building the autograd graph.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1).item()

print(outputs)
print('predictions: ', predictions)


SequenceClassifierOutput(loss=None, logits=tensor([[-0.3979,  0.3115]], device='cuda:0', grad_fn=<ToCopyBackward0>), hidden_states=None, attentions=None)
predictions:  1
