In [None]:
import torch
from transformers import AutoTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
# Attach Google Drive at /content/drive; the project folder used by the
# following cells lives underneath this mount point.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Drive. The trailing slash is required: other
# cells build paths by plain string concatenation (file_path + '...').
file_path = '/content/drive/My Drive/ECE1508(F3)-RL-Project/'

In [None]:

# Hub id of the pretrained weights; the tokenizer and the model both load from it.
# (An earlier run resumed from a saved training checkpoint instead:
#  checkpoint_path = f'files/SarcasmClassifierModel/checkpoint-2138',
#  passed to both from_pretrained calls in place of the hub id.)
pretrained_name = 'distilbert-base-uncased'

# Class-index <-> label-name mappings handed to the model.
id2label = {0: 'NEG', 1: 'POS'}
label2id = {'NEG': 0, 'POS': 1}

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

# load pre-trained DistilBERT sequence classification model; the
# pre_classifier/classifier weights are reported as newly initialized on load,
# so the model has to be fine-tuned before its predictions mean anything
model = DistilBertForSequenceClassification.from_pretrained(
    pretrained_name,
    id2label=id2label,
    label2id=label2id,
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Print the module tree; the attribute names shown here (distilbert.transformer.layer,
# pre_classifier, classifier) are the ones used to freeze/unfreeze parameters.
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# freeze ALL model parameters: turn gradient tracking off for the whole module
# (the trailing ';' stops the notebook from echoing the returned module)
model.requires_grad_(False);

In [None]:
# unfreeze classification head: re-enable gradients on the 'pre_classifier'
# and 'classifier' layers, in that order
for head_module in (model.pre_classifier, model.classifier):
    head_module.requires_grad_(True)


In [None]:
# unfreeze layer 5: index -1 is the last of the 6 transformer blocks
# (the trailing ';' stops the notebook from echoing the returned module)
final_block = model.distilbert.transformer.layer[-1]
final_block.requires_grad_(True);



In [None]:
# unfreeze layer 4: index -2 is the second-to-last transformer block
for weight in model.distilbert.transformer.layer[-2].parameters():
    weight.requires_grad_(True)


In [None]:


# List every named parameter together with its gradient status so the
# freeze/unfreeze configuration can be checked by eye.
print("\nTrainable layers:")
for param_name, tensor in model.named_parameters():
    if not tensor.requires_grad:
        print(f"  Frozen:    {param_name}")
        continue
    print(f"  Trainable: {param_name}")


Trainable layers:
  Frozen:    distilbert.embeddings.word_embeddings.weight
  Frozen:    distilbert.embeddings.position_embeddings.weight
  Frozen:    distilbert.embeddings.LayerNorm.weight
  Frozen:    distilbert.embeddings.LayerNorm.bias
  Frozen:    distilbert.transformer.layer.0.attention.q_lin.weight
  Frozen:    distilbert.transformer.layer.0.attention.q_lin.bias
  Frozen:    distilbert.transformer.layer.0.attention.k_lin.weight
  Frozen:    distilbert.transformer.layer.0.attention.k_lin.bias
  Frozen:    distilbert.transformer.layer.0.attention.v_lin.weight
  Frozen:    distilbert.transformer.layer.0.attention.v_lin.bias
  Frozen:    distilbert.transformer.layer.0.attention.out_lin.weight
  Frozen:    distilbert.transformer.layer.0.attention.out_lin.bias
  Frozen:    distilbert.transformer.layer.0.sa_layer_norm.weight
  Frozen:    distilbert.transformer.layer.0.sa_layer_norm.bias
  Frozen:    distilbert.transformer.layer.0.ffn.lin1.weight
  Frozen:    distilbert.transformer.lay

In [None]:
# load training dataset
# Each row holds a reddit comment and its parent comment, labeled as sarcastic or not.
sarcasm_csv = file_path + 'sarcasm_datasets/train-balanced-sarcasm.csv'
data_total = pd.read_csv(sarcasm_csv)



In [None]:
# Preview the raw frame; pandas abbreviates the printed rows with '...'.
print(data_total)

         label                                            comment  \
0            0                                         NC and NH.   
1            0  You do know west teams play against west teams...   
2            0  They were underdogs earlier today, but since G...   
3            0  This meme isn't funny none of the "new york ni...   
4            0                    I could use one of those tools.   
...        ...                                                ...   
1010821      1  I'm sure that Iran and N. Korea have the techn...   
1010822      1                 whatever you do, don't vote green!   
1010823      1  Perhaps this is an atheist conspiracy to make ...   
1010824      1  The Slavs got their own country - it is called...   
1010825      1  values, as in capitalism .. there is good mone...   

                 author           subreddit  score  ups  downs     date  \
0             Trumpbart            politics      2   -1     -1  2016-10   
1             Shbshb9

In [None]:
# data cleaning

# Metadata columns that the tokenization/training cells never read.
columns_to_drop = ['author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc']

# drop columns
# errors='ignore' keeps this cell re-runnable: data_total is reassigned here, so
# on a second run the columns are already gone and a plain drop() would raise
# KeyError.
data_total = data_total.drop(columns=columns_to_drop, errors='ignore')

# drop rows with missing values
data_total = data_total.dropna()

print(data_total)



         label                                            comment  \
0            0                                         NC and NH.   
1            0  You do know west teams play against west teams...   
2            0  They were underdogs earlier today, but since G...   
3            0  This meme isn't funny none of the "new york ni...   
4            0                    I could use one of those tools.   
...        ...                                                ...   
1010821      1  I'm sure that Iran and N. Korea have the techn...   
1010822      1                 whatever you do, don't vote green!   
1010823      1  Perhaps this is an atheist conspiracy to make ...   
1010824      1  The Slavs got their own country - it is called...   
1010825      1  values, as in capitalism .. there is good mone...   

                                            parent_comment  
0        Yeah, I get that argument. At this point, I'd ...  
1        The blazers and Mavericks (The wests 5 a

In [None]:
# split the data to be 64% training, 16% validation and 20% test data
data_train_and_val, data_test = train_test_split(data_total, test_size=0.2, random_state=42)
data_train, data_val = train_test_split(data_train_and_val, test_size=0.2, random_state=42)

print(data_train.shape, data_val.shape, data_test.shape)


def tokenize_pairs(frame, max_length=128):
    """Tokenize every row of ``frame`` as the text pair (parent_comment, comment).

    string 'parent_comment' combined with 'comment', separated by [SEP] token.
    Sequences are truncated to ``max_length`` and padded to the longest
    sequence in ``frame``.

    No ``return_tensors`` is requested: the encodings are only ever stored as
    per-row Python lists, so building whole-split tensors just to call
    ``.tolist()`` on them would be wasted memory and time.

    Args:
        frame: DataFrame with 'parent_comment' and 'comment' string columns.
        max_length: maximum number of tokens per encoded pair.

    Returns:
        The tokenizer output; its 'input_ids' and 'attention_mask' entries are
        lists with one inner list per row of ``frame``.
    """
    return tokenizer(
        frame['parent_comment'].tolist(),
        frame['comment'].tolist(),
        truncation=True,
        padding=True,
        max_length=max_length,
    )


def with_token_columns(frame, encoded):
    """Return a copy of ``frame`` with 'input_ids' and 'attention_mask' columns added.

    Working on an explicit copy avoids writing into the frames handed back by
    train_test_split (which can raise SettingWithCopyWarning on some
    pandas/scikit-learn versions) and leaves the caller's frame untouched.

    Args:
        frame: DataFrame that was passed to ``tokenize_pairs``.
        encoded: the matching tokenizer output.

    Returns:
        A new DataFrame with the two token columns appended.
    """
    enriched = frame.copy()
    enriched['input_ids'] = encoded['input_ids']
    enriched['attention_mask'] = encoded['attention_mask']
    return enriched


# tokenize datasets
tr_tok = tokenize_pairs(data_train)
val_tok = tokenize_pairs(data_val)
test_tok = tokenize_pairs(data_test)

# add tokenized outputs as new columns in dfs
data_train = with_token_columns(data_train, tr_tok)
data_val = with_token_columns(data_val, val_tok)
data_test = with_token_columns(data_test, test_tok)

# convert to Hugging Face datasets
dataset_train = Dataset.from_pandas(data_train)
dataset_val = Dataset.from_pandas(data_val)
dataset_test = Dataset.from_pandas(data_test)

(646892, 3) (161724, 3) (202155, 3)


In [None]:
# fine-tune training of model

# Hyperparameters and bookkeeping for the Trainer run.
training_args = TrainingArguments(
    # --- where checkpoints go / what gets reported ---
    output_dir=file_path+'files/SarcasmClassifierModel2',
    push_to_hub=False,  # do not intend to upload model to Hugging Face during training
    report_to='none',
    # --- optimisation ---
    # alternatives noted by the author: 1e-3 (a higher rate for the freshly
    # initialised classifier head) or 2e-5
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=32,  # larger batch; most of the model is frozen
    per_device_eval_batch_size=32,
    fp16=True,
    # --- evaluate and checkpoint once per epoch; reload the best checkpoint at the end ---
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# define compute metrics
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        pred: object that unpacks into ``(logits, labels)``; the predicted
            class of each example is the argmax of its logits over the last axis.

    Returns:
        dict with the keys 'Accuracy', 'F1', 'Precision' and 'Recall'.
    """
    logits, labels = pred
    predicted_classes = np.argmax(logits, axis=-1)

    # The fourth element of the result (support) is not reported.
    scores = precision_recall_fscore_support(labels, predicted_classes, average='macro')
    macro_precision, macro_recall, macro_f1 = scores[:3]

    return {
        'Accuracy': accuracy_score(labels, predicted_classes),
        'F1': macro_f1,
        'Precision': macro_precision,
        'Recall': macro_recall,
    }


# initialize trainer
# No tokenizer (processing_class) or data_collator is passed: the datasets
# already carry padded 'input_ids' / 'attention_mask' columns.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)

In [None]:
# train model
trainer.train()
# trainer.train(resume_from_checkpoint=checkpoint_path)

# evaluate model on the validation split (the Trainer's default eval_dataset)
val_metrics = trainer.evaluate()

# Also score the held-out test split. The validation split is what the
# per-epoch evaluation (and therefore the best-checkpoint choice) was based on,
# so validation metrics alone are an optimistic estimate. The 'test' prefix
# keeps these keys distinct from the 'eval_*' validation keys.
test_metrics = trainer.evaluate(eval_dataset=dataset_test, metric_key_prefix='test')

# Last expression of the cell -> displayed as its output.
{'validation': val_metrics, 'test': test_metrics}



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4982,0.496166,0.756394,0.75637,0.75655,0.756423
2,0.4623,0.482847,0.765353,0.765343,0.76537,0.765342
3,0.4333,0.48532,0.768068,0.768067,0.768087,0.768078


{'eval_loss': 0.48284727334976196,
 'eval_Accuracy': 0.7653533179985654,
 'eval_F1': 0.765343122728193,
 'eval_Precision': 0.7653700274705105,
 'eval_Recall': 0.7653415600068902,
 'eval_runtime': 61.8083,
 'eval_samples_per_second': 2616.542,
 'eval_steps_per_second': 81.769,
 'epoch': 3.0}

In [None]:
# save model and tokenizer: both are written into the same directory
model_save_path = file_path + 'files/TrainedSarcasmClassifierModel4'
trainer.save_model(output_dir=model_save_path)
tokenizer.save_pretrained(save_directory=model_save_path)

('/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel4/tokenizer_config.json',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel4/special_tokens_map.json',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel4/vocab.txt',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel4/added_tokens.json',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel4/tokenizer.json')

In [None]:

# test code below: classify a single (parent comment, reply) pair
text1 = "Time is the best teacher"
text2 = "Unfortunately it kills all its students!"
# Same truncation limit as the training data so over-long inputs cannot
# exceed what the model was fine-tuned on.
inputs = tokenizer(text1, text2, return_tensors='pt', truncation=True, max_length=128)

# Move inputs to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Inference only: eval() switches the dropout layers off so the prediction is
# deterministic, and no_grad() skips building the autograd graph (without it
# the returned logits carry a grad_fn).
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1).item()

print(outputs)
print('predictions: ', predictions)


SequenceClassifierOutput(loss=None, logits=tensor([[-0.3035,  0.4644]], device='cuda:0', grad_fn=<ToCopyBackward0>), hidden_states=None, attentions=None)
predictions:  1
