In [1]:
import torch
from transformers import AutoTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
file_path = '/content/drive/My Drive/ECE1508(F3)-RL-Project/'

In [6]:

# training checkpoint path
# checkpoint_path = f'files/SarcasmClassifierModel/checkpoint-2138'

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
# tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# load pre-trained DistilBERT sequence classification model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', id2label={0:'NEG',1:'POS'},label2id={'NEG':0,'POS':1})
# model = DistilBertForSequenceClassification.from_pretrained(checkpoint_path)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# load training dataset

## parent and child reddit comment, labeled as sarcastic or not

data_total = pd.read_csv(file_path+'sarcasm_datasets/train-balanced-sarcasm.csv')



In [35]:
print(data_total)

         label                                            comment  \
0            0                                         NC and NH.   
1            0  You do know west teams play against west teams...   
2            0  They were underdogs earlier today, but since G...   
3            0  This meme isn't funny none of the "new york ni...   
4            0                    I could use one of those tools.   
...        ...                                                ...   
1010821      1  I'm sure that Iran and N. Korea have the techn...   
1010822      1                 whatever you do, don't vote green!   
1010823      1  Perhaps this is an atheist conspiracy to make ...   
1010824      1  The Slavs got their own country - it is called...   
1010825      1  values, as in capitalism .. there is good mone...   

                 author           subreddit  score  ups  downs     date  \
0             Trumpbart            politics      2   -1     -1  2016-10   
1             Shbshb9

In [36]:
# data cleaning

# define the list of column names to drop
columns_to_drop = ['author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc']

# drop columns
data_total = data_total.drop(columns=columns_to_drop)

# drop rows wtih missing values
data_total = data_total.dropna()

print(data_total)



         label                                            comment  \
0            0                                         NC and NH.   
1            0  You do know west teams play against west teams...   
2            0  They were underdogs earlier today, but since G...   
3            0  This meme isn't funny none of the "new york ni...   
4            0                    I could use one of those tools.   
...        ...                                                ...   
1010821      1  I'm sure that Iran and N. Korea have the techn...   
1010822      1                 whatever you do, don't vote green!   
1010823      1  Perhaps this is an atheist conspiracy to make ...   
1010824      1  The Slavs got their own country - it is called...   
1010825      1  values, as in capitalism .. there is good mone...   

                                            parent_comment  
0        Yeah, I get that argument. At this point, I'd ...  
1        The blazers and Mavericks (The wests 5 a

In [37]:
# split the data to be 64% training, 16% validation and 20% test data
data_train_and_val, data_test = train_test_split(data_total, test_size=0.2, random_state=42)
data_train, data_val = train_test_split(data_train_and_val, test_size=0.2, random_state=42)

print(data_train.shape, data_val.shape, data_test.shape)


# tokenize datasets
# string 'parent_comment' combined with 'comment', separated by [SEP] token
tr_tok = tokenizer(data_train['parent_comment'].tolist(), data_train['comment'].tolist(), return_tensors='pt', truncation=True, padding=True, max_length=128)
val_tok = tokenizer(data_val['parent_comment'].tolist(), data_val['comment'].tolist(), return_tensors='pt', truncation=True, padding=True, max_length=128)
test_tok = tokenizer(data_test['parent_comment'].tolist(), data_test['comment'].tolist(), return_tensors='pt', truncation=True, padding=True, max_length=128)

# add tokenized outputs as new columns in dfs
data_train['input_ids'] = tr_tok['input_ids'].tolist()
data_train['attention_mask'] = tr_tok['attention_mask'].tolist()

data_val['input_ids'] = val_tok['input_ids'].tolist()
data_val['attention_mask'] = val_tok['attention_mask'].tolist()

data_test['input_ids'] = test_tok['input_ids'].tolist()
data_test['attention_mask'] = test_tok['attention_mask'].tolist()

# convert to Hugging Face datasets
dataset_train = Dataset.from_pandas(data_train)
dataset_val = Dataset.from_pandas(data_val)
dataset_test = Dataset.from_pandas(data_test)

(646892, 3) (161724, 3) (202155, 3)


In [38]:
# fine-tune training of model

# set training arguments
training_args = TrainingArguments(
    output_dir=file_path+'files/SarcasmClassifierModel2',
    learning_rate=2e-5,
    per_device_train_batch_size=16, ### TODO: Make batch size larger for faster training?
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    push_to_hub=False, # do not intend to upload model to Hugging Face during training
    report_to='none',
    fp16=True,
)

# define compute metrics
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)

    # labels = pred.label_ids
    # preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }


# initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    # processing_class=tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [39]:
# train model
trainer.train()
# trainer.train(resume_from_checkpoint=checkpoint_path)

# evaluate model
trainer.evaluate()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4789,0.477196,0.772681,0.772671,0.772764,0.772702


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4789,0.477196,0.772681,0.772671,0.772764,0.772702
2,0.4059,0.48134,0.777776,0.777772,0.777824,0.777791


{'eval_loss': 0.4771955609321594,
 'eval_Accuracy': 0.7726806163587346,
 'eval_F1': 0.772671233470531,
 'eval_Precision': 0.7727644592561771,
 'eval_Recall': 0.7727017421997169,
 'eval_runtime': 100.4138,
 'eval_samples_per_second': 1610.575,
 'eval_steps_per_second': 100.663,
 'epoch': 2.0}

In [40]:
# save model and tokenizer
model_save_path =file_path+'files/TrainedSarcasmClassifierModel2'
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel2/tokenizer_config.json',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel2/special_tokens_map.json',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel2/vocab.txt',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel2/added_tokens.json',
 '/content/drive/My Drive/ECE1508(F3)-RL-Project/files/TrainedSarcasmClassifierModel2/tokenizer.json')

In [54]:

# test code below
text1 = "Time is the best teacher"
text2 = "Unfortunately it kills all its students!"
inputs = tokenizer(text1, text2, return_tensors='pt')

# Move inputs to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}

outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=-1).item()

print(outputs)
print('predictions: ', predictions)


SequenceClassifierOutput(loss=None, logits=tensor([[-0.6167,  0.6113]], device='cuda:0', grad_fn=<ToCopyBackward0>), hidden_states=None, attentions=None)
predictions:  1
