# Dependencies and Setup

In [None]:
!pip install transformers
!pip install datasets transformers[SentencePiece]

In [3]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Project folder relative to the Drive root; used below for both the import
# path and the working directory.
FOLDERNAME = 'CS682/CS682-Project/Final'

# Put the project's `code` directory on the import path.
import sys
sys.path.append('/content/drive/My Drive/{}/code'.format(FOLDERNAME))

# Enable autoreload (mode 2) so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Switch to the project folder: later cells use paths relative to it
# (e.g. 'cs682-humor-offense/data/train.csv').
# NOTE(review): the Drive folder is spelled `My\Drive` here but 'My Drive' in
# the sys.path entry above -- confirm both resolve to the same directory.
%cd /content/drive/My\Drive/$FOLDERNAME

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/17sOv4AaDBYiDb5d0-jvQZQFiRXJ7Ui5-/project-shared/Final


In [None]:
from datasets import *
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AdamW, DataCollatorWithPadding, get_scheduler
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from solver import *
from models import *
from rmse import *
from utils import *
from log_control import *
from transformers import logging
from datetime import date
# Today's date for this run. Note that this rebinds the imported `date` class
# to a date instance.
date = date.today()
# Registry of trained models and their metrics; the training cells key it by
# (experiment, task, checkpoint).
model_lib = dict()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Single-Task | Base Model

The next 2 cells are meant to be run together

In [None]:
# Single-task fine-tuning (no LSTM decoder): trains one model per
# (checkpoint, task) pair, evaluates it on the validation split and writes the
# predictions and a metrics summary to CSV files named after `experiment`.

# Only show errors from third-party libraries so the training log stays readable.
set_global_logging_level(logging.ERROR, ["transformers", "nlp", "torch", "tensorflow", "tensorboard", "wandb", "datasets"])
logging.set_verbosity_error()

experiment = 'single_base_5' # experiment name; also the stem of the output CSV files
# Pre-trained checkpoints available for this experiment.
all_checkpoints = ['bert-base-uncased','roberta-base', 'microsoft/deberta-base', 'albert-base-v2', 'nghuyong/ernie-2.0-en']
# Checkpoints trained by this run; assign `all_checkpoints` to train every encoder.
checkpoints = ['nghuyong/ernie-2.0-en']
tasks = ['is_humor', 'humor_rating', 'humor_controversy', 'offense_rating'] # target columns, one model each
setup = 'single' # 'multi' if multi-task model, otherwise 'single'
fc_dim = 256 # hidden layer size of the classifier
has_decoder = False # no LSTM layer for this experiment
batch_size = 24 # batch size for both dataloaders
epochs = 2


df_train, df_val, df_test = get_split_dfs('cs682-humor-offense/data/train.csv')
raw_datasets = DatasetDict({'train':Dataset.from_pandas(df_train), 'validation':Dataset.from_pandas(df_val), 'test':Dataset.from_pandas(df_test)})
new_df_val = df_val.copy() # validation frame that collects one prediction column per (checkpoint, task)

# Iterate over every (pre-trained model, task) combination.
for checkpoint in checkpoints:
  # The tokenizer and padding collator depend only on the checkpoint, so they
  # are built once per encoder instead of once per task.
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  for task in tasks:

    # Fresh model and optimizer for each task.
    model = base_model(checkpoint=checkpoint,setup=setup,task=task,has_decoder=has_decoder,fc_dim=fc_dim)
    optimizer = AdamW(model.parameters(), lr=3e-5)
    model.to(device)

    # Tokenize the splits for this particular task.
    tokenized_datasets = get_tokenized_datasets(raw_datasets, tokenizer,task)

    # NOTE(review): the training loader does not pass shuffle=True -- confirm
    # that the split helper already randomizes the row order.
    train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=batch_size, collate_fn=data_collator)
    eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=batch_size, collate_fn=data_collator)

    print()
    print("Model: %s | Task:%s" % (checkpoint,task))
    # Train; the returned model is the one evaluated and stored below.
    best_model = train_loop(model, optimizer, train_dataloader, eval_dataloader, epochs=epochs)

    # Evaluate on the validation split and persist predictions after every
    # run, so partial results survive an interrupted session.
    perfs, pred = eval_model(best_model, eval_dataloader)
    new_df_val['pred_'+checkpoint+'_'+task] = pred
    new_df_val.to_csv('cs682-humor-offense/output/'+experiment+'.csv')

    # `model_lib` is shared by all experiments in this kernel session, so the
    # summary file contains every entry accumulated so far.
    model_lib[(experiment,task, checkpoint)] = [best_model, perfs]
    df_summary = pd.DataFrame.from_dict(model_lib, orient='index')
    df_summary.to_csv('cs682-humor-offense/output/'+experiment+'_summary.csv')

    # Print each metric of the evaluated model.
    for perf in perfs:
      for key in perf:
        print ("%s = %0.2f" % (key,perf[key]))
    print()


Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]


Model: nghuyong/ernie-2.0-en | Task:is_humor
Training Loop...


  0%|          | 0/534 [00:00<?, ?it/s]


Started Epoch 1/2:
Iteration 1/267: | Loss = 0.70 | [{'accuracy': 0.48375}, {'f1': 0.30118443316412863}]
Iteration 83/267: | Loss = 0.11 | [{'accuracy': 0.93625}, {'f1': 0.9491525423728815}]
Iteration 166/267: | Loss = 0.18 | [{'accuracy': 0.9375}, {'f1': 0.9494949494949495}]
Iteration 249/267: | Loss = 0.06 | [{'accuracy': 0.9475}, {'f1': 0.9569672131147541}]
Iteration 267/267: | Loss = 0.25 | [{'accuracy': 0.95}, {'f1': 0.96}]

Started Epoch 2/2:
Iteration 1/267: | Loss = 0.07 | [{'accuracy': 0.9475}, {'f1': 0.9580838323353293}]
Iteration 83/267: | Loss = 0.01 | [{'accuracy': 0.94625}, {'f1': 0.9566968781470292}]
Iteration 166/267: | Loss = 0.01 | [{'accuracy': 0.9525}, {'f1': 0.9618473895582329}]
Iteration 249/267: | Loss = 0.01 | [{'accuracy': 0.9525}, {'f1': 0.9610655737704918}]
Iteration 267/267: | Loss = 0.06 | [{'accuracy': 0.95625}, {'f1': 0.9642492339121553}]
Best Performance: 0.95625
accuracy = 0.96
f1 = 0.96



  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]


Model: nghuyong/ernie-2.0-en | Task:humor_rating
Training Loop...


  0%|          | 0/534 [00:00<?, ?it/s]


Started Epoch 1/2:
Iteration 1/267: | Loss = 2.07 | [{'rmse': 1.517399090549352}]
Iteration 83/267: | Loss = 0.67 | [{'rmse': 0.6354132513569417}]
Iteration 166/267: | Loss = 0.51 | [{'rmse': 0.653834841531101}]
Iteration 249/267: | Loss = 0.28 | [{'rmse': 0.6204836822995429}]
Iteration 267/267: | Loss = 0.77 | [{'rmse': 0.6174544517614234}]

Started Epoch 2/2:
Iteration 1/267: | Loss = 0.04 | [{'rmse': 0.6184658438426491}]
Iteration 83/267: | Loss = 0.28 | [{'rmse': 0.6133922073192648}]
Iteration 166/267: | Loss = 0.20 | [{'rmse': 0.6133922073192648}]
Iteration 249/267: | Loss = 0.13 | [{'rmse': 0.6123724356957945}]
Iteration 267/267: | Loss = 0.48 | [{'rmse': 0.6184658438426491}]
Best Performance: 0.6123724356957945
rmse = 0.62



  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]


Model: nghuyong/ernie-2.0-en | Task:humor_controversy
Training Loop...


  0%|          | 0/534 [00:00<?, ?it/s]


Started Epoch 1/2:
Iteration 1/267: | Loss = 0.69 | [{'accuracy': 0.68625}, {'f1': 0.0}]
Iteration 83/267: | Loss = 0.36 | [{'accuracy': 0.70375}, {'f1': 0.2882882882882883}]
Iteration 166/267: | Loss = 0.48 | [{'accuracy': 0.68875}, {'f1': 0.5870646766169154}]
Iteration 249/267: | Loss = 0.45 | [{'accuracy': 0.69875}, {'f1': 0.26299694189602446}]
Iteration 267/267: | Loss = 0.73 | [{'accuracy': 0.71125}, {'f1': 0.5443786982248521}]

Started Epoch 2/2:
Iteration 1/267: | Loss = 0.36 | [{'accuracy': 0.715}, {'f1': 0.5762081784386617}]
Iteration 83/267: | Loss = 0.38 | [{'accuracy': 0.70625}, {'f1': 0.5811051693404634}]
Iteration 166/267: | Loss = 0.49 | [{'accuracy': 0.71125}, {'f1': 0.5514563106796116}]
Iteration 249/267: | Loss = 0.40 | [{'accuracy': 0.70625}, {'f1': 0.5489443378119003}]
Iteration 267/267: | Loss = 0.61 | [{'accuracy': 0.7075}, {'f1': 0.5482625482625483}]
Best Performance: 0.715
accuracy = 0.71
f1 = 0.55



  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]


Model: nghuyong/ernie-2.0-en | Task:offense_rating
Training Loop...


  0%|          | 0/534 [00:00<?, ?it/s]


Started Epoch 1/2:
Iteration 1/267: | Loss = 0.64 | [{'rmse': 0.8993052874302474}]
Iteration 83/267: | Loss = 0.12 | [{'rmse': 0.6314665470157544}]
Iteration 166/267: | Loss = 0.20 | [{'rmse': 0.6154266812545586}]
Iteration 249/267: | Loss = 0.36 | [{'rmse': 0.6373774391990981}]
Iteration 267/267: | Loss = 0.36 | [{'rmse': 0.5809475019311126}]

Started Epoch 2/2:
Iteration 1/267: | Loss = 0.12 | [{'rmse': 0.5830951894845301}]
Iteration 83/267: | Loss = 0.10 | [{'rmse': 0.544288526426931}]
Iteration 166/267: | Loss = 0.07 | [{'rmse': 0.6294839156007086}]
Iteration 249/267: | Loss = 0.08 | [{'rmse': 0.552268050859363}]
Iteration 267/267: | Loss = 0.11 | [{'rmse': 0.5477225575051661}]
Best Performance: 0.544288526426931
rmse = 0.55



## Single-Task, Base Model, Ensemble

In [None]:
# Compute and print the metrics of the single-task base-model ensemble,
# one entry per key of the returned result.
res_single_base_ens = compute_ensemble('base_ensemble')

print(
  "Performance: Single-Task | Base-Model | Ensemble",
  "************************************************",
  "",
  sep="\n",
)
for key in res_single_base_ens:
  # Key, its metrics, then a blank separator line.
  print(key, res_single_base_ens[key], "", sep="\n")


Downloading:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Performance: Single-Task | Base-Model | Ensemble
************************************************

is_humor
({'f1': 0.9631901840490799}, {'accuracy': 0.955})

humor_controversy
({'f1': 0.5476190476190476}, {'accuracy': 0.715})

humor_rating
{'rmse': 0.6214901447328026}

offense_rating
{'rmse': 0.532681893816563}



# Single-Task | Base Model + LSTM

The next 2 cells are meant to be run together

In [None]:
# Single-task fine-tuning with an LSTM decoder: trains one model per
# (checkpoint, task) pair, evaluates it on the validation split and writes the
# predictions and a metrics summary to CSV files named after `experiment`.
#
# The model and tokenizer are built inside this cell so it no longer depends on
# objects left over from the base-model experiment, and the debugging `break`
# statements that skipped training and evaluation have been removed.

# Only show errors from third-party libraries so the training log stays readable.
set_global_logging_level(logging.ERROR, ["transformers", "nlp", "torch", "tensorflow", "tensorboard", "wandb", "datasets"])
logging.set_verbosity_error()

experiment = 'single_lstm_5' # experiment name; also the stem of the output CSV files
# Pre-trained checkpoints available for this experiment.
all_checkpoints = ['bert-base-uncased','roberta-base', 'microsoft/deberta-base', 'albert-base-v2', 'nghuyong/ernie-2.0-en']
# Checkpoints trained by this run; assign `all_checkpoints` to train every encoder.
checkpoints = ['nghuyong/ernie-2.0-en']
tasks = ['is_humor', 'humor_rating', 'humor_controversy', 'offense_rating'] # target columns, one model each
setup = 'single' # 'multi' if multi-task model, otherwise 'single'
fc_dim = 256 # hidden layer size of the classifier
lstm_dim = 256 # size passed to the model for its LSTM layer
has_decoder = True # this experiment adds an LSTM layer
batch_size = 32 # batch size for both dataloaders
epochs = 2

df_train, df_val, df_test = get_split_dfs('cs682-humor-offense/data/train.csv')
raw_datasets = DatasetDict({'train':Dataset.from_pandas(df_train), 'validation':Dataset.from_pandas(df_val), 'test':Dataset.from_pandas(df_test)})
new_df_val = df_val.copy() # validation frame that collects one prediction column per (checkpoint, task)

# Iterate over every (pre-trained model, task) combination.
for checkpoint in checkpoints:
  # The tokenizer and padding collator depend only on the checkpoint, so they
  # are built once per encoder instead of once per task.
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  for task in tasks:

    # Fresh LSTM-augmented model and optimizer for each task.
    model = base_model(checkpoint=checkpoint,setup=setup,task=task,has_decoder=has_decoder,fc_dim=fc_dim, lstm_dim=lstm_dim)
    optimizer = AdamW(model.parameters(), lr=3e-5)
    model.to(device)

    # Tokenize the splits for this particular task.
    tokenized_datasets = get_tokenized_datasets(raw_datasets, tokenizer,task)

    # NOTE(review): the training loader does not pass shuffle=True -- confirm
    # that the split helper already randomizes the row order.
    train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=batch_size, collate_fn=data_collator)
    eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=batch_size, collate_fn=data_collator)

    print()
    print("Model: %s | Task:%s" % (checkpoint,task))
    # Train; the returned model is the one evaluated and stored below.
    best_model = train_loop(model, optimizer, train_dataloader, eval_dataloader, epochs=epochs)

    # Evaluate on the validation split and persist predictions after every
    # run, so partial results survive an interrupted session.
    perfs, pred = eval_model(best_model, eval_dataloader)
    new_df_val['pred_'+checkpoint+'_'+task] = pred
    new_df_val.to_csv('cs682-humor-offense/output/'+experiment+'.csv')

    # `model_lib` is shared by all experiments in this kernel session, so the
    # summary file contains every entry accumulated so far.
    model_lib[(experiment,task, checkpoint)] = [best_model, perfs]
    df_summary = pd.DataFrame.from_dict(model_lib, orient='index')
    df_summary.to_csv('cs682-humor-offense/output/'+experiment+'_summary.csv')

    # Print each metric of the evaluated model.
    for perf in perfs:
      for key in perf:
        print ("%s = %0.2f" % (key,perf[key]))
    print()

## Single-Task, Base Model + LSTM, Ensemble

In [None]:
#TEMPORARY CODE FOR ENSEMBLE COMPUTATION BEFORE PROJECT SUBMISSION
# Single-Task, Base Model + LSTM, Ensemble
# The result is stored under its own name so the base-model ensemble results
# (`res_single_base_ens`) are not overwritten, and the banner names the LSTM
# variant that is actually being reported.
res_single_lstm_ens = compute_ensemble('lstm_ensemble')

print("Performance: Single-Task | Base-LSTM | Ensemble")
print("************************************************")
print()
# One entry per key of the returned result, followed by a blank line.
for key in res_single_lstm_ens:
  print(key)
  print(res_single_lstm_ens[key])
  print()


Performance: Single-Task | Base-Model | Ensemble
************************************************

is_humor
({'f1': 0.9551934826883911}, {'accuracy': 0.945})

humor_controversy
({'f1': 0.5280000000000001}, {'accuracy': 0.705})

humor_rating
{'rmse': 0.6383572667401852}

offense_rating
{'rmse': 0.5689903338370521}



In [None]:
# Compute and print the metrics of the single-task base+LSTM ensemble for the
# experiment named by `experiment` (set in the LSTM training cell).
res_single_lstm_ens = compute_ensemble(experiment)

print(
  "Performance: Single-Task | Base-LSTM | Ensemble",
  "************************************************",
  "",
  sep="\n",
)
for key in res_single_lstm_ens:
  # Key, its metrics, then a blank separator line.
  print(key, res_single_lstm_ens[key], "", sep="\n")
