# HW2 DistilBERT Notebook

In [3]:
import os
import sys

from sklearn.metrics import classification_report
import torch
import torch.nn as nn
from transformers import BertConfig, BertModel, BertTokenizer
from transformers import DistilBertConfig, DistilBertModel, DistilBertTokenizer
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer
from transformers import XLNetConfig, XLNetModel, XLNetTokenizer

from torch_shallow_neural_classifier import TorchShallowNeuralClassifier
from torch_rnn_classifier import TorchRNNModel
from torch_rnn_classifier import TorchRNNClassifier
from torch_rnn_classifier import TorchRNNClassifierModel
from torch_rnn_classifier import TorchRNNClassifier

import sst
import utils

import pandas as pd
import json

In [4]:
# Corpus locations, given as paths relative to the current working directory.
SST_PATH = os.path.join('data', 'sentiment')
DYNASENT_PATH = os.path.join('data', 'dynasent/dynasent-v1.1')
# NOTE(review): TWITTER_PATH is not referenced by any cell visible in this notebook.
TWITTER_PATH = os.path.join('data', 'twitter')

In [5]:
def load_dynasent_dataset(*src_filenames, labels=None):
    """Load examples from one or more JSON Lines files.

    Each non-blank line of every file is parsed as one JSON object.

    Parameters
    ----------
    *src_filenames : str
        Paths of the JSONL files to read, processed in the order given.
    labels : container or None
        If None, every example is kept. Otherwise only examples whose
        'gold_label' value is a member of `labels` are kept.

    Returns
    -------
    list of dict
        The parsed examples, in file order.

    Raises
    ------
    KeyError
        If `labels` is given and an example has no 'gold_label' key.
    """
    data = []
    for filename in src_filenames:
        # JSON text is UTF-8; being explicit keeps the result independent of
        # the platform's default locale encoding.
        with open(filename, encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                # instead of failing inside json.loads.
                if not line.strip():
                    continue
                d = json.loads(line)
                if labels is None or d['gold_label'] in labels:
                    data.append(d)
    return data

In [6]:
# Seed the random number generators first (presumably so that later runs are
# repeatable -- see `utils.fix_random_seeds` for exactly what is seeded).
utils.fix_random_seeds()

# Bakeoff dev split; used below as the second assessment dataset.
bakeoff_dev = sst.bakeoff_dev_reader(SST_PATH)
# SST train label count:
# Positive: 3610
# Negative: 3310
# Neutral:  1624
# NOTE(review): the counts above look like sentence-level totals; with
# include_subtrees=True the loaded frame presumably has more rows -- confirm.
sst_train = sst.train_reader(SST_PATH, include_subtrees=True, dedup=True)
sst_dev = sst.dev_reader(SST_PATH)

# Dynasent Yelp train label count:
# Positive: 9,577
# Negative: 10,222
# Neutral:  5,201
# Round 1 (Yelp) train and dev splits, loaded unfiltered as lists of dicts.
# NOTE(review): neither variable is referenced by the other cells visible in
# this notebook; the per-label sampling re-reads the train file itself.
dynasent_train = load_dynasent_dataset(os.path.join(DYNASENT_PATH, 'dynasent-v1.1-round01-yelp-train.jsonl'))
dynasent_dev = load_dynasent_dataset(os.path.join(DYNASENT_PATH, 'dynasent-v1.1-round01-yelp-dev.jsonl'))

In [7]:
# Process Dynasent data
def extract_data_to_df(label_name, count):
    """Build a DataFrame from the first `count` DynaSent train examples of one label.

    The round 1 Yelp train file under `DYNASENT_PATH` is read on every call
    and filtered to examples whose gold label equals `label_name`.

    Parameters
    ----------
    label_name : str
        Gold label to keep (the notebook uses 'positive', 'negative', 'neutral').
    count : int
        Maximum number of examples to keep, taken from the start of the file.

    Returns
    -------
    pd.DataFrame
        Columns 'example_id', 'sentence', 'label', 'is_subtree'; `is_subtree`
        is always 0. The frame is empty (but keeps its columns) when nothing
        matches.
    """
    examples = load_dynasent_dataset(
        os.path.join(DYNASENT_PATH, 'dynasent-v1.1-round01-yelp-train.jsonl'),
        # A one-element tuple, not a bare string: `in` on a string would do
        # substring matching instead of exact label matching.
        labels=(label_name,))[:count]
    rows = [(d['text_id'], d['sentence'], d['gold_label'], 0) for d in examples]
    return pd.DataFrame(rows, columns=['example_id', 'sentence', 'label', 'is_subtree'])

# Sample DynaSent examples per label. Positive and negative are capped at 500
# each (the leftover inline value suggests 2000 was tried before); neutral is
# capped at 2000.
# NOTE(review): presumably the larger neutral cap is meant to offset the
# smaller neutral share in SST -- confirm the intent.
dynasent_positive_df = extract_data_to_df('positive', 500)  # previously 2000
dynasent_negative_df = extract_data_to_df('negative', 500)  # previously 2000
dynasent_neutral_df = extract_data_to_df('neutral', 2000)
# Stack the three samples; each frame keeps its own row index, so index values
# repeat across the concatenated result.
dynasent_df = pd.concat([dynasent_positive_df, dynasent_negative_df, dynasent_neutral_df])

In [8]:
# Process datasets
# Baseline training set: SST only (same object as `sst_train`, not a copy).
train_df = sst_train
# Augmented training set: SST followed by the sampled DynaSent examples.
# NOTE(review): row indices are not reset, so index values repeat; this assumes
# `sst_train` has the same columns as `dynasent_df` -- confirm.
augmented_train_df = pd.concat([sst_train, dynasent_df])

# DistilBERT

In [9]:
class HfDistilBertModel(nn.Module):
    """DistilBERT encoder with a small classification head on top.

    The head takes the encoder's output at sequence position 0, applies a
    linear layer, ReLU and dropout, and projects to `n_classes` logits.

    Parameters
    ----------
    n_classes : int
        Number of output classes (size of the final linear layer).
    weights_name : str
        Name of the pretrained DistilBERT weights to load.
    dropout_p : float
        Dropout probability applied before the final linear layer.
    """
    def __init__(self, n_classes, weights_name='distilbert-base-cased', dropout_p=0.5):
        super().__init__()
        self.weights_name = weights_name
        self.dropout_p = dropout_p
        self.distilbert = DistilBertModel.from_pretrained(self.weights_name)
        # Put the encoder in training mode so it is fine-tuned along with the head.
        self.distilbert.train()
        # The head width is read off the encoder's word-embedding dimension.
        self.hidden_dim = self.distilbert.embeddings.word_embeddings.embedding_dim
        self.n_classes = n_classes
        self.preclassifier_layer = nn.Linear(self.hidden_dim, self.hidden_dim)
        # Built once here rather than on every forward pass; ReLU has no
        # parameters, so the state dict is unaffected.
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(self.dropout_p)
        self.classifier_layer = nn.Linear(self.hidden_dim, self.n_classes)

    def forward(self, indices, mask):
        """Return class logits for a batch of token ids.

        Parameters
        ----------
        indices : torch.Tensor
            Token ids; assumed shape (batch, seq_len) -- TODO confirm.
        mask : torch.Tensor
            Attention mask passed to the encoder as `attention_mask`.

        Returns
        -------
        torch.Tensor
            Output of the final linear layer (one row of `n_classes` logits
            per example).
        """
        reps = self.distilbert(indices, attention_mask=mask)
        # First element of the encoder output; presumably the last hidden
        # states with shape (batch, seq_len, hidden_dim).
        hidden_state = reps[0]
        # Representation at position 0 of every sequence (the leading special
        # token when the inputs were encoded with special tokens).
        pooled = hidden_state[:, 0]
        pooled = self.preclassifier_layer(pooled)
        pooled = self.activation(pooled)
        pooled = self.dropout(pooled)
        return self.classifier_layer(pooled)

class HfDistilBertClassifier(TorchShallowNeuralClassifier):
    """Classifier that fine-tunes `HfDistilBertModel` on raw text.

    Parameters
    ----------
    weights_name : str
        Name of the pretrained DistilBERT weights; used for both the
        tokenizer and the model graph.
    *args, **kwargs
        Forwarded unchanged to `TorchShallowNeuralClassifier`.
    """
    def __init__(self, weights_name, *args, **kwargs):
        self.weights_name = weights_name
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.weights_name)
        super().__init__(*args, **kwargs)
        # Register the extra hyperparameter alongside the base class's own
        # (presumably so parameter get/set utilities see it -- confirm).
        self.params += ['weights_name']

    def build_graph(self):
        """Return the model to train, sized for the classes seen in `build_dataset`."""
        return HfDistilBertModel(self.n_classes_, self.weights_name)

    def build_dataset(self, X, y=None):
        """Tokenize `X` and wrap the tensors in a `TensorDataset`.

        Parameters
        ----------
        X : list of str
            Raw texts to encode.
        y : iterable or None
            Labels aligned with `X`. When given, `self.classes_` and
            `self.n_classes_` are (re)set from the distinct labels.

        Returns
        -------
        torch.utils.data.TensorDataset
            (indices, mask) when `y` is None, otherwise
            (indices, mask, label_indices).
        """
        data = self.tokenizer.batch_encode_plus(
            X,
            # With no explicit max_length, truncation falls back to the
            # tokenizer's model maximum, so an over-long text cannot overflow
            # the encoder's position embeddings.
            max_length=None,
            truncation=True,
            add_special_tokens=True,
            padding='longest',
            return_attention_mask=True)
        indices = torch.tensor(data['input_ids'])
        mask = torch.tensor(data['attention_mask'])
        if y is None:
            dataset = torch.utils.data.TensorDataset(indices, mask)
            return dataset
        self.classes_ = sorted(set(y))
        self.n_classes_ = len(self.classes_)
        # Map each label to its position in the sorted class list.
        class_to_ind = dict(zip(self.classes_, range(self.n_classes_)))
        return torch.utils.data.TensorDataset(
            indices,
            mask,
            torch.tensor([class_to_ind[y0] for y0 in y]))

def distilbert_fine_tune_phi(text):
    """Identity feature function: hand the raw text through unchanged.

    Tokenization happens later, inside the classifier's `build_dataset`,
    so no featurization is done here.
    """
    return text

def fit_distilbert_classifier(X, y):
    """Fit a `HfDistilBertClassifier` for a single epoch and return it.

    Uses the best values from parameter tuning (eta=0.0005 and
    hidden_dim=325 were earlier candidates).

    NOTE(review): `hidden_dim` goes to the base classifier; the graph built
    by `HfDistilBertClassifier.build_graph` in this notebook does not read it.

    Parameters
    ----------
    X : list of str
        Training texts.
    y : list
        Training labels aligned with `X`.

    Returns
    -------
    HfDistilBertClassifier
        The fitted classifier.
    """
    hyperparams = dict(
        gradient_accumulation_steps=8,
        eta=0.0001,
        hidden_dim=300,
        weights_name='distilbert-base-cased',
        batch_size=8,
        max_iter=1,
        n_iter_no_change=5,
        early_stopping=True)
    classifier = HfDistilBertClassifier(**hyperparams)
    classifier.fit(X, y)
    return classifier

In [None]:
# Exploratory run: fine-tune on SST only (`fit_distilbert_classifier` trains
# for a single epoch) and assess on the SST dev and bakeoff dev sets.
# Timing notes from the original run: around an hour (presumably with a GPU);
# without a GPU it had still not completed after 5 hours.
# Start: 2021/09/08 03:15
# NOTE(review): vectorize=False presumably tells `sst.experiment` to pass the
# phi output (raw strings here) straight to the model -- confirm.
distilbert_classifier_xval = sst.experiment(
    train_df,
    distilbert_fine_tune_phi,
    fit_distilbert_classifier,
    assess_dataframes=[sst_dev, bakeoff_dev],
    vectorize=False) 

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Finished epoch 1 of 1; error is 1375.383385562338

Assessment dataset 1
              precision    recall  f1-score   support

    negative      0.696     0.897     0.784       428
     neutral      0.514     0.162     0.246       229
    positive      0.780     0.838     0.808       444

    accuracy                          0.720      1101
   macro avg      0.663     0.632     0.612      1101
weighted avg      0.692     0.720     0.682      1101



## Train for real

In [None]:
# Keep a handle on the model fitted by the exploratory experiment so that
# `fit_optimized_distilbert_classifier` can continue training it. The name
# must match the one that function reads (`optimized_distilbert_classifier`);
# the earlier spelling `optimized_distlbert_classifier` left it undefined.
optimized_distilbert_classifier = distilbert_classifier_xval['model']

# Optional: drop the rest of the experiment results to free some memory once
# they are no longer needed.
#del distilbert_classifier_xval

In [None]:
def fit_distilbert_classifier_bakeoff(X, y):
    """Fit a `HfDistilBertClassifier` for up to 10 epochs and return it.

    Same hyperparameters as the single-epoch fit function, but with
    `max_iter=10` (100 was an earlier candidate) so that early stopping with
    `n_iter_no_change=5` has room to take effect.

    Parameters
    ----------
    X : list of str
        Training texts.
    y : list
        Training labels aligned with `X`.

    Returns
    -------
    HfDistilBertClassifier
        The fitted classifier.
    """
    mod = HfDistilBertClassifier(
        gradient_accumulation_steps=8,
        eta=0.0001,
        hidden_dim=300,
        weights_name='distilbert-base-cased',
        batch_size=8,
        max_iter=10,
        n_iter_no_change=5,
        early_stopping=True)
    mod.fit(X, y)
    # Return the fitted model; the original returned the undefined name
    # `mood`, which raised NameError after training had already finished.
    return mod

In [None]:
def fit_optimized_distilbert_classifier(X, y):
    """Refit the module-level `optimized_distilbert_classifier` on (X, y).

    The shared model object is modified in place: its `max_iter` is set to 10
    (100 and 1000 were earlier candidates) before `fit` is called. The same
    object is returned, so the global and the return value are one model.

    Requires `optimized_distilbert_classifier` to be defined at module level
    before this function is called.
    """
    model = optimized_distilbert_classifier
    model.max_iter = 10
    model.fit(X, y)
    return model

In [None]:
# Train on the augmented set (SST plus sampled DynaSent) and assess on the
# SST dev and bakeoff dev sets. The result is what the bakeoff prediction
# function below reads its 'phi' and 'model' from.
# Start time: 2021/09/08
# NOTE(review): this passes `fit_distilbert_classifier`, which trains for a
# single epoch; the 10-epoch `fit_distilbert_classifier_bakeoff` defined in
# this section is not used by any visible cell -- confirm which was intended.
hfdistilbert_experiment = sst.experiment(
    augmented_train_df, 
    distilbert_fine_tune_phi,
    fit_distilbert_classifier,
    assess_dataframes=[sst_dev, bakeoff_dev],
    vectorize=False)  

## Bakeoff submission

In [None]:
def predict_one_rnn_distilbert(text):
    """Predict the label of a single text with the trained experiment model.

    Reads the feature function ('phi') and fitted model ('model') from the
    module-level `hfdistilbert_experiment`.

    Parameters
    ----------
    text : str
        One raw example.

    Returns
    -------
    The single prediction for `text` (the first element of the model's
    output), not a one-element list.
    """
    features = hfdistilbert_experiment['phi'](text)
    model = hfdistilbert_experiment['model']
    # `predict` works on a batch, so wrap the example in a list and unwrap
    # the one result.
    return model.predict([features])[0]

def create_bakeoff_submission_distilbert(
        predict_one_func,
        output_filename='cs224u-sentiment-bakeoff-entry-v1distilbert.csv'):

    bakeoff_test = sst.bakeoff_test_reader(SST_PATH)
    sst_test = sst.test_reader(SST_PATH)
    bakeoff_test['dataset'] = 'bakeoff'
    sst_test['dataset'] = 'sst3'
    df = pd.concat((bakeoff_test, sst_test))

    df['prediction'] = df['sentence'].apply(predict_one_func)

    df.to_csv(output_filename, index=None)

In [None]:
# Write the bakeoff entry CSV (default filename) using the DistilBERT
# single-example predictor defined in this section.
create_bakeoff_submission_distilbert(predict_one_rnn_distilbert)

In [None]:
create_bakeoff_submission_bert(predict_one_rnn_bert