## Data Loading

In [1]:
import pandas as pd
import numpy as np
x_train = pd.read_csv('data/train/subtaskA_data_all.csv')
y_train = pd.read_csv('data/train/subtaskA_answers_all.csv', header=None)
x_dev = pd.read_csv('data/dev/subtaskA_dev_data.csv')
y_dev = pd.read_csv('data/dev/subtaskA_gold_answers.csv', header=None)
x_test = pd.read_csv('data/trial/taskA_trial_data.csv')
y_test = pd.read_csv('data/trial/taskA_trial_answer.csv', header=None)
x_submit = pd.read_csv('data/test/subtaskA_test_data.csv')
y_train = y_train.rename(columns={0 : 'id',1: 'invalid_sent'})
y_dev = y_dev.rename(columns={0 : 'id',1: 'invalid_sent'})
y_test = y_test.rename(columns={0 : 'id',1: 'invalid_sent'})

In [2]:
xy = x_train.merge(y_train)
items = []
for row in xy.itertuples():
    if row.invalid_sent == 0:
        items.append({'id' : row.id, 'sent' : row.sent0, 'isvalid' : 0.0})
        items.append({'id' : row.id, 'sent' : row.sent1, 'isvalid' : 1.0})
    else:
        items.append({'id' : row.id, 'sent' : row.sent0, 'isvalid' : 1.0})
        items.append({'id' : row.id, 'sent' : row.sent1, 'isvalid' : 0.0})
traindf = pd.DataFrame(items)

xytest = x_test.merge(y_test)
items = []
for row in xytest.itertuples():
    if row.invalid_sent == 0:
        items.append({'id' : row.id, 'sent' : row.sent0, 'isvalid' : 0.0})
        items.append({'id' : row.id, 'sent' : row.sent1, 'isvalid' : 1.0})
    else:
        items.append({'id' : row.id, 'sent' : row.sent0, 'isvalid' : 1.0})
        items.append({'id' : row.id, 'sent' : row.sent1, 'isvalid' : 0.0})
        
testdf = pd.DataFrame(items)

xydev = x_dev.merge(y_dev)
items = []
for row in xydev.itertuples():
    if row.invalid_sent == 0:
        items.append({'id' : row.id, 'sent' : row.sent0, 'isvalid' : 0.0})
        items.append({'id' : row.id, 'sent' : row.sent1, 'isvalid' : 1.0})
    else:
        items.append({'id' : row.id, 'sent' : row.sent0, 'isvalid' : 1.0})
        items.append({'id' : row.id, 'sent' : row.sent1, 'isvalid' : 0.0})
devdf = pd.DataFrame(items)

## Baseline - Logistic Regression

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

ps = PorterStemmer()
stop_words = set(stopwords.words('english')) 
def preprocess(x):
    x = x.lower()
    tokens = word_tokenize(x)
    filtered_tokens = [w for w in tokens if not w in stop_words] 
    return ' '.join([ps.stem(t) for t in filtered_tokens])
trainpdf = traindf.sent.apply(preprocess)
testpdf = testdf.sent.apply(preprocess)

[nltk_data] Downloading package punkt to /Users/goelprat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/goelprat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0, lowercase=False, max_features=3000)
vectorizer.fit(trainpdf)
len(vectorizer.vocabulary_.keys())

3000

In [None]:
X_train = vectorizer.transform(trainpdf)
X_test = vectorizer.transform(testpdf)
Y_train = traindf.isvalid
Y_test = testdf.isvalid

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
score = classifier.score(X_test, Y_test)
score

0.5774369124195943

Baseline accuracy is 57%. I now use Keras to essentially do simple logistic regression just to practice grid search. 

In [None]:
### Answer 5
from keras.datasets import cifar10
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
### Answer 3
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

def create_model(optimizer='sgd'):
    model = Sequential()
    model.add(Dense(units = 1, activation = 'sigmoid', input_dim = 3000))
    # Describe the loss and how it is optimized
    model.compile(loss = 'mean_squared_error', optimizer = optimizer, metrics = ['accuracy'])
    return model

def grid_search():
    optimizers = []
    model = KerasClassifier(build_fn=create_model)
    optimizer = ['SGD', 'Adam', 'RMSProp']
    param_grid = dict(epochs=[20, 40, 60], batch_size=[20, 40, 60], optimizer=optimizer)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
    grid_result = grid.fit(x_train, y_train)

    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
        
model = create_model(optimizer='adam')
model.fit(X_train, Y_train, epochs=40, batch_size=40)


In [None]:
model.evaluate(X_test, Y_test)



[0.24135216026539735, 0.5821375846862793]

## Using BERT with pretrained weights to get sentence embedding

In [None]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def bert_input(df):
    bertdf = df.sent.apply(lambda x : "[CLS] " + x + " [SEP]")
    bertdf_tokenized = bertdf.apply((lambda x: tokenizer.encode(x)))
    max_len = 0
    for i in bertdf_tokenized.values:
        if len(i) > max_len:
            max_len = len(i)

    padded = np.array([i + [0]*(max_len-len(i)) for i in bertdf_tokenized.values])
    attention_mask = np.where(padded != 0, 1, 0)
    input_ids = torch.tensor(padded)  
    attention_mask = torch.tensor(attention_mask)

    return input_ids, attention_mask

In [None]:
model = BertModel.from_pretrained('bert-base-uncased')
input_ids_train, attention_mask_train = bert_input(traindf)
with torch.no_grad():
    hidden_train = model(input_ids_train, attention_mask=attention_mask_train)
features_train = hidden_train[0][:,0,:].numpy()

input_ids_test, attention_mask_test = bert_input(testdf)
with torch.no_grad():
    hidden_test = model(input_ids_test, attention_mask=attention_mask_test)
features_test = hidden_test[0][:,0,:].numpy()


In [None]:
Y_train = traindf.isvalid
Y_test = testdf.isvalid

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(features_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
score = classifier.score(features_test, Y_test)
score

0.6264225630875804

## Using BERT to get perplexity

In the [paper](https://arxiv.org/abs/1906.00363) published along with this task, the authors claim to get 70% accuracy by using BERT to get sentence perplexity. I try to replicate that claim in this section.

In [7]:
bertdf = traindf.sent.apply(lambda x : "[CLS] " + x + " [SEP]")
testbertdf = testdf.sent.apply(lambda x : "[CLS] " + x + " [SEP]")

In [None]:
!pip install transformers

In [37]:
import torch
from transformers import BertForMaskedLM, BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def get_bert_inputs(df):
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for i, row in df.iterrows():
        tokendict = tokenizer.encode_plus(row['sent'], max_length=25, pad_to_max_length=True)
        input_ids.append(tokendict['input_ids'])
        attention_masks.append(tokendict['attention_mask'])
        token_type_ids.append(tokendict['token_type_ids'])
    return torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(token_type_ids)

In [38]:
input_ids, attention_masks, token_type_ids = get_bert_inputs(testdf)

In [None]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_tensors = []
for sent in bertdf:
    # Tokenize input
    tokenized_text = tokenizer.tokenize(sent)
    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    train_tensors.append(tokens_tensor)
    
test_tensors = []
for sent in testbertdf:
    # Tokenize input
    tokenized_text = tokenizer.tokenize(sent)
    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    test_tensors.append(tokens_tensor)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/goelprat/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [85]:
from transformers import BertForMaskedLM
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

In [None]:
import torch.nn.functional as F
import math

loss_fct = torch.nn.CrossEntropyLoss()

def run_tokens(s):
    tokenized_text = tokenizer.tokenize(s)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_ids = [0] * tokens_tensor.shape[1]
    segments_tensors = torch.tensor([segments_ids])
    prediction = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = prediction[0]
    predicted_tokens = []
    for i in range(tokens_tensor.shape[1]):
        predicted_index = torch.argmax(predictions[0, i]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        predicted_tokens.append(predicted_token)
    return predicted_tokens

def run_perp(s):
    tokenized_text = tokenizer.tokenize(s)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_ids = [0] * tokens_tensor.shape[1]
    segments_tensors = torch.tensor([segments_ids])
    predictions = model(tokens_tensor, token_type_ids=segments_tensors)
    loss = loss_fct(predictions[0].squeeze(),tokens_tensor.squeeze()).data 
    return math.exp(loss)

def run(sents, mode):
    model.eval()
    f = run_perp if mode == 'perp' else run_tokens
    out = []
    with torch.no_grad():
        for sent in sents:
            out.append(f(sent))
    return out

In [41]:
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertModel, BertForMaskedLM
class text_dataset(Dataset):
    def __init__(self,x_y_list):
        self.x_y_list = x_y_list
        
    def __getitem__(self,index):           
        input_ids = self.x_y_list[0][index]
        token_type_ids = self.x_y_list[1][index]
        attention_mask = self.x_y_list[2][index]
        label = self.x_y_list[3][index]
        return input_ids, token_type_ids, attention_mask, label
    
    def __len__(self):
        return len(self.x_y_list[0])

In [77]:
from sklearn.model_selection import train_test_split
batch_size = 1
epochs = 10
isvalid = torch.tensor(testdf.isvalid)
test_lists = [input_ids, attention_masks, token_type_ids, isvalid]
test_dataset = text_dataset(x_y_list = test_lists)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0)


In [78]:
import math
perps = []
loss_fct = torch.nn.CrossEntropyLoss()
for inputs, masks, token_types, target in test_loader:
    inputs = inputs.to(device)
    #masks = masks.to(device)
    #token_types = token_types.to(device)
    #import pdb; pdb.set_trace()
    output = model(inputs, masked_lm_labels=inputs)
    #loss = loss_fct(output[0].squeeze(),inputs.squeeze()).data 
    perps.append(math.exp(output[0]))

In [105]:
i = 10
sent0 = testdf.sent[i]
sent1 = testdf.sent[i + 1]
input_ids0 = torch.tensor(tokenizer.encode(sent0, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
input_ids1 = torch.tensor(tokenizer.encode(sent1, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs0 = model(input_ids0, masked_lm_labels=input_ids0)
outputs1 = model(input_ids1, masked_lm_labels=input_ids1)

print(sent0, math.exp(outputs0[0]))
print(sent1, math.exp(outputs1[0]))

he was sent to a restaurant for treatment after a car crash 53.811206193880054
he was sent to a hospital for treatment after a car crash 56.52741072294196


In [100]:
testdf.sent[4]

'money can be used for buying cars'

In [None]:
sents = ['[CLS] my sister eats an apple after breakfast every day . [SEP]', '[CLS] my sister eats a stone after breakfast every day . [SEP]']
run(sents, 'perp')


[42.116081023755655, 38.70850197316724]

In [None]:
sents = []
for s in testdf.sent:
    sents.append('[CLS] ' + s + ' . [SEP]')

In [None]:
perps = run(sents, 'perp')

In [80]:
def get_results_from_perps(perps):
    results = []
    for i in range(0, len(perps), 2):
        if perps[i] < perps[i + 1]:
            results.append(1)
        else:
            results.append(0)
    return results

results = get_results_from_perps(perps)

In [81]:
pd.concat([x_test.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)
!python3 eval/taskA_scorer.py --gold-labels data/trial/taskA_trial_answer.csv --pred-labels predictions.csv

Accuracy: 49.4805%


I was unable to get better than random results using BERT to get sentence probability. I believe this is because BERT outputs probability conditional on all the other tokens. This means that multiplying the probabilities for tokens in a sentence does not yield the probability of the sentence. 

## Using GPT-2 to get sentence probability

In [None]:
import math
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Model
gpt_model = GPT2Model.from_pretrained('gpt2')
gpt_model.eval()
# Load pre-trained model (weights)
gpt_model_lm = GPT2LMHeadModel.from_pretrained('gpt2-large')
gpt_model_lm.eval()
# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

loss_fct = torch.nn.CrossEntropyLoss()

In [None]:
def gpt_input(df):
    gptdf = df.sent.apply(lambda x : ' <|endoftext|> ' + x)
    gptdf_tokenized = gptdf.apply((lambda x: tokenizer.encode(x)))
    max_len = 0
    for i in gptdf_tokenized.values:
        if len(i) > max_len:
            max_len = len(i)

    padded = np.array([i + [0]*(max_len-len(i)) for i in gptdf_tokenized.values])
    attention_mask = np.where(padded != 0, 1, 0)
    input_ids = torch.tensor(padded)  
    attention_mask = torch.tensor(attention_mask)

    return input_ids, attention_mask

In [None]:
def gpt_predictions(sentence, model):
    sentence = ' <|endoftext|> ' + sentence
    tokenize_input = tokenizer.tokenize(sentence, add_prefix_space=True)
    tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    return model(tokens_tensor)
    
def get_last_hidden_state(sentence, model):
    predictions = gpt_predictions(sentence, model)
    return predictions[0].squeeze()[-1]

def gpt_score(sentence):
    sentence = ' <|endoftext|> ' + sentence
    tokenize_input = tokenizer.tokenize(sentence, add_prefix_space=True)
    tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    predictions=gpt_model_lm(tokens_tensor)
    loss = loss_fct(predictions[0].squeeze()[:-1],tokens_tensor.squeeze()[1:]).data 
    return math.exp(loss)

def gpt_tokens(input_sentence):
    sentence = '<|endoftext|> ' + input_sentence
    tokenize_input = tokenizer.tokenize(sentence)
    tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    predictions =gpt_model(tokens_tensor)
    predicted_tokens = []
    for i in range(tokens_tensor.shape[1]):
        predicted_index = torch.argmax(predictions[0][0, i]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        predicted_tokens.append(predicted_token)
    return predicted_tokens

In [None]:
def run(sents, mode='perp'):
    out = []
    with torch.no_grad():
        for sent in sents:
            if mode == 'perp':
                out.append(gpt_score(sent))
            else:
                out.append(gpt_tokens(sent))
    return out

In [None]:
run(['he put an elephant into the fridge', 'he put a turkey into the fridge'], mode='perp')

[628.2036629979144, 448.928654083428]

In [None]:
input_ids, attention_mask = gpt_input(testdf)
with torch.no_grad():
    predictions = gpt_model_lm(input_ids)

In [None]:
i = 1
loss = loss_fct(predictions[0][i, :-1, :],input_ids[i, 1:]).data 
math.exp(loss)

150.6329864510042

In [None]:
perps = []
for i in range(4042):
    loss = loss_fct(predictions[0][i, :-1, :],input_ids[i, 1:]).data 
    perps.append(math.exp(loss))

In [None]:
def get_results_from_perps(perps):
    results = []
    for i in range(0, len(perps), 2):
        if perps[i] < perps[i + 1]:
            results.append(1)
        else:
            results.append(0)
    return results

results = get_results_from_perps(perps)

In [None]:
pd.concat([x_test.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)
!python3 eval/taskA_scorer.py --gold-labels data/trial/taskA_trial_answer.csv --pred-labels predictions.csv

Accuracy: 71.3508%


These results using the GPT-2 small model match the ~70% baseline reported in the paper. 
Using the large model, I was able to get 71.35% accuracy. 

Next, I experiment with using the final hidden state output of the GPT-2 model to classify a sentence as for or against common sense. 

In [None]:
def run_classification(sents, model):
    import numpy as np
    x = np.zeros((len(sents), 768))
    with torch.no_grad():
        for i in range(len(sents)):
            h = get_last_hidden_state(sents[i], model)
            x[i] = h
    return x

In [None]:
sents = [x for x in traindf.sent]
x = run_classification(sents, gpt_model)
y = traindf.isvalid

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(x, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
x_test = run_classification([x for x in testdf.sent], gpt_model)

In [None]:
y_test = testdf.isvalid

In [None]:
classifier.score(x_test, y_test)

0.591044037605146

Now, I try to find examples that are incorrectly classified by my GPT-2 based model. 

In [None]:
test_answers = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/trial/taskA_trial_answer.csv', header=None)

In [None]:
results

In [None]:
wrong_examples = x_test[test_answers[1] != pd.Series(results)]

In [None]:
items = []
for row in wrong_examples.itertuples():
    items.append({'id' : row.id, 'sent' : row.sent0})
    items.append({'id' : row.id, 'sent' : row.sent1})
wrongdf = pd.DataFrame(items)

In [39]:
wrongdf.sent

0       a man can better see stars and the moon in day...
1       a man can hardly see stars and the moon in day...
2                                   I work 25 hours a day
3                                    I work 8 hours a day
4        I changed my direction when passing a crossroads
                              ...                        
1273                    Jim downloads music from the book
1274              Bob goes to bed because he feels sleepy
1275             Bob goes to work because he feels sleepy
1276    people have to hold onto their hats because of...
1277    people have to hold onto their shoes because o...
Name: sent, Length: 1278, dtype: object

In [None]:
with open('wrong_examples.txt', 'w') as f:
    for s in wrongdf.sent:
        f.write(str(s) + "\n")

## Finetuning BERT using transformers library

In [3]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def get_bert_inputs(df):
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for i, row in df.iterrows():
        tokendict = tokenizer.encode_plus(row['sent0'], row['sent1'], max_length=50, pad_to_max_length=True)
        input_ids.append(tokendict['input_ids'])
        attention_masks.append(tokendict['attention_mask'])
        token_type_ids.append(tokendict['token_type_ids'])
    return torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(token_type_ids)

In [4]:
input_ids, attention_masks, token_type_ids = get_bert_inputs(xy)

In [5]:
input_ids_dev, attention_masks_dev, token_type_ids_dev = get_bert_inputs(xydev)

In [6]:
input_ids_test, attention_masks_test, token_type_ids_test = get_bert_inputs(xytest)

In [15]:
class MyBertForSequenceClassification(torch.nn.Module):  
    def __init__(self, num_labels=1):
        super(MyBertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertForSequenceClassification.from_pretrained('bert-large-uncased')
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
        return pooled_output[0]

In [16]:
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertModel, BertForMaskedLM
max_seq_length = 50
class text_dataset(Dataset):
    def __init__(self,x_y_list):
        self.x_y_list = x_y_list
        
    def __getitem__(self,index):           
        input_ids = self.x_y_list[0][index]
        token_type_ids = self.x_y_list[1][index]
        attention_mask = self.x_y_list[2][index]
        label = self.x_y_list[3][index]
        return input_ids, token_type_ids, attention_mask, label
    
    def __len__(self):
        return len(self.x_y_list[0])

In [17]:
import torch
import time
import os
import copy
import torch.nn.functional as F
import numpy as np

def train_model(model, criterion, optimizer, scheduler, device, num_epochs=25):
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                print("VALIDATION")
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            corrects = 0
                        
            # Iterate over data.
            for inputs, token_types, mask, target in dataloaders_dict[phase]:
                inputs = inputs.to(device)
                token_types = token_types.to(device)
                mask = mask.to(device)
                target = target.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs, token_type_ids=token_types, attention_mask=mask)
                    outputs = F.softmax(outputs,dim=1)  
                    loss = criterion(outputs, target)
                    # backward + optimize only if in training phase
                    if phase == 'train':                        
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                #print('Running loss : {:.4f}'.format(running_loss))
                corrects += torch.sum(torch.max(outputs, 1)[1] == target)
            epoch_loss = running_loss / dataset_sizes[phase]
            acc = corrects.double() / dataset_sizes[phase]
            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} accuracy: {:.4f}'.format(
                phase, acc))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')
                
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    #print('Best val Acc: {:4f}'.format(float(best_loss)))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
from sklearn.model_selection import train_test_split
batch_size = 16
invalid = torch.tensor(xy.invalid_sent)
invalid_dev = torch.tensor(xydev.invalid_sent)
#train_indices, val_indices = train_test_split([i for i in range(10000)])
#train_lists = [input_ids[train_indices], token_type_ids[train_indices], attention_masks[train_indices], invalid[train_indices]]
#val_lists = [input_ids[val_indices], token_type_ids[val_indices], attention_masks[val_indices], invalid[val_indices]]
train_lists = [input_ids, token_type_ids, attention_masks, invalid]
val_lists = [input_ids_dev, token_type_ids_dev, attention_masks_dev, invalid_dev]
test_lists = [input_ids_test, token_type_ids_test, attention_masks_test, invalid]
training_dataset = text_dataset(x_y_list = train_lists)
val_dataset = text_dataset(x_y_list = val_lists)
test_dataset = text_dataset(x_y_list = test_lists)

dataloaders_dict = {'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True, num_workers=0),
                   'val':torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
                   }
dataset_sizes = {'train':len(train_lists[0]),
                'val':len(val_lists[0])}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [11]:
len(train_lists[0])

10000

In [12]:
model = MyBertForSequenceClassification().to(device)

In [14]:
#from torch import optim
from transformers.optimization import AdamW
from transformers import get_linear_schedule_with_warmup
epochs = 3
lrlast = .001
lrmain = 2e-5
optim = AdamW(model.bert.parameters(), lr=lrmain, eps=1e-8)
criterion = torch.nn.CrossEntropyLoss()

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optim, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = (len(train_lists[0]) / batch_size) * epochs)

In [13]:
model_ft1 = train_model(model, criterion, optim, scheduler, device, num_epochs=epochs)

starting
Epoch 0/2
----------
train total loss: 0.6208 
train accuracy: 0.6521
VALIDATION
val total loss: 0.5578 
val accuracy: 0.7422
saving with loss of 0.5577636059810788 improved over previous 100
Epoch 1/2
----------
train total loss: 0.4933 
train accuracy: 0.8118
VALIDATION
val total loss: 0.5051 
val accuracy: 0.7974
saving with loss of 0.5051305820136037 improved over previous 0.5577636059810788
Epoch 2/2
----------
train total loss: 0.4240 
train accuracy: 0.8858
VALIDATION
val total loss: 0.5033 
val accuracy: 0.8074
saving with loss of 0.503279189587596 improved over previous 0.5051305820136037
Training complete in 36m 6s


In [19]:
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, num_workers=0)
results = []
for ipt, toktypes, maskt, targett in test_loader:
    ipt = ipt.to(device)
    toktypes = toktypes.to(device)
    maskt = maskt.to(device)
    outputs = model_ft1(ipt, toktypes, maskt)
    outputs = F.softmax(outputs,dim=1)
    for i in range(len(outputs)):
        predicted_index = torch.argmax(outputs[i]).item()
        results.append(predicted_index)

In [None]:
len(results)

2021

In [21]:
pd.concat([x_test.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)
!python3 eval/taskA_scorer.py --gold-labels data/trial/taskA_trial_answer.csv --pred-labels predictions.csv


Accuracy: 73.6764%


In [None]:
test_answers = pd.read_csv('data/trial/taskA_trial_answer.csv', header=None)
wrong_examples = x_test[test_answers[1] != pd.Series(results)]
items = []
for row in wrong_examples.itertuples():
    items.append({'id' : row.id, 'sent' : row.sent0})
    items.append({'id' : row.id, 'sent' : row.sent1})
wrongdf = pd.DataFrame(items)
with open('wrong_examples.txt', 'w') as f:
    for s in wrongdf.sent:
        f.write(str(s) + "\n")

# Finetuning RoBERTa using [CLS] embedding

In [None]:
!pip install transformers

In [5]:
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
import torch
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
def get_bert_inputs(df):
    input_ids = []
    attention_masks = []
    for i, row in df.iterrows():
        tokendict = tokenizer.encode_plus(row['sent0'], row['sent1'], max_length=50, pad_to_max_length=True)
        input_ids.append(tokendict['input_ids'])
        attention_masks.append(tokendict['attention_mask'])
    return torch.tensor(input_ids), torch.tensor(attention_masks)

In [20]:
input_ids, attention_masks = get_bert_inputs(xy)
input_ids_dev, attention_masks_dev = get_bert_inputs(xydev)

In [None]:
from torch import nn
from transformers import RobertaForSequenceClassification
class MyRoBertaForSequenceClassification(nn.Module):
  
    def __init__(self, num_labels=1):
        super(MyRoBertaForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = RobertaForSequenceClassification.from_pretrained('roberta-base')
        #self.classifier = nn.Linear(768, num_labels)
        #nn.init.xavier_normal_(self.classifier.weight)
    def forward(self, input_ids, attention_mask=None, labels=None):
        pooled_output = self.bert(input_ids, attention_mask)
        return pooled_output[0]


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MyRoBertaForSequenceClassification().to(device)
#model = RobertaForSequenceClassification.from_pretrained('roberta-base')
#model = model.to(device)

In [29]:
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertModel, BertForMaskedLM
max_seq_length = 50
class text_dataset(Dataset):
    def __init__(self,x_y_list):
        self.x_y_list = x_y_list
        
    def __getitem__(self,index):           
        input_ids = self.x_y_list[0][index]
        attention_mask = self.x_y_list[1][index]
        label = self.x_y_list[2][index]
        return input_ids, attention_mask, label
    
    def __len__(self):
        return len(self.x_y_list[0])

In [30]:
from sklearn.model_selection import train_test_split

invalid = torch.tensor(xy.invalid_sent)
invalid_dev = torch.tensor(xydev.invalid_sent)
train_lists = [input_ids, attention_masks, invalid]
val_lists = [input_ids_dev, attention_masks_dev, invalid_dev]

training_dataset = text_dataset(x_y_list = train_lists)
val_dataset = text_dataset(x_y_list = val_lists)

dataloaders_dict = {'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True, num_workers=0),
                   'val':torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
                   }
dataset_sizes = {'train':len(train_lists[0]),
                'val':len(val_lists[0])}


In [31]:
import torch
import time
import os
import copy
import torch.nn.functional as F
import numpy as np

def train_model(model, criterion, optimizer, scheduler, device, num_epochs=25, phases=['train', 'val']):
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in phases:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                print("VALIDATION")
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            corrects = 0
                        
            # Iterate over data.
            for inputs, mask, target in dataloaders_dict[phase]:
                inputs = inputs.to(device)
                mask = mask.to(device)
                target = target.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs, attention_mask=mask)
                    #import pdb; pdb.set_trace()
                    outputs = F.softmax(outputs,dim=1)  
                    loss = criterion(outputs, target)
                    # backward + optimize only if in training phase
                    if phase == 'train':                        
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                #print('Running loss : {:.4f}'.format(running_loss))
                corrects += torch.sum(torch.max(outputs, 1)[1] == target)
            epoch_loss = running_loss / dataset_sizes[phase]
            acc = corrects.double() / dataset_sizes[phase]
            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} accuracy: {:.4f}'.format(
                phase, acc))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')
                
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    #print('Best val Acc: {:4f}'.format(float(best_loss)))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [32]:
#from torch import optim
from transformers.optimization import AdamW
from transformers import get_linear_schedule_with_warmup
batch_size = 16
epochs = 3
lrlast = .001
lrmain = 2e-5
optim = AdamW(model.bert.parameters(), lr=lrmain, eps=1e-8)
criterion = nn.CrossEntropyLoss()

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optim, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = (len(train_lists[0]) / batch_size) * epochs)

In [26]:
len(train_lists[0])

10000

In [13]:
import random
# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [33]:
model_ft1 = train_model(model, criterion, optim, scheduler, device, num_epochs=epochs)


starting
Epoch 0/2
----------
train total loss: 0.6948 
train accuracy: 0.4889
VALIDATION
val total loss: 0.6947 
val accuracy: 0.4804
saving with loss of 0.6946904892428828 improved over previous 100
Epoch 1/2
----------
train total loss: 0.6141 
train accuracy: 0.6624
VALIDATION
val total loss: 0.5119 
val accuracy: 0.7924
saving with loss of 0.5119188002263053 improved over previous 0.6946904892428828
Epoch 2/2
----------
train total loss: 0.4871 
train accuracy: 0.8126
VALIDATION
val total loss: 0.4781 
val accuracy: 0.8215
saving with loss of 0.4781279314604542 improved over previous 0.5119188002263053
Training complete in 11m 34s


In [33]:
model_ft1 = train_model(model, criterion, optim, scheduler, device, num_epochs=1, phases=['val'])


starting
Epoch 0/0
----------
VALIDATION
val total loss: 0.6932 
val accuracy: 0.5064
saving with loss of 0.6932363882064819 improved over previous 100
Training complete in 0m 21s


In [18]:
invalid_test = torch.tensor(xytest.invalid_sent)
input_ids_test, attention_masks_test = get_bert_inputs(xytest)
test_lists = [input_ids_test, attention_masks_test, invalid_test]
test_dataset = text_dataset(x_y_list = test_lists)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, num_workers=0)
        

In [23]:
def generate_results(data_loader):
    results = []
    for ipt, maskt, targett in data_loader:
        ipt = ipt.to(device)
        maskt = maskt.to(device)
        outputs = model_ft1(ipt, maskt)
        outputs = F.softmax(outputs,dim=1)
        for i in range(len(outputs)):
            predicted_index = torch.argmax(outputs[i]).item()
            results.append(predicted_index)
    return results

In [24]:
results = generate_results(test_loader)

In [21]:
pd.concat([x_test.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)
!python3 eval/taskA_scorer.py --gold-labels data/trial/taskA_trial_answer.csv --pred-labels predictions.csv


Accuracy: 86.8877%


In [26]:
x_eval = pd.read_csv('data/test/subtaskA_test_data.csv')
input_ids_eval, attention_masks_eval = get_bert_inputs(x_eval)
eval_lists = [input_ids_eval, attention_masks_eval, invalid]
eval_dataset = text_dataset(x_y_list = eval_lists)
eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size, num_workers=0)
                            

In [27]:
results = generate_results(eval_loader)

In [28]:
pd.concat([x_eval.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)

In [None]:
test_answers = pd.read_csv('data/trial/taskA_trial_answer.csv', header=None)
wrong_examples = x_test[test_answers[1] != pd.Series(results)]
items = []
for row in wrong_examples.itertuples():
    items.append({'id' : row.id, 'sent' : row.sent0})
    items.append({'id' : row.id, 'sent' : row.sent1})
wrongdf = pd.DataFrame(items)
with open('wrong_examples.txt', 'w') as f:
    for s in wrongdf.sent:
        f.write(str(s) + "\n")

# Using LSTM to get sentence embedding from RoBERTa embeddings

In [None]:
!pip install transformers

In [54]:
import torch
from torch import nn
from transformers import RobertaModel
class CommonSenseClassifier(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ninp, nhid, nlayers, dropout=0.5):
        super(CommonSenseClassifier, self).__init__()
        #self.ntoken = ntoken
        self.drop = nn.Dropout(dropout)
        #self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn1 = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
            self.rnn2 = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.lin1 = nn.Linear(ninp, 1)
        self.lin2 = nn.Linear(ninp, 1)
        self.classifier = nn.Linear(2, 2)
        nn.init.xavier_normal_(self.lin1.weight)
        nn.init.xavier_normal_(self.lin2.weight)
        nn.init.xavier_normal_(self.classifier.weight)

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        #self.encoder.weight.data.uniform_(-initrange, initrange)
        self.classifier.bias.data.zero_()
        self.classifier.weight.data.uniform_(-initrange, initrange)

    def forward(self, input1, input2, hidden1, hidden2):
        #emb = self.drop(self.encoder(input))
        #emb = self.drop(input)
        output1, hidden1 = self.rnn1(input1, hidden1)
        output2, hidden2 = self.rnn2(input2, hidden2)
        #output = self.drop(output)
        #decoded = self.decoder(output)
        #decoded = decoded.view(-1, self.ntoken)
        #return F.log_softmax(decoded, dim=1), hidden
        cell1 = hidden1[1].squeeze()
        cell2 = hidden2[1].squeeze()
        out1 = self.lin1(cell1)
        out2 = self.lin2(cell2)
        #class_features = torch.cat((cell1, cell2), 1)
        #import pdb; pdb.set_trace()
        output = torch.cat((out1, out2), 1)
        return torch.nn.functional.softmax(output, dim=1), hidden1, hidden2

    def init_hidden(self, bsz):
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
bert = RobertaModel.from_pretrained('roberta-base')
bert = bert.to(device)
sense_model = CommonSenseClassifier('LSTM', 768, 768, 1).to(device)

  "num_layers={}".format(dropout, num_layers))


In [111]:
max_toks = 0
for i, row in xytest.iterrows():
    ntokens = len(row['sent0'].split())
    if ntokens > max_toks:
        max_toks = ntokens

In [112]:
max_toks

18

In [4]:
import torch
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
max_seq_length = 25
def get_roberta_inputs(df):
    input_ids = []
    attention_masks = []
    for i, row in df.iterrows():

        tokendict1 = tokenizer.encode_plus(row['sent0'], max_length=max_seq_length, pad_to_max_length=True, add_special_tokens=False)
        tokendict2 = tokenizer.encode_plus(row['sent1'], max_length=max_seq_length, pad_to_max_length=True, add_special_tokens=False)
        input_ids.append((tokendict1['input_ids'], tokendict2['input_ids']))
        attention_masks.append((tokendict1['attention_mask'], tokendict2['attention_mask']))
    return torch.tensor(input_ids), torch.tensor(attention_masks)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [5]:
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertModel, BertForMaskedLM

class text_dataset(Dataset):
    def __init__(self,x_y_list):
        self.x_y_list = x_y_list
        
    def __getitem__(self,index):           
        input_ids = self.x_y_list[0][index]
        attention_mask = self.x_y_list[1][index]
        label = self.x_y_list[2][index]
        return input_ids, attention_mask, label
    
    def __len__(self):
        return len(self.x_y_list[0])

In [6]:
#xy = xy[:100]
#xydev = xydev[:20]
input_ids, attention_masks = get_roberta_inputs(xy)
input_ids_dev, attention_masks_dev = get_roberta_inputs(xydev)

In [37]:
input_ids.shape

torch.Size([10000, 2, 25])

In [7]:
from sklearn.model_selection import train_test_split
batch_size = 16
epochs = 20
invalid = torch.tensor(xy.invalid_sent)
invalid_dev = torch.tensor(xydev.invalid_sent)
train_indices, val_indices = train_test_split([i for i in range(10000)])
#train_lists = [input_ids[train_indices], attention_masks[train_indices], invalid[train_indices]]
#val_lists = [input_ids[val_indices], attention_masks[val_indices], invalid[val_indices]]
train_lists = [input_ids, attention_masks, invalid]
val_lists = [input_ids_dev, attention_masks_dev, invalid_dev]

training_dataset = text_dataset(x_y_list = train_lists)
val_dataset = text_dataset(x_y_list = val_lists)

dataloaders_dict = {'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True, num_workers=0),
                   'val':torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
                   }
dataset_sizes = {'train':len(train_lists[0]),
                'val':len(val_lists[0])}


In [None]:
#from torch import optim
from torch.optim import Adam
from torch.optim import lr_scheduler
max_lr = 0.001
lrmain = 2e-5
optim = Adam(sense_model.parameters(), lr=lrmain, eps=1e-8)
criterion = nn.CrossEntropyLoss()

# Create the learning rate scheduler.
scheduler = lr_scheduler.OneCycleLR(optim, max_lr=max_lr, total_steps = int((len(train_lists[0]) / batch_size) * epochs))

In [68]:
lrmain

2e-05

In [69]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""

    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [122]:
import copy
import time

def train(model, criterion, optimizer, scheduler, device, num_epochs=25, phases=['train', 'val']):
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100
    for epoch in range(num_epochs):
        print("Epoch %d" % epoch)
        model.train()  # Set model to training mode
        #running_loss = 0.0
        # Iterate over data.
        for phase in phases:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                print("VALIDATION")
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            corrects = 0
            for inputs, mask, target in dataloaders_dict[phase]:
                inputs = inputs.to(device)            # batch_size x max_sent_len
                mask = mask.to(device)
                target = target.to(device)
                hidden1 = model.init_hidden(inputs.size(0))
                hidden2 = model.init_hidden(inputs.size(0))
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(False):
                    bert_outputs = bert(inputs.reshape((inputs.size(0) * 2, max_seq_length)), attention_mask=mask.reshape((inputs.size(0) * 2, max_seq_length)))
                with torch.set_grad_enabled(phase == 'train'):
                    embeddings = bert_outputs[0].permute(1, 0, 2)
                    emb1 = embeddings[:, ::2, :]
                    emb2 = embeddings[:, 1::2, :]
                    #hidden = model.init_hidden(inputs.size(0))
                    #hidden = repackage_hidden(hidden)
                    logits, h1, h2 = model(emb1, emb2, hidden1, hidden2)
                    loss = criterion(logits, target)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                    running_loss += loss.item() * inputs.size(0)
                    corrects += torch.sum(torch.max(logits, 1)[1] == target)
            epoch_loss = running_loss / dataset_sizes[phase]
            acc = corrects.double() / dataset_sizes[phase]
            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} accuracy: {:.4f}'.format(
                phase, acc))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    #print('Best val Acc: {:4f}'.format(float(best_loss)))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
    

In [123]:
model_ft1 = train(sense_model, criterion, optim, scheduler, device, num_epochs=epochs)

starting
Epoch 0
train total loss: 0.4945 
train accuracy: 0.8088
VALIDATION
val total loss: 0.5606 
val accuracy: 0.7392
saving with loss of 0.5605972791268091 improved over previous 100
Epoch 1
train total loss: 0.4784 
train accuracy: 0.8259
VALIDATION
val total loss: 0.5562 
val accuracy: 0.7462
saving with loss of 0.556222177077918 improved over previous 0.5605972791268091
Epoch 2
train total loss: 0.4620 
train accuracy: 0.8460
VALIDATION
val total loss: 0.5557 
val accuracy: 0.7462
saving with loss of 0.5557334069442844 improved over previous 0.556222177077918
Epoch 3
train total loss: 0.4486 
train accuracy: 0.8603
VALIDATION
val total loss: 0.5712 
val accuracy: 0.7212
Epoch 4
train total loss: 0.4379 
train accuracy: 0.8699
VALIDATION
val total loss: 0.5971 
val accuracy: 0.7041
Epoch 5
train total loss: 0.4221 
train accuracy: 0.8891
VALIDATION
val total loss: 0.5680 
val accuracy: 0.7212
Epoch 6
train total loss: 0.4140 
train accuracy: 0.8967
VALIDATION
val total loss: 0.5

In [78]:
invalid_test = torch.tensor(xytest.invalid_sent)
input_ids_test, attention_masks_test = get_roberta_inputs(xytest)
test_lists = [input_ids_test, attention_masks_test, invalid_test]
test_dataset = text_dataset(x_y_list = test_lists)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, num_workers=0)
        

In [91]:
def generate_results(model, data_loader):
    results = []
    for ipt, maskt, targett in data_loader:
        hidden1 = model.init_hidden(ipt.size(0))
        hidden2 = model.init_hidden(ipt.size(0))
        ipt = ipt.to(device)
        maskt = maskt.to(device)
        bert_outputs = bert(ipt.reshape((ipt.size(0) * 2, max_seq_length)), attention_mask=maskt.reshape((ipt.size(0) * 2, max_seq_length)))
        embeddings = bert_outputs[0].permute(1, 0, 2)
        emb1 = embeddings[:, ::2, :]
        emb2 = embeddings[:, 1::2, :]
        logits, h1, h2 = model(emb1, emb2, hidden1, hidden2)
        for i in range(len(logits)):
            predicted_index = torch.argmax(logits[i]).item()
            results.append(predicted_index)
    return results

In [92]:
results = generate_results(sense_model, test_loader)

In [95]:
pd.concat([x_test.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)
!python3 eval/taskA_scorer.py --gold-labels data/trial/taskA_trial_answer.csv --pred-labels predictions.csv


Accuracy: 77.5854%


# Finetuning RoBERTa using all token embeddings

In [None]:
!pip install transformers

In [3]:
import torch
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
import logging
logging.basicConfig(level=logging.INFO)
def get_roberta_inputs(df):
    input_ids = []
    attention_masks = []
    for i, row in df.iterrows():
        tokendict = tokenizer.encode_plus(row['sent0'], row['sent1'], max_length=50, pad_to_max_length=True)
        input_ids.append(tokendict['input_ids'])
        attention_masks.append(tokendict['attention_mask'])
    return torch.tensor(input_ids), torch.tensor(attention_masks)

In [4]:
input_ids, attention_masks = get_roberta_inputs(xy)
input_ids_dev, attention_masks_dev = get_roberta_inputs(xydev)

In [5]:
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertModel, BertForMaskedLM
max_seq_length = 50
class text_dataset(Dataset):
    def __init__(self,x_y_list):
        self.x_y_list = x_y_list
        
    def __getitem__(self,index):           
        input_ids = self.x_y_list[0][index]
        attention_mask = self.x_y_list[1][index]
        label = self.x_y_list[2][index]
        return input_ids, attention_mask, label
    
    def __len__(self):
        return len(self.x_y_list[0])

In [6]:
from sklearn.model_selection import train_test_split
batch_size = 16
invalid = torch.tensor(xy.invalid_sent)
invalid_dev = torch.tensor(xydev.invalid_sent)
train_lists = [input_ids, attention_masks, invalid]
val_lists = [input_ids_dev, attention_masks_dev, invalid_dev]

training_dataset = text_dataset(x_y_list = train_lists)
val_dataset = text_dataset(x_y_list = val_lists)

dataloaders_dict = {'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True, num_workers=0),
                   'val':torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
                   }
dataset_sizes = {'train':len(train_lists[0]),
                'val':len(val_lists[0])}


In [8]:
from torch import nn
from transformers import RobertaModel
class MyRoBertaForSequenceClassification(nn.Module):
  
    def __init__(self, num_labels=2):
        super(MyRoBertaForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True)
        self.classifier = nn.Linear(38400, num_labels)
        #nn.init.xavier_normal_(self.classifier.weight)
    def forward(self, input_ids, attention_mask=None, labels=None):
        pooled_output = self.bert(input_ids, attention_mask)
        #embeddings = pooled_output[2][-1] + pooled_output[2][-2] + pooled_output[2][-3] + pooled_output[2][-4]
        embeddings = pooled_output[0].reshape(input_ids.size(0), max_seq_length * 768) #16 x 50 x 768
        #embeddings = pooled_output[0][:, 0, :] #16 x 784
        #embeddings = embeddings.reshape((input_ids.size(0), max_seq_length * 768))
        #import pdb; pdb.set_trace()
        logits = self.classifier(embeddings)
        return logits


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MyRoBertaForSequenceClassification().to(device)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /home/ec2-user/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b61d3c50101609b39690
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_e

In [9]:
import torch
import time
import os
import copy
import torch.nn.functional as F
import numpy as np

def train_model(model, criterion, optimizer, scheduler, device, num_epochs=25, phases=['train', 'val']):
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in phases:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                print("VALIDATION")
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            corrects = 0
                        
            # Iterate over data.
            for inputs, mask, target in dataloaders_dict[phase]:
                inputs = inputs.to(device)
                mask = mask.to(device)
                target = target.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs, attention_mask=mask)
                    #import pdb; pdb.set_trace()
                    outputs = F.softmax(outputs,dim=1)  
                    loss = criterion(outputs, target)
                    # backward + optimize only if in training phase
                    if phase == 'train':                        
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                #print('Running loss : {:.4f}'.format(running_loss))
                corrects += torch.sum(torch.max(outputs, 1)[1] == target)
            epoch_loss = running_loss / dataset_sizes[phase]
            acc = corrects.double() / dataset_sizes[phase]
            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} accuracy: {:.4f}'.format(
                phase, acc))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')
                
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    #print('Best val Acc: {:4f}'.format(float(best_loss)))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [10]:
#from torch import optim
from transformers.optimization import AdamW
from transformers import get_linear_schedule_with_warmup
epochs = 3
lrlast = .001
lrmain = 2e-5
optim = AdamW(model.parameters(), lr=lrmain, eps=1e-8)
criterion = nn.CrossEntropyLoss()

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optim, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = (len(train_lists[0]) / batch_size) * epochs)

In [11]:
model_ft1 = train_model(model, criterion, optim, scheduler, device, num_epochs=epochs)

starting
Epoch 0/2
----------
train total loss: 0.5358 
train accuracy: 0.7481
VALIDATION
val total loss: 0.4396 
val accuracy: 0.8696
saving with loss of 0.43964810410856364 improved over previous 100
Epoch 1/2
----------


KeyboardInterrupt: 

In [14]:
invalid_test = torch.tensor(xytest.invalid_sent)
input_ids_test, attention_masks_test = get_roberta_inputs(xytest)
test_lists = [input_ids_test, attention_masks_test, invalid_test]
test_dataset = text_dataset(x_y_list = test_lists)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, num_workers=0)
        

In [58]:
def generate_results(data_loader):
    results = []
    for ipt, maskt, targett in data_loader:
        ipt = ipt.to(device)
        maskt = maskt.to(device)
        outputs = model_ft1(ipt, maskt)
        outputs = F.softmax(outputs,dim=1)
        for i in range(len(outputs)):
            predicted_index = torch.argmax(outputs[i]).item()
            results.append(predicted_index)
    return results

In [50]:
results = generate_results(test_loader)

In [52]:
pd.concat([x_test.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)
!python3 eval/taskA_scorer.py --gold-labels data/trial/taskA_trial_answer.csv --pred-labels predictions.csv


Accuracy: 88.9659%


In [55]:
x_eval = pd.read_csv('data/test/subtaskA_test_data.csv')
input_ids_eval, attention_masks_eval = get_roberta_inputs(x_eval)
eval_lists = [input_ids_eval, attention_masks_eval, invalid]
eval_dataset = text_dataset(x_y_list = eval_lists)
eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size, num_workers=0)
                            

In [59]:
results = generate_results(eval_loader)

In [60]:
len(results)

1000

In [62]:
pd.concat([x_eval.id, pd.Series(results)], axis=1).to_csv('./subtaskA_answers.csv', header=False, index=False)