## Data Loading

In [3]:
import pandas as pd
import numpy as np
x_train = pd.read_csv('data/train/subtaskA_data_all.csv')
y_train = pd.read_csv('data/train/subtaskA_answers_all.csv', header=None)
x_test = pd.read_csv('data/trial/taskA_trial_data.csv')
y_test = pd.read_csv('data/trial/taskA_trial_answer.csv', header=None)
y_train = y_train.rename(columns={0 : 'id',1: 'invalid_sent'})
y_test = y_test.rename(columns={0 : 'id',1: 'invalid_sent'})

In [9]:
xy = x_train.merge(y_train)
items = []
for row in xy.itertuples():
    if row.invalid_sent == 0:
        items.append({'id' : row.id, 'sent' : row.sent0, 'isvalid' : 0.0})
        items.append({'id' : row.id, 'sent' : row.sent1, 'isvalid' : 1.0})
    else:
        items.append({'id' : row.id, 'sent' : row.sent0, 'isvalid' : 1.0})
        items.append({'id' : row.id, 'sent' : row.sent1, 'isvalid' : 0.0})
traindf = pd.DataFrame(items)

xytest = x_test.merge(y_test)
items = []
for row in xytest.itertuples():
    if row.invalid_sent == 0:
        items.append({'id' : row.id, 'sent' : row.sent0, 'isvalid' : 0.0})
        items.append({'id' : row.id, 'sent' : row.sent1, 'isvalid' : 1.0})
    else:
        items.append({'id' : row.id, 'sent' : row.sent0, 'isvalid' : 1.0})
        items.append({'id' : row.id, 'sent' : row.sent1, 'isvalid' : 0.0})
        
testdf = pd.DataFrame(items)

## Baseline - Logistic Regression

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

ps = PorterStemmer()
stop_words = set(stopwords.words('english')) 
def preprocess(x):
    x = x.lower()
    tokens = word_tokenize(x)
    filtered_tokens = [w for w in tokens if not w in stop_words] 
    return ' '.join([ps.stem(t) for t in filtered_tokens])
trainpdf = traindf.sent.apply(preprocess)
testpdf = testdf.sent.apply(preprocess)

[nltk_data] Downloading package punkt to /Users/goelprat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/goelprat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0, lowercase=False, max_features=3000)
vectorizer.fit(trainpdf)
len(vectorizer.vocabulary_.keys())

3000

In [None]:
X_train = vectorizer.transform(trainpdf)
X_test = vectorizer.transform(testpdf)
Y_train = traindf.isvalid
Y_test = testdf.isvalid

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
score = classifier.score(X_test, Y_test)
score

0.5774369124195943

Baseline accuracy is 57%. I now use Keras to essentially do simple logistic regression just to practice grid search. 

In [None]:
### Answer 5
from keras.datasets import cifar10
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
### Answer 3
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

def create_model(optimizer='sgd'):
    model = Sequential()
    model.add(Dense(units = 1, activation = 'sigmoid', input_dim = 3000))
    # Describe the loss and how it is optimized
    model.compile(loss = 'mean_squared_error', optimizer = optimizer, metrics = ['accuracy'])
    return model

def grid_search():
    optimizers = []
    model = KerasClassifier(build_fn=create_model)
    optimizer = ['SGD', 'Adam', 'RMSProp']
    param_grid = dict(epochs=[20, 40, 60], batch_size=[20, 40, 60], optimizer=optimizer)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
    grid_result = grid.fit(x_train, y_train)

    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
        
model = create_model(optimizer='adam')
model.fit(X_train, Y_train, epochs=40, batch_size=40)


In [None]:
model.evaluate(X_test, Y_test)



[0.24135216026539735, 0.5821375846862793]

## Using BERT with pretrained weights to get sentence embedding

In [None]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def bert_input(df):
    bertdf = df.sent.apply(lambda x : "[CLS] " + x + " [SEP]")
    bertdf_tokenized = bertdf.apply((lambda x: tokenizer.encode(x)))
    max_len = 0
    for i in bertdf_tokenized.values:
        if len(i) > max_len:
            max_len = len(i)

    padded = np.array([i + [0]*(max_len-len(i)) for i in bertdf_tokenized.values])
    attention_mask = np.where(padded != 0, 1, 0)
    input_ids = torch.tensor(padded)  
    attention_mask = torch.tensor(attention_mask)

    return input_ids, attention_mask

In [None]:
model = BertModel.from_pretrained('bert-base-uncased')
input_ids_train, attention_mask_train = bert_input(traindf)
with torch.no_grad():
    hidden_train = model(input_ids_train, attention_mask=attention_mask_train)
features_train = hidden_train[0][:,0,:].numpy()

input_ids_test, attention_mask_test = bert_input(testdf)
with torch.no_grad():
    hidden_test = model(input_ids_test, attention_mask=attention_mask_test)
features_test = hidden_test[0][:,0,:].numpy()


In [None]:
Y_train = traindf.isvalid
Y_test = testdf.isvalid

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(features_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
score = classifier.score(features_test, Y_test)
score

0.6264225630875804

## Using BERT to get perplexity

In the [paper](https://arxiv.org/abs/1906.00363) published along with this task, the authors claim to get 70% accuracy by using BERT to get sentence perplexity. I try to replicate that claim in this section.

In [None]:
bertdf = traindf.sent.apply(lambda x : "[CLS] " + x + " [SEP]")
testbertdf = testdf.sent.apply(lambda x : "[CLS] " + x + " [SEP]")

In [None]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_tensors = []
for sent in bertdf:
    # Tokenize input
    tokenized_text = tokenizer.tokenize(sent)
    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    train_tensors.append(tokens_tensor)
    
test_tensors = []
for sent in testbertdf:
    # Tokenize input
    tokenized_text = tokenizer.tokenize(sent)
    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    test_tensors.append(tokens_tensor)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/goelprat/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [None]:
from transformers import BertForNextSentencePrediction
#model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/goelprat/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
INFO:transformers.configuration_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file https://s3.amazona

In [None]:
import torch.nn.functional as F
import math

loss_fct = torch.nn.CrossEntropyLoss()

def run_tokens(s):
    tokenized_text = tokenizer.tokenize(s)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_ids = [0] * tokens_tensor.shape[1]
    segments_tensors = torch.tensor([segments_ids])
    prediction = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = prediction[0]
    predicted_tokens = []
    for i in range(tokens_tensor.shape[1]):
        predicted_index = torch.argmax(predictions[0, i]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        predicted_tokens.append(predicted_token)
    return predicted_tokens

def run_perp(s):
    tokenized_text = tokenizer.tokenize(s)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_ids = [0] * tokens_tensor.shape[1]
    segments_tensors = torch.tensor([segments_ids])
    predictions = model(tokens_tensor, token_type_ids=segments_tensors)
    loss = loss_fct(predictions[0].squeeze(),tokens_tensor.squeeze()).data 
    return math.exp(loss)

def run(sents, mode):
    model.eval()
    f = run_perp if mode == 'perp' else run_tokens
    out = []
    with torch.no_grad():
        for sent in sents:
            out.append(f(sent))
    return out

In [None]:
sents = ['[CLS] my sister eats an apple after breakfast every day . [SEP]', '[CLS] my sister eats a stone after breakfast every day . [SEP]']
run(sents, 'perp')


[42.116081023755655, 38.70850197316724]

In [None]:
sents = []
for s in testdf.sent:
    sents.append('[CLS] ' + s + ' . [SEP]')

In [None]:
perps = run(sents, 'perp')

In [None]:
def get_results_from_perps(perps):
    results = []
    for i in range(0, len(perps), 2):
        if perps[i] < perps[i + 1]:
            results.append(1)
        else:
            results.append(0)
    return results

results = get_results_from_perps(perps)

I was unable to get better than random results using BERT to get sentence probability. I believe this is because BERT outputs probability conditional on all the other tokens. This means that multiplying the probabilities for tokens in a sentence does not yield the probability of the sentence. 

## Using GPT-2 to get sentence probability

In [None]:
import math
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Model
gpt_model = GPT2Model.from_pretrained('gpt2')
gpt_model.eval()
# Load pre-trained model (weights)
gpt_model_lm = GPT2LMHeadModel.from_pretrained('gpt2-large')
gpt_model_lm.eval()
# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

loss_fct = torch.nn.CrossEntropyLoss()






  0%|          | 0/3247202234 [00:00<?, ?B/s][A[A[A[A[A




  0%|          | 52224/3247202234 [00:00<2:15:16, 400076.95B/s][A[A[A[A[A




  0%|          | 261120/3247202234 [00:00<1:45:30, 512920.81B/s][A[A[A[A[A




  0%|          | 896000/3247202234 [00:00<1:16:24, 708181.49B/s][A[A[A[A[A




  0%|          | 1904640/3247202234 [00:00<55:04, 982127.89B/s] [A[A[A[A[A




  0%|          | 3354624/3247202234 [00:00<39:39, 1363441.83B/s][A[A[A[A[A




  0%|          | 4780032/3247202234 [00:00<28:53, 1870037.39B/s][A[A[A[A[A




  0%|          | 6553600/3247202234 [00:00<21:09, 2552565.94B/s][A[A[A[A[A




  0%|          | 7737344/3247202234 [00:00<16:27, 3281951.71B/s][A[A[A[A[A




  0%|          | 9681920/3247202234 [00:00<12:20, 4372242.88B/s][A[A[A[A[A




  0%|          | 11355136/3247202234 [00:01<09:36, 5616877.25B/s][A[A[A[A[A




  0%|          | 13186048/3247202234 [00:01<07:36, 7086268.35B/s][A[A[A[A[A




  0%|

In [None]:
def gpt_input(df):
    gptdf = df.sent.apply(lambda x : ' <|endoftext|> ' + x)
    gptdf_tokenized = gptdf.apply((lambda x: tokenizer.encode(x)))
    max_len = 0
    for i in gptdf_tokenized.values:
        if len(i) > max_len:
            max_len = len(i)

    padded = np.array([i + [0]*(max_len-len(i)) for i in gptdf_tokenized.values])
    attention_mask = np.where(padded != 0, 1, 0)
    input_ids = torch.tensor(padded)  
    attention_mask = torch.tensor(attention_mask)

    return input_ids, attention_mask

In [None]:
def gpt_predictions(sentence, model):
    sentence = ' <|endoftext|> ' + sentence
    tokenize_input = tokenizer.tokenize(sentence, add_prefix_space=True)
    tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    return model(tokens_tensor)
    
def get_last_hidden_state(sentence, model):
    predictions = gpt_predictions(sentence, model)
    return predictions[0].squeeze()[-1]

def gpt_score(sentence):
    sentence = ' <|endoftext|> ' + sentence
    tokenize_input = tokenizer.tokenize(sentence, add_prefix_space=True)
    tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    predictions=gpt_model_lm(tokens_tensor)
    loss = loss_fct(predictions[0].squeeze()[:-1],tokens_tensor.squeeze()[1:]).data 
    return math.exp(loss)

def gpt_tokens(input_sentence):
    sentence = '<|endoftext|> ' + input_sentence
    tokenize_input = tokenizer.tokenize(sentence)
    tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    predictions =gpt_model(tokens_tensor)
    predicted_tokens = []
    for i in range(tokens_tensor.shape[1]):
        predicted_index = torch.argmax(predictions[0][0, i]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        predicted_tokens.append(predicted_token)
    return predicted_tokens

In [None]:
def run(sents, mode='perp'):
    out = []
    with torch.no_grad():
        for sent in sents:
            if mode == 'perp':
                out.append(gpt_score(sent))
            else:
                out.append(gpt_tokens(sent))
    return out

In [None]:
run(['he put an elephant into the fridge', 'he put a turkey into the fridge'], mode='perp')

[628.2036629979144, 448.928654083428]

In [None]:
input_ids, attention_mask = gpt_input(testdf)
with torch.no_grad():
    predictions = gpt_model_lm(input_ids)

In [None]:
i = 1
loss = loss_fct(predictions[0][i, :-1, :],input_ids[i, 1:]).data 
math.exp(loss)

150.6329864510042

In [None]:
perps = []
for i in range(4042):
    loss = loss_fct(predictions[0][i, :-1, :],input_ids[i, 1:]).data 
    perps.append(math.exp(loss))

In [None]:
def get_results_from_perps(perps):
    results = []
    for i in range(0, len(perps), 2):
        if perps[i] < perps[i + 1]:
            results.append(1)
        else:
            results.append(0)
    return results

results = get_results_from_perps(perps)

In [None]:
perps[:10]

[140.16610427159335,
 150.6329864510042,
 102.33351636157093,
 132.5085569937971,
 71.10768670874523,
 67.34356791600864,
 46.63606115424475,
 62.187596293418444,
 146.08338508822348,
 115.9838952424347]

In [None]:
testdf[:20]

Unnamed: 0,id,sent,isvalid
0,1,he put an elephant into the fridge,0.0
1,1,he put a turkey into the fridge,1.0
2,2,my sister eats an apple after breakfast every day,1.0
3,2,my sister eats a stone after breakfast every day,0.0
4,3,money can be used for buying cars,1.0
5,3,money can be used for buying stars,0.0
6,4,New York is located in the northeastern part o...,1.0
7,4,USA is located in the northeastern part of New...,0.0
8,5,a man can better see stars and the moon in day...,0.0
9,5,a man can hardly see stars and the moon in day...,1.0


In [None]:
!source activate fastai

In [None]:
pd.concat([x_test.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)
!python3 eval/taskA_scorer.py --gold-labels data/trial/taskA_trial_answer.csv --pred-labels predictions.csv

Accuracy: 71.3508%


These results using the GPT-2 small model match the ~70% baseline reported in the paper. 
Using the large model, I was able to get 71.35% accuracy. 

Next, I experiment with using the final hidden state output of the GPT-2 model to classify a sentence as for or against common sense. 

In [None]:
def run_classification(sents, model):
    import numpy as np
    x = np.zeros((len(sents), 768))
    with torch.no_grad():
        for i in range(len(sents)):
            h = get_last_hidden_state(sents[i], model)
            x[i] = h
    return x

In [None]:
sents = [x for x in traindf.sent]
x = run_classification(sents, gpt_model)
y = traindf.isvalid

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(x, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
x_test = run_classification([x for x in testdf.sent], gpt_model)

In [None]:
y_test = testdf.isvalid

In [None]:
classifier.score(x_test, y_test)

0.591044037605146

Now, I try to find examples that are incorrectly classified by my GPT-2 based model. 

In [None]:
test_answers = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/trial/taskA_trial_answer.csv', header=None)

In [None]:
results

In [None]:
wrong_examples = x_test[test_answers[1] != pd.Series(results)]

In [None]:
items = []
for row in wrong_examples.itertuples():
    items.append({'id' : row.id, 'sent' : row.sent0})
    items.append({'id' : row.id, 'sent' : row.sent1})
wrongdf = pd.DataFrame(items)

In [39]:
wrongdf.sent

0       a man can better see stars and the moon in day...
1       a man can hardly see stars and the moon in day...
2                                   I work 25 hours a day
3                                    I work 8 hours a day
4        I changed my direction when passing a crossroads
                              ...                        
1273                    Jim downloads music from the book
1274              Bob goes to bed because he feels sleepy
1275             Bob goes to work because he feels sleepy
1276    people have to hold onto their hats because of...
1277    people have to hold onto their shoes because o...
Name: sent, Length: 1278, dtype: object

In [3]:
with open('wrong_examples.txt', 'w') as f:
    for s in wrongdf.sent:
        f.write(str(s) + "\n")

NameError: name 'wrongdf' is not defined

## Finetuning BERT using transformers library

In [31]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def get_bert_inputs(df):
    input_ids = []
    attention_masks = []
    token_type_ids = []
    for i, row in df.iterrows():
        tokendict = tokenizer.encode_plus(row['sent0'], row['sent1'], max_length=50, pad_to_max_length=True)
        input_ids.append(tokendict['input_ids'])
        attention_masks.append(tokendict['attention_mask'])
        token_type_ids.append(tokendict['token_type_ids'])
    return torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(token_type_ids)

In [32]:
input_ids, attention_masks, token_type_ids = get_bert_inputs(xy)

In [33]:
input_ids_test, attention_masks_test, token_type_ids_test = get_bert_inputs(xytest)

In [34]:
class MyBertForSequenceClassification(nn.Module):  
    def __init__(self, num_labels=1):
        super(MyBertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
        return pooled_output[0]

In [35]:
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertModel, BertForMaskedLM
max_seq_length = 50
class text_dataset(Dataset):
    def __init__(self,x_y_list):
        self.x_y_list = x_y_list
        
    def __getitem__(self,index):           
        input_ids = self.x_y_list[0][index]
        token_type_ids = self.x_y_list[1][index]
        attention_mask = self.x_y_list[2][index]
        label = self.x_y_list[3][index]
        return input_ids, token_type_ids, attention_mask, label
    
    def __len__(self):
        return len(self.x_y_list[0])

In [36]:
import torch
import time
import os
import copy
import torch.nn.functional as F
import numpy as np

def train_model(model, criterion, optimizer, scheduler, device, num_epochs=25):
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                print("VALIDATION")
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            corrects = 0
                        
            # Iterate over data.
            for inputs, token_types, mask, target in dataloaders_dict[phase]:
                inputs = inputs.to(device)
                token_types = token_types.to(device)
                mask = mask.to(device)
                target = target.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs, token_type_ids=token_types, attention_mask=mask)
                    outputs = F.softmax(outputs,dim=1)  
                    loss = criterion(outputs, target)
                    # backward + optimize only if in training phase
                    if phase == 'train':                        
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                #print('Running loss : {:.4f}'.format(running_loss))
                corrects += torch.sum(torch.max(outputs, 1)[1] == target)
            epoch_loss = running_loss / dataset_sizes[phase]
            acc = corrects.double() / dataset_sizes[phase]
            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} accuracy: {:.4f}'.format(
                phase, acc))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')
                
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    #print('Best val Acc: {:4f}'.format(float(best_loss)))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [37]:
from sklearn.model_selection import train_test_split
batch_size = 16
epochs = 10
invalid = torch.tensor(xy.invalid_sent)
train_indices, val_indices = train_test_split([i for i in range(10000)])
train_lists = [input_ids[train_indices], token_type_ids[train_indices], attention_masks[train_indices], invalid[train_indices]]
val_lists = [input_ids[val_indices], token_type_ids[val_indices], attention_masks[val_indices], invalid[val_indices]]
test_lists = [input_ids_test, token_type_ids_test, attention_masks_test, invalid]
training_dataset = text_dataset(x_y_list = train_lists)
val_dataset = text_dataset(x_y_list = val_lists)
test_dataset = text_dataset(x_y_list = test_lists)

dataloaders_dict = {'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True, num_workers=0),
                   'val':torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
                   }
dataset_sizes = {'train':len(train_lists[0]),
                'val':len(val_lists[0])}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [24]:
len(train_indices)

7500

In [38]:
model = MyBertForSequenceClassification().to(device)

In [39]:
#from torch import optim
from transformers.optimization import AdamW
from transformers import get_linear_schedule_with_warmup
lrlast = .001
lrmain = 2e-5
optim = AdamW(model.bert.parameters(), lr=lrmain, eps=1e-8)
criterion = nn.CrossEntropyLoss()

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optim, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = (len(train_indices) / batch_size) * epochs)

In [40]:
model_ft1 = train_model(model, criterion, optim, scheduler, device, num_epochs=epochs)

starting
Epoch 0/9
----------
train total loss: 0.6505 
train accuracy: 0.6176
VALIDATION
val total loss: 0.5882 
val accuracy: 0.7072
saving with loss of 0.5881836494445801 improved over previous 100
Epoch 1/9
----------
train total loss: 0.5276 
train accuracy: 0.7779
VALIDATION
val total loss: 0.5820 
val accuracy: 0.7176
saving with loss of 0.581999572134018 improved over previous 0.5881836494445801
Epoch 2/9
----------
train total loss: 0.4515 
train accuracy: 0.8576
VALIDATION
val total loss: 0.5868 
val accuracy: 0.7156
Epoch 3/9
----------


KeyboardInterrupt: 

In [None]:
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, num_workers=0)
results = []
for ipt, toktypes, maskt, targett in test_loader:
    ipt = ipt.to(device)
    toktypes = toktypes.to(device)
    maskt = maskt.to(device)
    outputs = model_ft1(ipt, toktypes, maskt)
    outputs = F.softmax(outputs,dim=1)
    for i in range(len(outputs)):
        predicted_index = torch.argmax(outputs[i]).item()
        results.append(predicted_index)

In [None]:
len(results)

2021

In [4]:
pd.concat([x_test.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)
!python3 data/taskA_scorer.py --gold-labels data/trial/taskA_trial_answer.csv --pred-labels predictions.csv


NameError: name 'results' is not defined

In [None]:
test_answers = pd.read_csv('data/trial/taskA_trial_answer.csv', header=None)
wrong_examples = x_test[test_answers[1] != pd.Series(results)]
items = []
for row in wrong_examples.itertuples():
    items.append({'id' : row.id, 'sent' : row.sent0})
    items.append({'id' : row.id, 'sent' : row.sent1})
wrongdf = pd.DataFrame(items)
with open('wrong_examples.txt', 'w') as f:
    for s in wrongdf.sent:
        f.write(str(s) + "\n")

# Using RoBERTa

In [6]:
import torch
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
def get_bert_inputs(df):
    input_ids = []
    attention_masks = []
    for i, row in df.iterrows():
        tokendict = tokenizer.encode_plus(row['sent0'], row['sent1'], max_length=50, pad_to_max_length=True)
        input_ids.append(tokendict['input_ids'])
        attention_masks.append(tokendict['attention_mask'])
    return torch.tensor(input_ids), torch.tensor(attention_masks)

In [12]:
input_ids, attention_masks = get_bert_inputs(xy)

In [5]:
from torch import nn
from transformers import RobertaForSequenceClassification
class MyRoBertaForSequenceClassification(nn.Module):
  
    def __init__(self, num_labels=1):
        super(MyRoBertaForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = RobertaForSequenceClassification.from_pretrained('roberta-base')
        #self.classifier = nn.Linear(768, num_labels)
        #nn.init.xavier_normal_(self.classifier.weight)
    def forward(self, input_ids, attention_mask=None, labels=None):
        pooled_output = self.bert(input_ids, attention_mask)
        return pooled_output[0]


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MyRoBertaForSequenceClassification().to(device)

In [6]:
device

device(type='cuda', index=0)

In [7]:
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertModel, BertForMaskedLM
max_seq_length = 50
class text_dataset(Dataset):
    def __init__(self,x_y_list):
        self.x_y_list = x_y_list
        
    def __getitem__(self,index):           
        input_ids = self.x_y_list[0][index]
        attention_mask = self.x_y_list[1][index]
        label = self.x_y_list[2][index]
        return input_ids, attention_mask, label
    
    def __len__(self):
        return len(self.x_y_list[0])

In [13]:
from sklearn.model_selection import train_test_split
batch_size = 16
epochs = 10
invalid = torch.tensor(xy.invalid_sent)
train_indices, val_indices = train_test_split([i for i in range(10000)])
train_lists = [input_ids[train_indices], attention_masks[train_indices], invalid[train_indices]]
val_lists = [input_ids[val_indices], attention_masks[val_indices], invalid[val_indices]]
training_dataset = text_dataset(x_y_list = train_lists)
val_dataset = text_dataset(x_y_list = val_lists)

dataloaders_dict = {'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True, num_workers=0),
                   'val':torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
                   }
dataset_sizes = {'train':len(train_lists[0]),
                'val':len(val_lists[0])}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [11]:
#from torch import optim
from transformers.optimization import AdamW
from transformers import get_linear_schedule_with_warmup
lrlast = .001
lrmain = 2e-5
optim = AdamW(model.bert.parameters(), lr=lrmain, eps=1e-8)
criterion = nn.CrossEntropyLoss()

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optim, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = (len(train_indices) / batch_size) * epochs)

In [12]:
import torch
import time
import os
import copy
import torch.nn.functional as F
import numpy as np

def train_model(model, criterion, optimizer, scheduler, device, num_epochs=25):
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                print("VALIDATION")
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            corrects = 0
                        
            # Iterate over data.
            for inputs, mask, target in dataloaders_dict[phase]:
                inputs = inputs.to(device)
                mask = mask.to(device)
                target = target.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs, attention_mask=mask)
                    outputs = F.softmax(outputs,dim=1)  
                    loss = criterion(outputs, target)
                    # backward + optimize only if in training phase
                    if phase == 'train':                        
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                #print('Running loss : {:.4f}'.format(running_loss))
                corrects += torch.sum(torch.max(outputs, 1)[1] == target)
            epoch_loss = running_loss / dataset_sizes[phase]
            acc = corrects.double() / dataset_sizes[phase]
            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} accuracy: {:.4f}'.format(
                phase, acc))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')
                
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    #print('Best val Acc: {:4f}'.format(float(best_loss)))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [13]:
model_ft1 = train_model(model, criterion, optim, scheduler, device, num_epochs=3)


starting
Epoch 0/2
----------
train total loss: 0.5799 
train accuracy: 0.7013
VALIDATION
val total loss: 0.4807 
val accuracy: 0.8168
saving with loss of 0.48069591817855833 improved over previous 100
Epoch 1/2
----------
train total loss: 0.4654 
train accuracy: 0.8385
VALIDATION
val total loss: 0.4626 
val accuracy: 0.8412
saving with loss of 0.46262081069946287 improved over previous 0.48069591817855833
Epoch 2/2
----------
train total loss: 0.4265 
train accuracy: 0.8809
VALIDATION
val total loss: 0.4560 
val accuracy: 0.8484
saving with loss of 0.45603187313079835 improved over previous 0.46262081069946287
Training complete in 6m 26s


In [14]:
input_ids_test, attention_masks_test = get_bert_inputs(xytest)
test_lists = [input_ids_test, attention_masks_test, invalid]
test_dataset = text_dataset(x_y_list = test_lists)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, num_workers=0)
        

In [15]:
x_eval = pd.read_csv('data/test/subtaskA_test_data.csv')
input_ids_eval, attention_masks_eval = get_bert_inputs(x_eval)
eval_lists = [input_ids_eval, attention_masks_eval, invalid]
eval_dataset = text_dataset(x_y_list = eval_lists)
eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size, num_workers=0)
                            

In [None]:
results = []
for ipt, maskt, targett in test_loader:
    ipt = ipt.to(device)
    maskt = maskt.to(device)
    outputs = model_ft1(ipt, maskt)
    outputs = F.softmax(outputs,dim=1)
    for i in range(len(outputs)):
        predicted_index = torch.argmax(outputs[i]).item()
        results.append(predicted_index)

In [17]:
pd.concat([x_test.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)
!python3 eval/taskA_scorer.py --gold-labels data/trial/taskA_trial_answer.csv --pred-labels predictions.csv


Accuracy: 85.0074%


In [None]:
test_answers = pd.read_csv('data/trial/taskA_trial_answer.csv', header=None)
wrong_examples = x_test[test_answers[1] != pd.Series(results)]
items = []
for row in wrong_examples.itertuples():
    items.append({'id' : row.id, 'sent' : row.sent0})
    items.append({'id' : row.id, 'sent' : row.sent1})
wrongdf = pd.DataFrame(items)
with open('wrong_examples.txt', 'w') as f:
    for s in wrongdf.sent:
        f.write(str(s) + "\n")

## Evaluation

In [46]:
pd.concat([x_test.id, pd.Series(results)], axis=1).to_csv('./predictions.csv', header=False, index=False)
!python eval/taskA_scorer.py --gold-labels data/trial/taskA_trial_answer.csv --pred-labels predictions.csv

python3: can't open file 'eval/taskA_scorer.py': [Errno 2] No such file or directory
