## HW1: Bag-of-Words based Natural Language Inference

### Part II - MNLI

In [47]:
import numpy as np
import pandas as pd
pd.set_option('max_rows', 50)
pd.set_option('display.max_colwidth', -1)
import matplotlib.pyplot as plt
%matplotlib inline

import random
import spacy
import string
import pickle as pkl
from collections import Counter
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import time

#### Dataset

In [48]:
naming_dict = {'sentence1':'premise', 'sentence2':'hypothesis'}
mnli_train = pd.read_csv('data/mnli_train.tsv', sep = '\t').rename(columns = naming_dict)
mnli_val = pd.read_csv('data/mnli_val.tsv', sep = '\t').rename(columns = naming_dict)

In [49]:
mnli_train.head()

Unnamed: 0,premise,hypothesis,label,genre
0,and now that was in fifty one that 's forty years ago that it was already a problem so it 's now uh,It was already a problem forty years ago but now it 's ten times worse !,neutral,telephone
1,Jon could smell baked bread on the air and his stomach rumbled .,Jon smelt food in the air and was hungry .,neutral,fiction
2,it will be like Italian basketball with the uh with with the uh NBA,This type of Italian basketball is nothing like the NBA .,contradiction,telephone
3,well i think that 's about uh that 's about covered it for me so i think i 'll say good-bye and we 'll,Sorry but we are not done just yet .,contradiction,telephone
4,"Good job tenure , that is -- because in yet another column , she sneers at Sara Davidson for working on Dr. Quinn , Medicine Woman . Why ca n't the silly creature get a perfect job like hers ?","Dr. Quinn , Medicine Woman , was worked on by Sara Davidson .",entailment,slate


In [50]:
mnli_val.head()

Unnamed: 0,premise,hypothesis,label,genre
0,"'Not entirely , ' I snapped , harsher than intended .",I spoke more harshly than I wanted to .,entailment,fiction
1,cook and then the next time it would be my turn and i 'd try to outdo him and then he 'd try to outdo me and we we was really a lot of fun and,I would cook and then the next turn would be his and we would try to outdo each other but sometimes we would get in a fight over things .,contradiction,telephone
2,The disorder hardly seemed to exist before the stimulant Ritalin came along .,The disorder did n't seem to be as common when Ritalin was n't around .,entailment,slate
3,"The Report and Order , in large part , adopts the unanimous recommendations of the Hearing Aid Compatibility Negotiated Rulemaking Committee , an advisory committee established by the Federal Communications Commission in 1995 .",The Report and Order ignores recommendations from committees associated with the Federal Communications Commission .,contradiction,government
4,"IDPA 's OIG 's mission is to prevent , detect , and eliminate fraud , waste , abuse , and misconduct in various payment programs .",IDPA 's OIG 's mission is clear and cares about payment programs .,entailment,government


#### What kind of genres are there in the dataset?

In [51]:
mnli_train['genre'].value_counts()

telephone     4270
slate         4026
travel        3985
government    3883
fiction       3836
Name: genre, dtype: int64

In [52]:
mnli_val['genre'].value_counts()

government    1016
telephone     1005
slate         1002
fiction       995 
travel        982 
Name: genre, dtype: int64

### 3.2 Evaluating on Multi-NLI

You task is to take your best trained model, one each for Logistic Regression and Neural Net based classiﬁer, and evaluate them on the provided MultiNLI data, for each genre.

In [53]:
mnli_val_gov = mnli_val[mnli_val['genre'] == 'government']
mnli_val_tel = mnli_val[mnli_val['genre'] == 'telephone']
mnli_val_slate = mnli_val[mnli_val['genre'] == 'slate']
mnli_val_fiction = mnli_val[mnli_val['genre'] == 'fiction']
mnli_val_travel = mnli_val[mnli_val['genre'] == 'travel']

In [54]:
print(mnli_val_gov.shape)
print(mnli_val_tel.shape)
print(mnli_val_slate.shape)
print(mnli_val_fiction.shape)
print(mnli_val_travel.shape)

(1016, 4)
(1005, 4)
(1002, 4)
(995, 4)
(982, 4)


In [55]:
mnli_val_gov = mnli_val_gov.drop(['genre'], axis = 1).reset_index(drop = True)
mnli_val_tel = mnli_val_tel.drop(['genre'], axis = 1).reset_index(drop = True)
mnli_val_slate = mnli_val_slate.drop(['genre'], axis = 1).reset_index(drop = True)
mnli_val_fiction = mnli_val_fiction.drop(['genre'], axis = 1).reset_index(drop = True)
mnli_val_travel = mnli_val_travel.drop(['genre'], axis = 1).reset_index(drop = True)

In [56]:
print(mnli_val_gov.shape)
print(mnli_val_tel.shape)
print(mnli_val_slate.shape)
print(mnli_val_fiction.shape)
print(mnli_val_travel.shape)

(1016, 3)
(1005, 3)
(1002, 3)
(995, 3)
(982, 3)


#### Load best Logistic Regression model

Logistic Regression (SNLI Validation Accuracy: 63.3%)

- Vocab size: 15000
- Embedding dimension: 125 
- Interaction: element-wise product

In [57]:
class LR_BagOfWords(nn.Module):
    '''
    Bag-of-Words classification model.
    '''
    def __init__(self, vocab_size, emb_dim, interaction):
        '''
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding.
        @param interaction: the way to combine the two vector representations.
        '''
        super(LR_BagOfWords, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx = 0)
        self.interaction = interaction
        if interaction == 'concat':
            self.linear = nn.Linear(2 * emb_dim, 3)
        else:
            self.linear = nn.Linear(emb_dim, 3)
        
    def forward(self, premise, len_premise, hypothesis, len_hypothesis):
        '''
        @param premise: matrix of size (batch_size, max_sentence_length). Each row represents a 
            premise that is represented using n-gram index. Note that they are padded to have same length.
        @param len_premise: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each premise in the data.
        @param hypothesis: matrix of size (batch_size, max_sentence_length). Each row represents a 
            hypothesis that is represented using n-gram index. Note that they are padded to have same length.
        @param len_hypothesis: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each hypothesis in the data.
        '''
        premise_out = self.embed(premise)
        premise_out = torch.sum(premise_out, dim = 1)
        premise_out /= len_premise.view(len_premise.size()[0], 1).expand_as(premise_out).float()
        
        hypothesis_out = self.embed(hypothesis)
        hypothesis_out = torch.sum(hypothesis_out, dim = 1)
        hypothesis_out /= len_hypothesis.view(len_hypothesis.size()[0], 1).expand_as(hypothesis_out).float()
        
        # combine the two vector representations
        out = self.interaction_func(premise_out, hypothesis_out)
     
        # return logits
        out = self.linear(out.float())
        return out
    
    def interaction_func(self, premise, hypothesis):
        '''
        Combines the two vector representations based on the specified interaction method.
        '''
        if self.interaction == 'concat':
            return torch.cat((premise, hypothesis), dim = 1)
        if self.interaction == 'sum':
            return (premise + hypothesis)
        if self.interaction == 'element_wise_product':
            return (premise * hypothesis)   

In [59]:
token2id = pkl.load(open('tokens/logreg_token2id.p', 'rb'))
id2token = pkl.load(open('tokens/logreg_id2token.p', 'rb'))

In [13]:
print(len(token2id))
print(len(id2token))

15002
15002


In [14]:
#best_lr_vocab_size = 15000
best_lr_emb_dim = 125 
best_lr_interaction = 'element_wise_product'
best_lr_path = 'models/best_logreg.pth' 

In [15]:
# load the best LR model
best_lr = LR_BagOfWords(len(id2token), best_lr_emb_dim, best_lr_interaction)
best_lr.load_state_dict(torch.load(best_lr_path))
best_lr.eval()

LR_BagOfWords(
  (embed): Embedding(15002, 125, padding_idx=0)
  (linear): Linear(in_features=125, out_features=3, bias=True)
)

#### Create PyTorch DataLoader

In [16]:
PAD_IDX = 0
UNK_IDX = 1
MAX_SENTENCE_LENGTH = 10
BATCH_SIZE = 128

In [17]:
## Tokenization

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# lowercase and remove punctuation
def tokenize(sent):
  tokens = tokenizer(sent)
  return [token.text.lower() for token in tokens if (token.text not in punctuations)]

def tokenize_dataset(dataset):
    token_dataset = []
    all_tokens = []
    
    for sample in dataset:
        tokens = tokenize(sample)
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens

# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

In [18]:
class SNLIDataset(Dataset):
    '''
    Class that represents a dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    '''
    
    def __init__(self, premise_list, hypothesis_list, label_list):
        '''
        @param premise_list: list of premise tokens 
        @param hypothesis_list: list of hypothesis tokens
        @param label_list: list of labels
        '''
        self.premise_list = premise_list
        self.hypothesis_list = hypothesis_list
        self.label_list = label_list.replace({'entailment': 0, 'neutral': 1, 'contradiction': 2})
        assert (len(self.premise_list) == len(self.hypothesis_list))
        assert (len(self.premise_list) == len(self.label_list))

    def __len__(self):
        return len(self.premise_list)
        
    def __getitem__(self, key):
        '''
        Triggered when you call dataset[i]
        '''
        premise = self.premise_list[key][:MAX_SENTENCE_LENGTH]
        hypothesis = self.hypothesis_list[key][:MAX_SENTENCE_LENGTH]
        label = self.label_list[key]
        return [premise, len(premise), hypothesis, len(hypothesis), label]

In [19]:
## Collate Function
def snli_data_collate_func(batch):
    '''
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length.
    '''
    premise_list = []
    premise_length_list = []
    hypothesis_list = []
    hypothesis_length_list = []
    label_list = []
    
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[4])
        premise_length_list.append(datum[1])
        hypothesis_length_list.append(datum[3])
    
    # padding
    for datum in batch:
        padded_p_vec = np.pad(np.array(datum[0]), 
                                pad_width = ((0, MAX_SENTENCE_LENGTH - datum[1])), 
                                mode = 'constant', constant_values = 0)
        premise_list.append(padded_p_vec)
        
        padded_h_vec = np.pad(np.array(datum[2]), 
                                pad_width = ((0, MAX_SENTENCE_LENGTH - datum[3])), 
                                mode = 'constant', constant_values = 0)
        hypothesis_list.append(padded_h_vec)
        
    return [torch.from_numpy(np.array(premise_list)), torch.LongTensor(premise_length_list), 
            torch.from_numpy(np.array(hypothesis_list)), torch.LongTensor(hypothesis_length_list),
            torch.LongTensor(label_list)]

#### Combine all the data processing steps and evaluation into a single function

In [20]:
# Function for testing the model
def test_model(loader, model):
    '''
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    '''
    correct = 0
    total = 0
    model.eval()
    for premise, len_premise, hypothesis, len_hypothesis, labels in loader:
        outputs = model(premise, len_premise, hypothesis, len_hypothesis)
        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(outputs, labels)
        predicted = F.softmax(outputs, dim = 1).max(1, keepdim = True)[1]
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return loss, (100 * correct / total)

In [21]:
def evaluate_lr_mnli(mnli_val_data, model):
    premise_tokens, all_premise_tokens = tokenize_dataset(mnli_val_data['premise'])
    hypothesis_tokens, all_hypothesis_tokens = tokenize_dataset(mnli_val_data['hypothesis'])
    premise_indices = token2index_dataset(premise_tokens)
    hypothesis_indices = token2index_dataset(hypothesis_tokens)
    val_dataset = SNLIDataset(premise_indices, hypothesis_indices, mnli_val_data['label'])
    val_loader = torch.utils.data.DataLoader(dataset = val_dataset, 
                                             batch_size = BATCH_SIZE,
                                             collate_fn = snli_data_collate_func,
                                             shuffle = True)
    val_loss, val_acc = test_model(val_loader, model)
    return val_loss.item(), val_acc  

In [22]:
mnli_val_dfs = [mnli_val_gov, mnli_val_tel, mnli_val_slate, mnli_val_fiction, mnli_val_travel]
mnli_val_accs_lr = []
for df in mnli_val_dfs:
    val_acc = evaluate_lr_mnli(df, best_lr)[1]
    mnli_val_accs_lr.append(val_acc)

In [23]:
mnli_val_accs_lr

[37.696850393700785,
 35.223880597014926,
 39.12175648702595,
 41.10552763819096,
 34.92871690427699]

#### Load best Neural Network Model

Neural Network (SNLI Validation Accuracy: 65.2%)

- Vocab size: 10000
- Embedding dimension: 150
- Interaction: concat

In [24]:
class NN_BagOfWords(nn.Module):
    '''
    BagOfWords classification model.
    '''
    def __init__(self, vocab_size, emb_dim, interaction, hidden_dim1, hidden_dim2):
        '''
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding.
        @param interaction: the way to combine the two vector representations.
        @param hidden_dim1: dimension of the first hidden layer of neural nets.
        @param hidden_dim2: dimension of the second hidden layer of neural nets.
        '''
        super(NN_BagOfWords, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx = 0)
        self.interaction = interaction
        if interaction == 'concat':
            self.first_layer = nn.Linear(2 * emb_dim, hidden_dim1)
        else:
            self.first_layer = nn.Linear(emb_dim, hidden_dim1)
        self.second_layer = nn.Linear(hidden_dim1, hidden_dim2)
        self.last_linear = nn.Linear(hidden_dim2, 3)
    
    def forward(self, premise, len_premise, hypothesis, len_hypothesis):
        '''
        @param premise: matrix of size (batch_size, max_sentence_length). Each row represents a 
            premise that is represented using n-gram index. Note that they are padded to have same length.
        @param len_premise: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each premise in the data.
        @param hypothesis: matrix of size (batch_size, max_sentence_length). Each row represents a 
            hypothesis that is represented using n-gram index. Note that they are padded to have same length.
        @param len_hypothesis: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each hypothesis in the data.
        '''
        premise_out = self.embed(premise)
        premise_out = torch.sum(premise_out, dim = 1)
        premise_out /= len_premise.view(len_premise.size()[0],1).expand_as(premise_out).float()
     
        hypothesis_out = self.embed(hypothesis)
        hypothesis_out = torch.sum(hypothesis_out, dim = 1)
        hypothesis_out /= len_hypothesis.view(len_hypothesis.size()[0],1).expand_as(hypothesis_out).float()
    
        out = self.interaction_func(premise_out, hypothesis_out)
        
        # return logits
        out = F.relu(self.first_layer(out.float()))
        out = F.relu(self.second_layer(out))
        out = self.last_linear(out)
        return out
    
    def interaction_func(self, premise, hypothesis):
        '''
        Combines the two vector representations based on the specified interaction method.
        '''
        if self.interaction == 'concat':
            return torch.cat((premise, hypothesis), dim = 1)
        if self.interaction == 'sum':
            return (premise + hypothesis)
        if self.interaction == 'element_wise_product':
            return (premise * hypothesis)

In [60]:
token2id = pkl.load(open('tokens/nn_token2id.p', 'rb'))
id2token = pkl.load(open('tokens/nn_id2token.p', 'rb'))

In [26]:
print(len(token2id))
print(len(id2token))

10002
10002


In [61]:
#best_nn_vocab_size = 10000
best_nn_emb_dim = 150
best_nn_interaction = 'concat'
hidden_dim1 = 50
hidden_dim2 = 25
best_nn_path = 'models/best_nn.pth' 

In [28]:
# load the best NN model
best_nn = NN_BagOfWords(len(id2token), best_nn_emb_dim, best_nn_interaction, hidden_dim1, hidden_dim2)
best_nn.load_state_dict(torch.load(best_nn_path))
best_nn.eval()

NN_BagOfWords(
  (embed): Embedding(10002, 150, padding_idx=0)
  (first_layer): Linear(in_features=300, out_features=50, bias=True)
  (second_layer): Linear(in_features=50, out_features=25, bias=True)
  (last_linear): Linear(in_features=25, out_features=3, bias=True)
)

In [29]:
def evaluate_nn_mnli(mnli_val_data, model):
    premise_tokens, all_premise_tokens = tokenize_dataset(mnli_val_data['premise'])
    hypothesis_tokens, all_hypothesis_tokens = tokenize_dataset(mnli_val_data['hypothesis'])
    premise_indices = token2index_dataset(premise_tokens)
    hypothesis_indices = token2index_dataset(hypothesis_tokens)
    val_dataset = SNLIDataset(premise_indices, hypothesis_indices, mnli_val_data['label'])
    val_loader = torch.utils.data.DataLoader(dataset = val_dataset, 
                                             batch_size = BATCH_SIZE,
                                             collate_fn = snli_data_collate_func,
                                             shuffle = True)
    val_loss, val_acc = test_model(val_loader, model)
    return val_loss.item(), val_acc  

In [30]:
mnli_val_accs_nn = []
for df in mnli_val_dfs:
    val_acc = evaluate_nn_mnli(df, best_nn)[1]
    mnli_val_accs_nn.append(val_acc)

In [31]:
mnli_val_accs_nn

[40.45275590551181,
 43.681592039801,
 38.522954091816366,
 43.41708542713568,
 40.73319755600814]

In [32]:
genres = ['government', 'telephone', 'slate', 'fiction', 'travel']
columns = ['Logistic Regression', 'Neural Network']
val_accs_df = pd.DataFrame(list(zip(mnli_val_accs_lr, mnli_val_accs_nn)), columns = columns, index = genres)

In [33]:
pd.options.display.float_format = '{:,.4f}'.format
val_accs_df

Unnamed: 0,Logistic Regression,Neural Network
government,37.6969,40.4528
telephone,35.2239,43.6816
slate,39.1218,38.523
fiction,41.1055,43.4171
travel,34.9287,40.7332


### 3.3 Fine-tuning on MultiNLI

We will ﬁne-tune our SNLI model on training data for each MultiNLI genre (training set), and evaluate it on that genre (validation set).

In [34]:
# best trained SNLI model
best_nn

NN_BagOfWords(
  (embed): Embedding(10002, 150, padding_idx=0)
  (first_layer): Linear(in_features=300, out_features=50, bias=True)
  (second_layer): Linear(in_features=50, out_features=25, bias=True)
  (last_linear): Linear(in_features=25, out_features=3, bias=True)
)

In [35]:
# prepare training data for each genre
mnli_train['genre'].value_counts()

telephone     4270
slate         4026
travel        3985
government    3883
fiction       3836
Name: genre, dtype: int64

In [36]:
mnli_train_tel = mnli_train[mnli_train['genre'] == 'telephone']
mnli_train_slate = mnli_train[mnli_train['genre'] == 'slate']
mnli_train_travel = mnli_train[mnli_train['genre'] == 'travel']
mnli_train_gov = mnli_train[mnli_train['genre'] == 'government']
mnli_train_fiction = mnli_train[mnli_train['genre'] == 'fiction']

In [37]:
print(mnli_train_tel.shape)
print(mnli_train_slate.shape)
print(mnli_train_travel.shape)
print(mnli_train_gov.shape)
print(mnli_train_fiction.shape)

(4270, 4)
(4026, 4)
(3985, 4)
(3883, 4)
(3836, 4)


In [38]:
mnli_train_tel = mnli_train_tel.drop(['genre'], axis = 1).reset_index(drop = True)
mnli_train_slate = mnli_train_slate.drop(['genre'], axis = 1).reset_index(drop = True)
mnli_train_travel = mnli_train_travel.drop(['genre'], axis = 1).reset_index(drop = True)
mnli_train_gov = mnli_train_gov.drop(['genre'], axis = 1).reset_index(drop = True)
mnli_train_fiction = mnli_train_fiction.drop(['genre'], axis = 1).reset_index(drop = True)

In [39]:
print(mnli_train_tel.shape)
print(mnli_train_slate.shape)
print(mnli_train_travel.shape)
print(mnli_train_gov.shape)
print(mnli_train_fiction.shape)

(4270, 3)
(4026, 3)
(3985, 3)
(3883, 3)
(3836, 3)


Since our SNLI model is already trained, we do not need (nor want) to over-train our model. Hence, we can continue training just on a small number of epochs, or use a lower learning rate.

Let's try `num_epochs` = 5, `learning_rate` = 0.001 first.

In [40]:
num_epochs = 5
learning_rate = 0.001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(best_nn.parameters(), lr = learning_rate)

In [41]:
# double checking
for x in best_nn.parameters():
    print(x.shape)

torch.Size([10002, 150])
torch.Size([50, 300])
torch.Size([50])
torch.Size([25, 50])
torch.Size([25])
torch.Size([3, 25])
torch.Size([3])


In [42]:
def train_genre_model(model, train_genre_loader, val_genre_loader, num_epochs, val_1 = 50, val_2 = 100):
    '''
    Train model and evaluate for a specific genre.
    '''
    start = time.time()
    for epoch in range(num_epochs):
        for i, (premise, len_premise, hypothesis, len_hypothesis, labels) in enumerate(train_genre_loader):
            model.train()
            optimizer.zero_grad()
            outputs = model(premise, len_premise, hypothesis, len_hypothesis)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print training loss every val_1 iterations
            if i > 0 and i % val_1 == 0:
                print(f'Epoch: [{epoch+1}/{num_epochs}], Step: [{i+1}/{len(train_genre_loader)}], Train Loss: {loss:.4f}')

            # validate every val_2 iterations
            if i > 0 and i % val_2 == 0:
                # validate
                val_loss, val_acc = test_model(val_genre_loader, model)
                print('Epoch: [{}/{}], Step: [{}/{}], Train Loss: {:.4f}, Validation Loss: {:.4f}, Validation Acc: {}'
                      .format(epoch+1, num_epochs, i+1, len(train_genre_loader), loss, val_loss, val_acc))
        print('-'*50)

    print()

    print('Final Stats:')
    print('After training for {} epochs'.format(num_epochs))
    train_loss, train_acc = test_model(train_genre_loader, model)
    print('Train Loss: {:.4f}, Train Accuracy: {}'.format(train_loss, train_acc))
    val_loss, val_acc = test_model(val_genre_loader, model)
    print('Val loss: {:.4f}, Val Accuracy: {}'.format(val_loss, val_acc))

    print(f'Total training time: {time.time() - start}')
    return val_acc, model

#### Create PyTorch DataLoader

In [43]:
def create_data_loader(df, batch_size = 32):
    '''
    Creates PyTorch DataLoader from a DataFrame for model training and evaluation.
    '''
    premise_tokens, all_premise_tokens = tokenize_dataset(df['premise'])
    hypothesis_tokens, all_hypothesis_tokens = tokenize_dataset(df['hypothesis'])
    premise_indices = token2index_dataset(premise_tokens)
    hypothesis_indices = token2index_dataset(hypothesis_tokens)
    dataset = SNLIDataset(premise_indices, hypothesis_indices, df['label'])
    loader = torch.utils.data.DataLoader(dataset = dataset,
                                         batch_size = batch_size,
                                         collate_fn = snli_data_collate_func,
                                         shuffle = True)
    return loader

In [44]:
train_tel_loader = create_data_loader(mnli_train_tel)
train_slate_loader = create_data_loader(mnli_train_slate)
train_travel_loader = create_data_loader(mnli_train_travel)
train_gov_loader = create_data_loader(mnli_train_gov)
train_fiction_loader = create_data_loader(mnli_train_fiction)

In [45]:
val_tel_loader = create_data_loader(mnli_val_tel)
val_slate_loader = create_data_loader(mnli_val_slate)
val_travel_loader = create_data_loader(mnli_val_travel)
val_gov_loader = create_data_loader(mnli_val_gov)
val_fiction_loader = create_data_loader(mnli_val_fiction)

In [46]:
# government
val_acc_gov, model_gov = train_genre_model(best_nn, train_gov_loader, val_gov_loader, num_epochs)
torch.save(model_gov.state_dict(), 'models/model_gov.pth')

Epoch: [1/5], Step: [51/122], Train Loss: 0.9939
Epoch: [1/5], Step: [101/122], Train Loss: 1.1811
Epoch: [1/5], Step: [101/122], Train Loss: 1.1811, Validation Loss: 1.2477, Validation Acc: 45.07874015748032
--------------------------------------------------
Epoch: [2/5], Step: [51/122], Train Loss: 0.9461
Epoch: [2/5], Step: [101/122], Train Loss: 0.9862
Epoch: [2/5], Step: [101/122], Train Loss: 0.9862, Validation Loss: 1.0220, Validation Acc: 47.24409448818898
--------------------------------------------------
Epoch: [3/5], Step: [51/122], Train Loss: 0.9370
Epoch: [3/5], Step: [101/122], Train Loss: 0.8111
Epoch: [3/5], Step: [101/122], Train Loss: 0.8111, Validation Loss: 1.0948, Validation Acc: 48.12992125984252
--------------------------------------------------
Epoch: [4/5], Step: [51/122], Train Loss: 0.9101
Epoch: [4/5], Step: [101/122], Train Loss: 0.6499
Epoch: [4/5], Step: [101/122], Train Loss: 0.6499, Validation Loss: 1.1584, Validation Acc: 48.12992125984252
-----------

In [47]:
# telephone
val_acc_tel, model_tel = train_genre_model(best_nn, train_tel_loader, val_tel_loader, num_epochs)
torch.save(model_tel.state_dict(), 'models/model_tel.pth')

Epoch: [1/5], Step: [51/134], Train Loss: 1.0533
Epoch: [1/5], Step: [101/134], Train Loss: 1.0438
Epoch: [1/5], Step: [101/134], Train Loss: 1.0438, Validation Loss: 1.0230, Validation Acc: 43.18407960199005
--------------------------------------------------
Epoch: [2/5], Step: [51/134], Train Loss: 0.9272
Epoch: [2/5], Step: [101/134], Train Loss: 1.0601
Epoch: [2/5], Step: [101/134], Train Loss: 1.0601, Validation Loss: 1.1692, Validation Acc: 45.57213930348259
--------------------------------------------------
Epoch: [3/5], Step: [51/134], Train Loss: 1.0609
Epoch: [3/5], Step: [101/134], Train Loss: 0.9378
Epoch: [3/5], Step: [101/134], Train Loss: 0.9378, Validation Loss: 0.9825, Validation Acc: 47.960199004975124
--------------------------------------------------
Epoch: [4/5], Step: [51/134], Train Loss: 0.9295
Epoch: [4/5], Step: [101/134], Train Loss: 0.9514
Epoch: [4/5], Step: [101/134], Train Loss: 0.9514, Validation Loss: 1.2954, Validation Acc: 46.26865671641791
----------

In [48]:
# slate
val_acc_slate, model_slate = train_genre_model(best_nn, train_slate_loader, val_slate_loader, num_epochs)
torch.save(model_slate.state_dict(), 'models/model_slate.pth')

Epoch: [1/5], Step: [51/126], Train Loss: 1.0707
Epoch: [1/5], Step: [101/126], Train Loss: 1.0819
Epoch: [1/5], Step: [101/126], Train Loss: 1.0819, Validation Loss: 0.9088, Validation Acc: 41.417165668662676
--------------------------------------------------
Epoch: [2/5], Step: [51/126], Train Loss: 1.0024
Epoch: [2/5], Step: [101/126], Train Loss: 1.0874
Epoch: [2/5], Step: [101/126], Train Loss: 1.0874, Validation Loss: 0.7009, Validation Acc: 42.81437125748503
--------------------------------------------------
Epoch: [3/5], Step: [51/126], Train Loss: 1.0118
Epoch: [3/5], Step: [101/126], Train Loss: 1.0712
Epoch: [3/5], Step: [101/126], Train Loss: 1.0712, Validation Loss: 0.7466, Validation Acc: 41.31736526946108
--------------------------------------------------
Epoch: [4/5], Step: [51/126], Train Loss: 0.8895
Epoch: [4/5], Step: [101/126], Train Loss: 1.0374
Epoch: [4/5], Step: [101/126], Train Loss: 1.0374, Validation Loss: 1.6993, Validation Acc: 42.41516966067864
----------

In [49]:
# fiction
val_acc_fiction, model_fiction = train_genre_model(best_nn, train_fiction_loader, val_fiction_loader, num_epochs)
torch.save(model_fiction.state_dict(), 'models/model_fiction.pth')

Epoch: [1/5], Step: [51/120], Train Loss: 1.1900
Epoch: [1/5], Step: [101/120], Train Loss: 0.9569
Epoch: [1/5], Step: [101/120], Train Loss: 0.9569, Validation Loss: 0.9110, Validation Acc: 44.02010050251256
--------------------------------------------------
Epoch: [2/5], Step: [51/120], Train Loss: 0.9135
Epoch: [2/5], Step: [101/120], Train Loss: 0.9029
Epoch: [2/5], Step: [101/120], Train Loss: 0.9029, Validation Loss: 0.8610, Validation Acc: 46.733668341708544
--------------------------------------------------
Epoch: [3/5], Step: [51/120], Train Loss: 0.6393
Epoch: [3/5], Step: [101/120], Train Loss: 0.8558
Epoch: [3/5], Step: [101/120], Train Loss: 0.8558, Validation Loss: 0.8840, Validation Acc: 47.93969849246231
--------------------------------------------------
Epoch: [4/5], Step: [51/120], Train Loss: 0.6479
Epoch: [4/5], Step: [101/120], Train Loss: 0.6801
Epoch: [4/5], Step: [101/120], Train Loss: 0.6801, Validation Loss: 1.7502, Validation Acc: 48.44221105527638
----------

In [50]:
# travel
val_acc_travel, model_travel = train_genre_model(best_nn, train_travel_loader, val_travel_loader, num_epochs)
torch.save(model_travel.state_dict(), 'models/model_travel.pth')

Epoch: [1/5], Step: [51/125], Train Loss: 1.0403
Epoch: [1/5], Step: [101/125], Train Loss: 1.0341
Epoch: [1/5], Step: [101/125], Train Loss: 1.0341, Validation Loss: 0.9516, Validation Acc: 45.11201629327902
--------------------------------------------------
Epoch: [2/5], Step: [51/125], Train Loss: 0.9570
Epoch: [2/5], Step: [101/125], Train Loss: 1.0023
Epoch: [2/5], Step: [101/125], Train Loss: 1.0023, Validation Loss: 1.1595, Validation Acc: 43.788187372708755
--------------------------------------------------
Epoch: [3/5], Step: [51/125], Train Loss: 0.7789
Epoch: [3/5], Step: [101/125], Train Loss: 0.7905
Epoch: [3/5], Step: [101/125], Train Loss: 0.7905, Validation Loss: 1.0877, Validation Acc: 42.973523421588595
--------------------------------------------------
Epoch: [4/5], Step: [51/125], Train Loss: 0.6550
Epoch: [4/5], Step: [101/125], Train Loss: 0.7023
Epoch: [4/5], Step: [101/125], Train Loss: 0.7023, Validation Loss: 1.1686, Validation Acc: 47.25050916496945
---------

In [51]:
mnli_val_accs_finetuned_nn = [val_acc_gov, val_acc_tel, val_acc_slate, val_acc_fiction, val_acc_travel]

In [52]:
genres = ['government', 'telephone', 'slate', 'fiction', 'travel']
columns = ['Without Fine-tuning', 'With Fine-tuning']
val_accs_df2 = pd.DataFrame(list(zip(mnli_val_accs_nn, mnli_val_accs_finetuned_nn)), columns = columns, index = genres)

In [53]:
pd.options.display.float_format = '{:,.4f}'.format
val_accs_df2

Unnamed: 0,Without Fine-tuning,With Fine-tuning
government,40.4528,46.752
telephone,43.6816,46.6667
slate,38.523,41.1178
fiction,43.4171,46.8342
travel,40.7332,43.89


#### Further evaluate each ﬁne-tuned model on every other genre, to see if that ﬁne-tuning carries over to the other genres.

In [54]:
def eval_model_genre(model_path, loader_ls):
    '''
    Load a fine-tuned model and evaluate the model on every other genre.
    '''
    model= NN_BagOfWords(len(id2token), best_nn_emb_dim, best_nn_interaction, hidden_dim1, hidden_dim2)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    val_accs = []
    for loader in loader_ls:
        val_acc = test_model(loader, model)[1]
        val_accs.append(val_acc)
    return val_accs

In [62]:
gov_model_path = 'models/model_gov.pth'
tel_model_path = 'models/model_tel.pth'
slate_model_path = 'models/model_slate.pth'
fiction_model_path = 'models/model_fiction.pth'
travel_model_path = 'models/model_travel.pth'
loader_ls = [val_gov_loader, val_tel_loader, val_slate_loader, val_fiction_loader, val_travel_loader]

In [56]:
# without fine-tuning
mnli_val_accs_nn

[40.45275590551181,
 43.681592039801,
 38.522954091816366,
 43.41708542713568,
 40.73319755600814]

In [57]:
# evaluate the fine-tuned model on every other genre
val_accs_gov = eval_model_genre(gov_model_path, loader_ls)
val_accs_tel = eval_model_genre(tel_model_path, loader_ls)
val_accs_slate = eval_model_genre(slate_model_path, loader_ls)
val_accs_fiction = eval_model_genre(fiction_model_path, loader_ls)
val_accs_travel = eval_model_genre(travel_model_path, loader_ls)

In [58]:
col_genres = ['government', 'telephone', 'slate', 'fiction', 'travel']
idx_models = ['No Fine-tuning', 'Fine-tuned on government', 'Fine-tuned on telephone', 'Fine-tuned on slate',
              'Fine-tuned on fiction', 'Fine-tuned on travel']
val_accs_df3 = pd.DataFrame([mnli_val_accs_nn, val_accs_gov, val_accs_tel, 
                            val_accs_slate, val_accs_fiction, val_accs_travel], columns = col_genres, index = idx_models)

In [59]:
pd.options.display.float_format = '{:,.4f}'.format
val_accs_df3

Unnamed: 0,government,telephone,slate,fiction,travel
No Fine-tuning,40.4528,43.6816,38.523,43.4171,40.7332
Fine-tuned on government,46.752,45.5721,39.521,42.9146,42.5662
Fine-tuned on telephone,47.5394,46.6667,41.517,44.1206,42.7699
Fine-tuned on slate,47.2441,46.2687,41.1178,42.7136,44.3992
Fine-tuned on fiction,45.6693,45.8706,40.8184,46.8342,46.1303
Fine-tuned on travel,47.0472,45.6716,41.4172,46.3317,43.89
