In [1]:
# import dependencies
import io
import nltk
import json
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader

In [2]:
num_gpus = torch.cuda.device_count()
if num_gpus > 0:
    current_device = 'cuda'
else:
    current_device = 'cpu'

FloatTensor = torch.cuda.FloatTensor if current_device == 'cuda' else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if current_device == 'cuda' else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if current_device == 'cuda' else torch.ByteTensor

In [3]:
def convCategories(categories, category_to_index):
    return [category_to_index[category] for category in categories]

In [4]:
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=1):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [5]:
def tokenize_dataset(dataset, word_to_index):
    _current_dictified = []
    for l in tqdm(dataset['tokens']):
        encoded_l = [word_to_index[i] if i in word_to_index else word_to_index['<UNK>'] for i in l]
        _current_dictified.append(encoded_l)
    return _current_dictified

In [6]:
MAX_SEN_LEN = 2500
class TensoredDataset(Dataset):
    def __init__(self, list_of_lists_of_tokens, labels):
        self.input_tensors = []
        self.target_tensors = []
        
        for i in range(0, len(list_of_lists_of_tokens)):
            self.input_tensors.append(torch.tensor(list_of_lists_of_tokens[i], dtype=torch.long))
            self.target_tensors.append(torch.tensor(labels[i], dtype=torch.long))
    
    def __len__(self):
        return len(self.input_tensors)
    
    def __getitem__(self, idx):
        # return a (input, target) tuple
        return (self.input_tensors[idx][:MAX_SEN_LEN], self.target_tensors[idx])

In [7]:
def pad_list_of_tensors(list_of_tensors, pad_token):
    max_length = max([t.size(-1) for t in list_of_tensors])
    padded_list = []
    
    for t in list_of_tensors:
        #print(t.reshape(1, -1).shape)
        #print(torch.tensor([[pad_token]*(max_length - t.size(-1))])[0].shape)
        padded_tensor = torch.cat([t.reshape(1, -1), torch.tensor([[pad_token]*(max_length - t.size(-1))], dtype=torch.long)], dim = -1)
        padded_list.append(padded_tensor)
    padded_tensor = torch.cat(padded_list, dim=0)
    return padded_tensor

def transform_labels(target_list):
    padded_list = []
    for t in target_list:
        labels = t.unsqueeze(0)
        target = torch.zeros(labels.size(0), len(category_to_index)).scatter_(1, labels, 1)
        padded_list.append(target)
    padded_tensor = torch.cat(padded_list, dim=0)
    return padded_tensor

def pad_collate_fn(batch):
    # batch is a list of sample tuples
    input_list = [s[0] for s in batch]
    target_list = [s[1] for s in batch]
    
    #pad_token = persona_dict.get_id('<pad>')
    pad_token = word_to_index['<PAD>']
    
    input_tensor = pad_list_of_tensors(input_list, pad_token)
    #target_tensor = pad_list_of_tensors(target_list, pad_token)
    target_tensor = transform_labels(target_list)
    
    return input_tensor, target_tensor


In [8]:
class LSTModel(nn.Module):
    """
    This model combines embedding, rnn and projection layer into a single model
    """
    def __init__(self, options):
        super().__init__()
        
        # create each LM part here 
        self.lookup = nn.Embedding(num_embeddings=options['num_embeddings'], embedding_dim=options['embedding_dim'], padding_idx=options['padding_idx']).from_pretrained(weights_matrix)
        self.lstm = nn.LSTM(options['input_size'], options['hidden_size'], options['num_layers'], batch_first=True)
        self.projection = nn.Linear(options['hidden_size']*options['num_layers'], options['num_labels'])
        
    def forward(self, encoded_input_sequence):
        """
        Forward method process the input from token ids to logits
        """
        embeddings = self.lookup(encoded_input_sequence)
        lstm_outputs, (hn, cn) = self.lstm(embeddings)
        logits = self.projection(hn[-1])
        
        return logits

In [9]:
OUTPUT_FILE = '/scratch/sa5154/Capstone/Models/wikitext_tokenized.p'
wiki_df =  pkl.load(open(OUTPUT_FILE, "rb"))

In [10]:
import matplotlib.pyplot as plt
temp = list(wiki_df['tokens'].str.len())
temp = sorted(temp, reverse = True)
plt.hist(temp[3000:])
plt.plot()

[]

In [11]:
categories = []
for i in list(wiki_df['mid_level_categories']):
    categories.extend(i)
categories = list(set(categories))

In [12]:
category_to_index = {categories[i]:i for i in range(0, len(categories))}
index_to_category = {v:k for k, v in category_to_index.items()}

In [13]:
wiki_df['category_tokens'] = wiki_df.apply(lambda row: convCategories(row['mid_level_categories'], category_to_index), axis=1)

In [14]:
wiki_df = wiki_df[wiki_df.astype(str)['category_tokens'] != '[]']
wiki_df = wiki_df[wiki_df.astype(str)['tokens'] != '[]']
wiki_df = wiki_df.reset_index().drop('index', axis = 1)

In [16]:
wiki_df.head()

Unnamed: 0,QID,mid_level_categories,tokens,category_tokens
0,Q2000864,[Culture.Philosophy and religion],"[affirming, the, consequent, sometimes, called...",[29]
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ...",[18]
2,Q6941060,[Geography.Europe],"[the, museum, of, work, or, arbetets, museum, ...",[25]
3,Q843920,"[History_And_Society.History and society, STEM...","[like, this, one, in, dorset, england, arable,...","[11, 31, 1]"
4,Q178999,"[STEM.Biology, STEM.Medicine]","[an, axon, from, greek, axis, or, nerve, fiber...","[31, 27]"


In [17]:
wiki_train, wiki_valid, wiki_test = train_validate_test_split(wiki_df)

In [34]:
wiki_train.head()

Unnamed: 0,QID,mid_level_categories,tokens,category_tokens
17916,Q5346784,[Culture.Language and literature],"[edwin, romanzo, elmer, one, eight, five, zero...",[6]
29737,Q4723109,[Culture.Language and literature],"[alfred, george, fysh, machin, born, one, eigh...",[6]
17976,Q1456016,"[Geography.Americas, Culture.Music]","[too, late, no, friends, is, the, first, full,...","[9, 42]"
97446,Q59149462,"[Geography.Americas, Culture.Sports, Culture.L...","[mat, as, alexis, romero, born, one, february,...","[9, 26, 6]"
91205,Q30602920,"[Culture.Plastic arts, Geography.Americas, Cul...","[the, confederate, memorial, fountain, was, hi...","[2, 9, 10, 32]"


In [35]:
y_train = list(wiki_train['category_tokens'])
y_val = list(wiki_valid['category_tokens'])

In [36]:
vocab = set([y for x in list(wiki_train['tokens']) for y in x])

In [37]:
len(vocab)

595516

In [38]:
list(vocab)[:10]

['jernigan',
 'baseley',
 'digor',
 'engyum',
 'cremated',
 'kiesenwetter',
 'futurs',
 'hrg',
 'yoshikane',
 'solm']

In [21]:
word_to_index = {"<PAD>":0, "<UNK>":1}
for word in vocab:
    if(word not in word_to_index):
        word_to_index[word]=len(word_to_index)
index_to_word = {v:k for k, v in word_to_index.items()}

In [29]:
word_to_index['the']

313866

In [22]:
wiki_tokenized_train = tokenize_dataset(wiki_train, word_to_index)

100%|██████████| 79968/79968 [00:11<00:00, 6751.62it/s]


In [23]:
wiki_tokenized_val = tokenize_dataset(wiki_valid, word_to_index)

100%|██████████| 9996/9996 [00:01<00:00, 6214.32it/s]


In [24]:
wiki_tokenized_datasets = {}
wiki_tokenized_datasets['train'] = wiki_tokenized_train
wiki_tokenized_datasets['val'] = wiki_tokenized_val

In [25]:
wiki_tensor_dataset = {}
wiki_tensor_dataset['train'] = TensoredDataset(wiki_tokenized_datasets['train'], y_train)
wiki_tensor_dataset['val'] = TensoredDataset(wiki_tokenized_datasets['val'], y_val)

In [26]:
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    wiki_loaders[split] = DataLoader(wiki_dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate_fn)

In [28]:
wiki_tensor_dataset["train"].__getitem__(200)

(tensor([308304, 124513, 574170, 124513, 574170, 329431, 340330, 508141, 387060,
         350387, 508141, 304212, 560307, 393424, 212730,  16159, 504072, 340330,
         508141, 387060, 484092, 340330, 592563,  54685, 137456, 366071, 356016,
         491039, 398472, 218360,  78935, 332545, 332545, 163618, 451357,  72203,
          13879, 522573,  28430,  13879, 522573, 135067, 440158, 504072, 582354,
          78935, 289860,  92848, 440158, 504072, 359854, 472292, 222932, 100658,
          10256, 340330, 508141, 387060, 428151, 283942, 308986, 338559, 592563,
         571636, 352148, 117077, 544272, 375595, 352148,  35905,  78935, 332545,
         309683, 410350, 100548,  78935, 332545, 332545, 430674, 314298, 476864,
          78935, 332545, 332545, 309683, 311456, 313866, 413527, 504072,  85371,
         484784, 276742, 570801, 160988, 561733, 103355, 561733, 103355, 571636,
         248296,  83177, 355755, 313866, 124513, 387060, 560430, 162017,  92848,
         163618, 163618, 163

In [33]:
for i, (inp, target) in enumerate(wiki_loaders['train']):
    print(target[0])
    break

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])


In [31]:
def Validate(model):
    valid_loss_cache = 0
    all_targets = []
    all_logits = []
    model.eval()
    with torch.no_grad():
        for i, (inp, target) in enumerate(wiki_loaders['val']):
            inp = inp.to(current_device)
            target = target.to(current_device)
            logits = model(inp)
            loss = criterion(logits, target)
            m = nn.Sigmoid()
            logits = m(logits)
            logits = logits.cpu().detach().numpy()
            target = target.cpu().numpy()
            all_targets.append(target)
            all_logits.append(logits)
            valid_loss_cache += loss.item()

        avg_val_loss = valid_loss_cache / (i+1)
        all_logits = np.concatenate(all_logits, axis=0)
        all_targets = np.concatenate(all_targets, axis=0)
        all_logits[all_logits > 0.5] = 1
        all_logits[all_logits <= 0.5] = 0
        prec_macro, rec_macro, f_score_macro, _ = precision_recall_fscore_support(all_targets, all_logits, average = 'macro')
        print('Validation macro prec: {}, rec:{}, f_score:{}'.format(prec_macro, rec_macro, f_score_macro))
        prec_micro, rec_micro, f_score_micro, _ = precision_recall_fscore_support(all_targets, all_logits, average = 'micro')
        print('Validation micro prec: {}, rec:{}, f_score:{}'.format(prec_micro, rec_micro, f_score_micro))
        print('Validation loss = {:.{prec}f}'.format(avg_val_loss, prec=4))
        return f_score_micro

# Pre Trained Embeddings

In [32]:
#Loading pre trained fastText word embeddings
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    data = {}
    for line in tqdm(fin):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data

fasttext_emb = load_vectors("wiki.en.align.vec")

2519371it [03:11, 13148.12it/s]


In [33]:
#Creating the weight matrix for pretrained word embeddings
vocab_size = len(index_to_word)
embed_dim = len(fasttext_emb["apple"])
weights_matrix = np.zeros((vocab_size,embed_dim))

words_found = 0
for i, word in enumerate(word_to_index):
    try: 
        weights_matrix[i] = fasttext_emb[word]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(embed_dim))
weights_matrix = torch.FloatTensor(weights_matrix)

In [34]:
print("Total words in vocab: {}".format(len(vocab)))
print("No. of words from vocab found in fastText: {}".format(words_found))

Total words in vocab: 595516
No. of words from vocab found in fastText: 470498


In [35]:
options = {
    'num_embeddings': len(word_to_index),
    'embedding_dim': weights_matrix.size(1),
    'num_labels':44,
    'padding_idx': word_to_index['<PAD>'],
    'input_size': weights_matrix.size(1),
    'hidden_size': 64,
    'num_layers': 1,
}

model = LSTModel(options).to(current_device)

criterion = nn.BCEWithLogitsLoss()

model_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(model_parameters, lr=0.001)


In [36]:
best_f_score_micro = -1
count = 0
PATH = '/scratch/sa5154/Capstone/Models/LSTM_Pretrained_max_len_2500_epoch_30.pth'
for epoch_number in range(30):
    print("Running Epoch:{}".format(epoch_number + 1))
    avg_loss = -1
    # do train
    model.train()

    train_loss_cache = 0

    for i, (inp, target) in enumerate(wiki_loaders['train']):
        model.train()
        optimizer.zero_grad()
        inp = inp.to(current_device)
        target = target.to(current_device)
        logits = model(inp)
        loss = criterion(logits, target)
        train_loss_cache += loss.item()
       
        loss.backward()
        optimizer.step()

        if i % 500 == 0:
            avg_loss = train_loss_cache/(i+1)
            print('Step {} avg train loss = {:.{prec}f}'.format(i, avg_loss, prec=4))

            #do valid
            f_score_micro = Validate(model)
            if( f_score_micro > best_f_score_micro):
                best_f_score_micro = f_score_micro
                torch.save({
                        'state_dict': model.state_dict()
                        }, PATH)
                print("Model saved!")

Running Epoch:1
Step 0 avg train loss = 0.7002


  'precision', 'predicted', average, warn_for)


Validation macro prec: 0.027626928699400463, rec:0.3706603290096035, f_score:0.026910420322028925
Validation micro prec: 0.029248965418206027, rec:0.2618477181090399, f_score:0.05262014502539412
Validation loss = 0.6723
Model saved!
Step 500 avg train loss = 0.1471
Validation macro prec: 0.0, rec:0.0, f_score:0.0
Validation micro prec: 0.0, rec:0.0, f_score:0.0
Validation loss = 0.1239


  'precision', 'predicted', average, warn_for)


Step 1000 avg train loss = 0.1352
Validation macro prec: 0.0, rec:0.0, f_score:0.0
Validation micro prec: 0.0, rec:0.0, f_score:0.0
Validation loss = 0.1242
Step 1500 avg train loss = 0.1313
Validation macro prec: 0.0, rec:0.0, f_score:0.0
Validation micro prec: 0.0, rec:0.0, f_score:0.0
Validation loss = 0.1236
Step 2000 avg train loss = 0.1294
Validation macro prec: 0.0, rec:0.0, f_score:0.0
Validation micro prec: 0.0, rec:0.0, f_score:0.0
Validation loss = 0.1236
Running Epoch:2
Step 0 avg train loss = 0.1153
Validation macro prec: 0.0, rec:0.0, f_score:0.0
Validation micro prec: 0.0, rec:0.0, f_score:0.0
Validation loss = 0.1238
Step 1000 avg train loss = 0.1234
Validation macro prec: 0.0, rec:0.0, f_score:0.0
Validation micro prec: 0.0, rec:0.0, f_score:0.0
Validation loss = 0.1237
Step 1500 avg train loss = 0.1236
Validation macro prec: 0.0, rec:0.0, f_score:0.0
Validation micro prec: 0.0, rec:0.0, f_score:0.0
Validation loss = 0.1236
Step 2000 avg train loss = 0.1237
Validation 

Validation macro prec: 0.17144045975396827, rec:0.10567596841662362, f_score:0.11496184389732246
Validation micro prec: 0.8153742543775255, rec:0.49523753871325893, f_score:0.6162067837277784
Validation loss = 0.0750
Model saved!
Step 1000 avg train loss = 0.0740
Validation macro prec: 0.1953721343214102, rec:0.13103932852512268, f_score:0.14841895783451964
Validation micro prec: 0.87011997714721, rec:0.533980015193128, f_score:0.6618142313959804
Validation loss = 0.0713
Model saved!
Step 1500 avg train loss = 0.0725
Validation macro prec: 0.19294479365622363, rec:0.13987970516577866, f_score:0.15152354587978092
Validation micro prec: 0.8541413439833304, rec:0.5748845906620698, f_score:0.6872271314309664
Validation loss = 0.0676
Model saved!
Step 2000 avg train loss = 0.0706
Validation macro prec: 0.2172415249019404, rec:0.14925543004052355, f_score:0.16458656326998702
Validation micro prec: 0.862605868765506, rec:0.5892011920761994, f_score:0.7001597111311714
Validation loss = 0.0654


Step 1500 avg train loss = 0.0475
Validation macro prec: 0.4426069036545713, rec:0.29425522013146815, f_score:0.3232495387741116
Validation micro prec: 0.8444814891991387, rec:0.7104540407877052, f_score:0.7716915264995239
Validation loss = 0.0516
Step 2000 avg train loss = 0.0473
Validation macro prec: 0.44115713046496996, rec:0.2952634172307953, f_score:0.33653262355808805
Validation micro prec: 0.8601255554771814, rec:0.7125577046689651, f_score:0.7794183445190156
Validation loss = 0.0495
Model saved!
Running Epoch:16
Step 0 avg train loss = 0.0435
Validation macro prec: 0.44899657091081635, rec:0.3044363922389818, f_score:0.3427105482311174
Validation micro prec: 0.8599986069513129, rec:0.7214982761643195, f_score:0.784683825865904
Validation loss = 0.0491
Model saved!
Step 500 avg train loss = 0.0457
Validation macro prec: 0.4506434071014079, rec:0.2982720277512669, f_score:0.34128001527843127
Validation micro prec: 0.8675870949281065, rec:0.708700987553322, f_score:0.780136369484

Model saved!
Step 500 avg train loss = 0.0377
Validation macro prec: 0.5644680594315163, rec:0.39400646337794315, f_score:0.4447994833460628
Validation micro prec: 0.867056294504904, rec:0.7542219365394729, f_score:0.8067127097721805
Validation loss = 0.0427
Step 1000 avg train loss = 0.0375
Validation macro prec: 0.5646446943600728, rec:0.4090663556431569, f_score:0.45214168833439977
Validation micro prec: 0.8617056305564702, rec:0.7646233857301467, f_score:0.8102668895906868
Validation loss = 0.0419
Model saved!
Step 1500 avg train loss = 0.0376
Validation macro prec: 0.5538356959226743, rec:0.4156811103028932, f_score:0.4573838159283184
Validation micro prec: 0.8531748595051999, rec:0.7718109039911178, f_score:0.8104559121310669
Validation loss = 0.0421
Model saved!
Step 2000 avg train loss = 0.0377
Validation macro prec: 0.5431646131225718, rec:0.4360601446565911, f_score:0.4703593918988252
Validation micro prec: 0.846510152284264, rec:0.7795827733302167, f_score:0.8116691509749643

Step 2000 avg train loss = 0.0331
Validation macro prec: 0.5769259390964637, rec:0.4236885969504728, f_score:0.47128534202793904
Validation micro prec: 0.8440660272721341, rec:0.7559749897738561, f_score:0.7975955610357582
Validation loss = 0.0447
Running Epoch:29
Step 0 avg train loss = 0.0274
Validation macro prec: 0.6030449698728951, rec:0.4635004218776681, f_score:0.5009627684580469
Validation micro prec: 0.8273084847349691, rec:0.7743236136270671, f_score:0.7999396317536976
Validation loss = 0.0451
Step 500 avg train loss = 0.0318
Validation macro prec: 0.582720797582668, rec:0.4550480776210985, f_score:0.49502341439956987
Validation micro prec: 0.8426451612903226, rec:0.7632209431426401, f_score:0.8009689387667495
Validation loss = 0.0438
Step 1000 avg train loss = 0.0322
Validation macro prec: 0.6084819455146997, rec:0.428399319156077, f_score:0.4762574063133124
Validation micro prec: 0.8424412167003191, rec:0.7557996844504178, f_score:0.7967720076387605
Validation loss = 0.0447

In [24]:
PATH = '/scratch/sa5154/NLP/Models/LSTM_Baseline.pth'
model = LSTModel(options).to(current_device)
model.load_state_dict(torch.load(PATH)['state_dict'])
model.eval()

LSTModel(
  (lookup): Embedding(606453, 128, padding_idx=0)
  (lstm): LSTM(128, 64, batch_first=True)
  (projection): Linear(in_features=64, out_features=44, bias=True)
)

In [133]:
all_targets = []
all_logits = []
for i, (inp, target) in enumerate(wiki_loaders['train']):
    inp = inp.to(current_device)
    logits = model(inp)
    m = nn.Sigmoid()
    logits = m(logits)
    logits = logits.cpu().detach().numpy()
    target = target.detach().numpy()
    all_targets.append(target)
    all_logits.append(logits)

In [134]:
all_logits = np.concatenate(all_logits, axis=0)
all_targets = np.concatenate(all_targets, axis=0)

In [135]:
all_logits[all_logits > 0.5] = 1
all_logits[all_logits <= 0.5] = 0

In [129]:
from sklearn.metrics import precision_recall_fscore_support
prec, rec, f_score, _ = precision_recall_fscore_support(all_targets, all_logits, average = 'micro')