In [48]:
# import dependencies
import io
import nltk
import json
import gzip
import torch
import string
import random
import operator
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.metrics import precision_recall_fscore_support, multilabel_confusion_matrix
random.seed(1)

In [2]:
# Select the compute device: CUDA when at least one GPU is visible, otherwise CPU.
num_gpus = torch.cuda.device_count()
current_device = 'cuda' if num_gpus > 0 else 'cpu'

# Tensor-type aliases bound to whichever device was selected above.
if current_device == 'cuda':
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor

In [3]:
def convCategories(categories, category_to_index):
    """Translate category names into their integer ids, preserving input order.

    Raises KeyError when a name is missing from ``category_to_index``.
    """
    indices = []
    for name in categories:
        indices.append(category_to_index[name])
    return indices

In [4]:
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=1):
    """Randomly partition the rows of ``df`` into train / validation / test frames.

    Args:
        df: DataFrame to split; any index is accepted (rows are selected by position).
        train_percent: fraction of rows assigned to the training split.
        validate_percent: fraction of rows assigned to the validation split.
        seed: value used to seed NumPy's global random generator.

    Returns:
        (train, validate, test) DataFrames; the test split receives every row not
        taken by the first two.

    Raises:
        ValueError: if a fraction is negative or the two fractions sum to more than 1.
    """
    if train_percent < 0 or validate_percent < 0:
        raise ValueError("train_percent and validate_percent must be non-negative")
    if train_percent + validate_percent > 1 + 1e-9:
        raise ValueError("train_percent + validate_percent must not exceed 1")

    # The global generator is seeded (rather than a private one) so that code running
    # after this call keeps seeing the same random stream as before this rewrite.
    np.random.seed(seed)
    m = len(df.index)
    # Permute row *positions*, not index labels: the result is fed to .iloc, which is
    # positional. For a default RangeIndex this yields the same order as permuting labels.
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [5]:
def tokenize_dataset(dataset, word_to_index):
    """Encode every token list in ``dataset['tokens']`` as a list of vocabulary ids.

    Tokens absent from ``word_to_index`` are replaced by the id stored under '<UNK>'.
    Returns one list of ids per entry of ``dataset['tokens']``, in the same order.
    """
    encoded_rows = []
    for token_list in tqdm(dataset['tokens']):
        row_ids = []
        for token in token_list:
            key = token if token in word_to_index else '<UNK>'
            row_ids.append(word_to_index[key])
        encoded_rows.append(row_ids)
    return encoded_rows

In [6]:
class TensoredDataset(Dataset):
    """Dataset of (token ids, label ids, per-token IDF weights) tensor triples.

    Args:
        list_of_lists_of_tokens: one list of integer token ids per sample.
        labels: one list of integer label ids per sample.
        IDF_list: one list of per-token weights per sample.
        idf_dtype: dtype used to store the weights. The default, ``torch.long``,
            keeps the historical behaviour, which truncates fractional weights
            toward zero. Pass ``torch.float`` to keep the exact values; the collate
            function must then pad the weights with a matching dtype.

    Raises:
        ValueError: if the three inputs do not contain the same number of samples.
    """
    def __init__(self, list_of_lists_of_tokens, labels, IDF_list, idf_dtype=torch.long):
        num_samples = len(list_of_lists_of_tokens)
        if len(labels) != num_samples or len(IDF_list) != num_samples:
            raise ValueError(
                "tokens, labels and IDF_list must have the same length, got {}, {} and {}".format(
                    num_samples, len(labels), len(IDF_list)))
        self.input_tensors = []
        self.target_tensors = []
        self.IDF_tensors = []
        for i in range(0, num_samples):
            self.input_tensors.append(torch.tensor(list_of_lists_of_tokens[i], dtype=torch.long))
            self.target_tensors.append(torch.tensor(labels[i], dtype=torch.long))
            self.IDF_tensors.append(torch.tensor(IDF_list[i], dtype=idf_dtype))

    def __len__(self):
        return len(self.input_tensors)

    def __getitem__(self, idx):
        # return an (input, target, idf_weights) triple
        return (self.input_tensors[idx], self.target_tensors[idx], self.IDF_tensors[idx])

In [7]:
def pad_list_of_tensors(list_of_tensors, pad_token):
    """Right-pad tensors to a common length and stack them into one 2-D tensor.

    Args:
        list_of_tensors: non-empty list of tensors; each is flattened to a single row.
        pad_token: value appended to rows shorter than the longest one.

    Returns:
        Tensor with one row per input and ``max(t.size(-1))`` columns. The padding
        takes the dtype and device of the row it extends, so float inputs stay float.

    Raises:
        ValueError: if ``list_of_tensors`` is empty (raised by ``max``).
    """
    max_length = max(t.size(-1) for t in list_of_tensors)
    padded_rows = []
    for t in list_of_tensors:
        row = t.reshape(1, -1)
        # new_full inherits dtype and device from `row`, avoiding a long/float (or
        # CPU/GPU) mismatch when concatenating.
        filler = row.new_full((1, max_length - t.size(-1)), pad_token)
        padded_rows.append(torch.cat([row, filler], dim=-1))
    return torch.cat(padded_rows, dim=0)

def transform_labels(target_list):
    """Convert per-sample label-id tensors into one stacked multi-hot matrix.

    Each output row has a 1 at every label id of the corresponding sample and 0
    elsewhere; the row width is ``len(category_to_index)`` (module-level mapping).
    """
    num_classes = len(category_to_index)
    rows = []
    for label_ids in target_list:
        index = label_ids.unsqueeze(0)
        multi_hot = torch.zeros(index.size(0), num_classes)
        multi_hot.scatter_(1, index, 1)
        rows.append(multi_hot)
    return torch.cat(rows, dim=0)

def pad_collate_fn(batch):
    """Collate a list of (tokens, labels, idf_weights) samples into batch tensors.

    Token rows are padded with the id of '<PAD>' (from the module-level
    ``word_to_index``), weight rows are padded with 0, and labels are expanded
    into a multi-hot matrix.

    Returns:
        (input_tensor, target_tensor, IDF_tensor)
    """
    inputs, targets, idf_weights = [], [], []
    for sample in batch:
        inputs.append(sample[0])
        targets.append(sample[1])
        idf_weights.append(sample[2])

    input_tensor = pad_list_of_tensors(inputs, word_to_index['<PAD>'])
    IDF_tensor = pad_list_of_tensors(idf_weights, 0)
    target_tensor = transform_labels(targets)

    return input_tensor, target_tensor, IDF_tensor


In [8]:
class LSTModel(nn.Module):
    """
    Embedding lookup + LSTM + linear projection, pooled with externally supplied
    per-token weights (e.g. IDF) instead of learned attention.

    Keys read from ``options``:
        padding_idx        : id of the padding token in the embedding table
        input_size         : LSTM input width (must equal the embedding dimension)
        hidden_size        : LSTM hidden width
        num_layers         : number of stacked LSTM layers
        num_labels         : number of output logits
        pretrained_weights : optional 2-D float tensor of embeddings; when absent the
                             module-level ``weights_matrix`` is used. Note that anything
                             stored in ``options`` ends up in checkpoints that save it.
    ``num_embeddings`` and ``embedding_dim`` are accepted for compatibility but unused:
    the table shape is taken from the pretrained weights.
    """
    def __init__(self, options):
        super().__init__()

        pretrained = options.get('pretrained_weights')
        if pretrained is None:
            pretrained = weights_matrix

        # from_pretrained is a classmethod: calling it on a freshly built nn.Embedding
        # allocated a full throw-away table and discarded padding_idx. Call it on the
        # class and pass padding_idx explicitly. Embeddings stay frozen, as before.
        self.lookup = nn.Embedding.from_pretrained(pretrained, freeze=True, padding_idx=options['padding_idx'])
        self.lstm = nn.LSTM(options['input_size'], options['hidden_size'], options['num_layers'], batch_first=True)
        # forward() pools the per-step LSTM outputs, whose width is hidden_size
        # regardless of num_layers, so the projection must not scale with num_layers.
        self.projection = nn.Linear(options['hidden_size'], options['num_labels'])

    def forward(self, encoded_input_sequence, attn):
        """
        Map token ids to logits.

        Args:
            encoded_input_sequence: (batch, seq) tensor of token ids.
            attn: (batch, seq) tensor of per-token weights; each LSTM output step is
                scaled by its weight before the steps are summed.

        Returns:
            (batch, num_labels) tensor of logits.
        """
        embeddings = self.lookup(encoded_input_sequence)
        lstm_outputs, (hn, cn) = self.lstm(embeddings)
        output_weighted = torch.sum((attn.unsqueeze(-1).float()*lstm_outputs.float()), 1)

        logits = self.projection(output_weighted)

        return logits

In [9]:
class StructuredSelfAttention(torch.nn.Module):
    """
    The class is an implementation of the paper A Structured Self-Attentive Sentence Embedding including regularization
    and without pruning. Slight modifications have been done for speedup
    """

    def __init__(self,batch_size,lstm_hid_dim,d_a,r,emb_dim=100,vocab_size=None,use_pretrained_embeddings = False,embeddings=None,n_classes = 1):
        """
        Initializes parameters suggested in paper

        Args:
            batch_size  : {int} batch_size used for training
            lstm_hid_dim: {int} hidden dimension for lstm
            d_a         : {int} hidden dimension for the dense layer
            r           : {int} attention-hops or attention heads
            emb_dim     : {int} embeddings dimension (ignored when pretrained embeddings are given)
            vocab_size  : {int} size of the vocabulary (required without pretrained embeddings)
            use_pretrained_embeddings: {bool} use or train your own embeddings
            embeddings  : {torch.FloatTensor} loaded pretrained embeddings
            n_classes   : {int} number of classes

        Returns:
            self

        Raises:
            Exception
        """
        super(StructuredSelfAttention,self).__init__()

        self.embeddings,emb_dim = self._load_embeddings(use_pretrained_embeddings,embeddings,vocab_size,emb_dim)
        self.lstm = torch.nn.LSTM(emb_dim,lstm_hid_dim,1,batch_first=True)
        self.linear_first = torch.nn.Linear(lstm_hid_dim,d_a)
        self.linear_first.bias.data.fill_(0)
        self.linear_second = torch.nn.Linear(d_a,r)
        self.linear_second.bias.data.fill_(0)
        self.n_classes = n_classes
        self.linear_final = torch.nn.Linear(lstm_hid_dim,self.n_classes)
        self.batch_size = batch_size
        self.lstm_hid_dim = lstm_hid_dim
        self.hidden_state = self.init_hidden()
        self.r = r

    def _load_embeddings(self,use_pretrained_embeddings,embeddings,vocab_size,emb_dim):
        """Build the embedding layer; returns (embedding_module, embedding_dim)."""

        if use_pretrained_embeddings is True and embeddings is None:
            raise Exception("Send a pretrained word embedding as an argument")

        if not use_pretrained_embeddings and vocab_size is None:
            raise Exception("Vocab size cannot be empty")

        if not use_pretrained_embeddings:
            word_embeddings = torch.nn.Embedding(vocab_size,emb_dim,padding_idx=0)
        else:
            # Pretrained vectors are wrapped in a Parameter, so they remain trainable.
            word_embeddings = torch.nn.Embedding(embeddings.size(0), embeddings.size(1))
            word_embeddings.weight = torch.nn.Parameter(embeddings)
            emb_dim = embeddings.size(1)

        return word_embeddings,emb_dim

    def softmax(self,input, axis=1):
        """
        Softmax applied to axis=n

        Args:
           input: {Tensor} input on which softmax is to be applied
           axis : {int} axis on which softmax is to be applied

        Returns:
            softmaxed tensors
        """
        # The previous transpose/flatten/implicit-dim dance computed exactly a softmax
        # over `axis`; passing dim explicitly avoids the deprecated implicit-dim call.
        return F.softmax(input, dim=axis)

    def init_hidden(self):
        """Return zero-filled (h0, c0) tensors of shape (1, batch_size, lstm_hid_dim)."""
        return (torch.zeros(1,self.batch_size,self.lstm_hid_dim),torch.zeros(1,self.batch_size,self.lstm_hid_dim))

    def forward(self,x):
        """
        Args:
            x: (batch, seq) tensor of token ids.

        Returns:
            output   : (batch, n_classes) logits
            attention: (batch, r, seq) attention weights, normalised over seq
        """
        embeddings = self.embeddings(x)
        # No initial state is passed, so the LSTM starts from zeros on every call;
        # the final state is kept on the module for inspection.
        outputs, self.hidden_state = self.lstm(embeddings)
        x = torch.tanh(self.linear_first(outputs))
        x = self.linear_second(x)
        x = self.softmax(x,1)
        attention = x.transpose(1,2)
        sentence_embeddings = attention@outputs
        # Average the r attention hops into a single sentence vector.
        avg_sentence_embeddings = torch.sum(sentence_embeddings,1)/self.r

        output = self.linear_final(avg_sentence_embeddings)
        return output, attention

    #Regularization
    def l2_matrix_norm(self,m):
        """
        Frobenius norm calculation

        Args:
           m: {Tensor} ||AAT - I||, shape (batch, rows, cols)

        Returns:
            sum over the batch of the per-matrix Frobenius norms, as a CPU double tensor
        """
        return torch.sum(torch.sum(torch.sum(m**2,1),1)**0.5).type(torch.DoubleTensor)

In [10]:
# Path of the pickled, pre-tokenized wiki data (read here as an input, despite the name).
OUTPUT_FILE = '/scratch/sa5154/Capstone/Models/wikitext_tokenized.p'
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(OUTPUT_FILE, "rb") as pickle_file:
    wiki_df = pkl.load(pickle_file)

In [11]:
# Distinct category names across all rows, sorted so their order is deterministic.
categories = sorted({
    category
    for row_categories in wiki_df['mid_level_categories']
    for category in row_categories
})

In [12]:
# Forward map (name -> position in the sorted category list) and its inverse.
category_to_index = {category: position for position, category in enumerate(categories)}
index_to_category = dict((position, category) for category, position in category_to_index.items())

In [13]:
# Encode each row's category names as the list of their integer ids.
wiki_df['category_tokens'] = wiki_df['mid_level_categories'].apply(
    lambda names: convCategories(names, category_to_index)
)

In [14]:
# Drop rows whose label list or token list is empty, then renumber rows from 0.
# Only the two tested columns are stringified; converting the whole frame (token
# lists included) to str just to compare one column is needlessly expensive.
has_categories = wiki_df['category_tokens'].astype(str) != '[]'
has_tokens = wiki_df['tokens'].astype(str) != '[]'
wiki_df = wiki_df[has_categories & has_tokens].reset_index(drop=True)

In [15]:
# Shuffle and split with the function's defaults: 80% train, 10% validation,
# remainder test, seeded with 1.
wiki_train, wiki_valid, wiki_test = train_validate_test_split(wiki_df)

In [16]:
# Per-document label-id lists for the training and validation splits.
y_train = wiki_train['category_tokens'].tolist()
y_val = wiki_valid['category_tokens'].tolist()

In [17]:
# Sorted list of the distinct tokens seen in the training split only.
vocab = sorted({token for doc_tokens in wiki_train['tokens'] for token in doc_tokens})

In [18]:
# One space-joined string per document, for the training and validation splits.
train_list = [' '.join(doc_tokens) for doc_tokens in wiki_train['tokens']]

val_list = [' '.join(doc_tokens) for doc_tokens in wiki_valid['tokens']]


In [19]:
# Get inverse document frequency
# First pass: for every token, collect the set of training documents (by position
# in train_list) that contain it. setdefault replaces the former bare `except:`,
# which would also have swallowed unrelated errors such as KeyboardInterrupt.
IDF = {}
for i, tokens in enumerate(tqdm(train_list)):
    for w in tokens.split(' '):
        IDF.setdefault(w, set()).add(i)

100%|██████████| 79968/79968 [00:17<00:00, 4488.24it/s]


In [20]:
# Replace each token's document set by a smoothed log weight: log(|vocab| / (1 + df)).
# NOTE(review): the numerator is the vocabulary size, not the number of training
# documents that the usual IDF definition uses - confirm this is intentional.
# This cell is not re-runnable: after it, the values are floats, not sets.
vocab_size_for_idf = len(vocab)
for token, doc_ids in IDF.items():
    IDF[token] = np.log(vocab_size_for_idf / (1 + len(doc_ids)))

In [21]:
# Per-token IDF weights for every training document; tokens without a weight get 0.
IDF_list_train = [
    [IDF.get(w, 0) for w in document.split(' ')]
    for document in tqdm(train_list)
]

100%|██████████| 79968/79968 [00:14<00:00, 5676.74it/s]


In [22]:
# Per-token IDF weights for every validation document; tokens without a weight
# (i.e. not seen in training) get 0.
IDF_list_val = [
    [IDF.get(w, 0) for w in document.split(' ')]
    for document in tqdm(val_list)
]

100%|██████████| 9996/9996 [00:01<00:00, 6383.15it/s]


In [23]:
# Number of distinct training tokens (special tokens not included).
len(vocab)

595516

In [24]:
# Ids 0 and 1 are reserved for the special tokens; every other word receives the
# next free id in vocabulary order. setdefault leaves existing entries untouched.
word_to_index = {"<PAD>": 0, "<UNK>": 1}
for word in vocab:
    word_to_index.setdefault(word, len(word_to_index))
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))

In [25]:
# Encode the training documents as lists of vocabulary ids.
wiki_tokenized_train = tokenize_dataset(wiki_train, word_to_index)

100%|██████████| 79968/79968 [00:08<00:00, 9505.45it/s] 


In [26]:
# Encode the validation documents; tokens missing from the mapping become '<UNK>'.
wiki_tokenized_val = tokenize_dataset(wiki_valid, word_to_index)

100%|██████████| 9996/9996 [00:01<00:00, 8694.59it/s]


In [27]:
# Encoded documents, keyed by split name.
wiki_tokenized_datasets = {
    'train': wiki_tokenized_train,
    'val': wiki_tokenized_val,
}

In [28]:
# Tensor datasets (token ids, label ids, IDF weights), keyed by split name.
wiki_tensor_dataset = {
    'train': TensoredDataset(wiki_tokenized_datasets['train'], y_train, IDF_list_train),
    'val': TensoredDataset(wiki_tokenized_datasets['val'], y_val, IDF_list_val),
}

In [29]:
wiki_loaders = {}

batch_size = 32

for split, wiki_dataset in wiki_tensor_dataset.items():
    # Shuffle only the training split: evaluation does not benefit from a random
    # order, and a fixed order keeps validation batches identical between runs.
    wiki_loaders[split] = DataLoader(
        wiki_dataset,
        batch_size=batch_size,
        shuffle=(split == 'train'),
        collate_fn=pad_collate_fn,
    )

In [30]:
def Validate(model, loader=None):
    """Evaluate ``model`` and print macro/micro precision, recall, F-score and loss.

    Args:
        model: module called as ``model(inp, attn)`` and returning logits.
        loader: iterable of (inp, target, attn) batches; defaults to the
            module-level ``wiki_loaders['val']``.

    Returns:
        The micro-averaged F-score at a 0.5 probability threshold.

    Raises:
        ValueError: if the loader yields no batches.

    Uses the module-level ``criterion`` and ``current_device``. The model is left in
    eval mode; callers that continue training must call ``model.train()`` again.
    """
    if loader is None:
        loader = wiki_loaders['val']

    valid_loss_cache = 0
    num_batches = 0
    all_targets = []
    all_probs = []
    model.eval()
    with torch.no_grad():
        for inp, target, attn in loader:
            inp = inp.to(current_device)
            target = target.to(current_device)
            attn = attn.to(current_device)
            logits = model(inp, attn)
            loss = criterion(logits, target)
            all_probs.append(torch.sigmoid(logits).cpu().numpy())
            all_targets.append(target.cpu().numpy())
            valid_loss_cache += loss.item()
            num_batches += 1

    # An empty loader used to surface as a NameError on the loop variable.
    if num_batches == 0:
        raise ValueError("Validation loader produced no batches")

    # Mean of the per-batch losses (a smaller final batch weighs the same as a full one).
    avg_val_loss = valid_loss_cache / num_batches
    all_probs = np.concatenate(all_probs, axis=0)
    all_targets = np.concatenate(all_targets, axis=0)
    # Binarise: probability strictly above 0.5 counts as a positive prediction.
    predictions = (all_probs > 0.5).astype(all_probs.dtype)
    prec_macro, rec_macro, f_score_macro, _ = precision_recall_fscore_support(all_targets, predictions, average = 'macro')
    print('Validation macro prec: {}, rec:{}, f_score:{}'.format(prec_macro, rec_macro, f_score_macro))
    prec_micro, rec_micro, f_score_micro, _ = precision_recall_fscore_support(all_targets, predictions, average = 'micro')
    print('Validation micro prec: {}, rec:{}, f_score:{}'.format(prec_micro, rec_micro, f_score_micro))
    print('Validation loss = {:.{prec}f}'.format(avg_val_loss, prec=4))
    return f_score_micro

# Pre-trained Embeddings

In [31]:
#Loading pre trained fastText word embeddings
def load_vectors(fname):
    """Read a text-format word-vector file into a ``{word: [float, ...]}`` dict.

    Each data line is expected to be ``word v1 v2 ... vN`` separated by single
    spaces. A leading ``<count> <dimension>`` header line, as written at the top of
    fastText ``.vec`` files, is skipped instead of being stored as a bogus
    one-dimensional "word".

    Args:
        fname: path of the vector file.

    Returns:
        dict mapping each word to its vector as a list of floats.
    """
    data = {}
    # The context manager closes the file even if parsing fails part-way through.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line_number, line in enumerate(tqdm(fin)):
            tokens = line.rstrip().split(' ')
            is_header = (
                line_number == 0
                and len(tokens) == 2
                and tokens[0].isdigit()
                and tokens[1].isdigit()
            )
            if is_header:
                continue
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

# Load the pretrained vectors into memory; the path is relative to the working directory.
fasttext_emb = load_vectors("wiki.en.align.vec")

2519371it [03:16, 12812.29it/s]


In [32]:
#Creating the weight matrix for pretrained word embeddings
vocab_size = len(index_to_word)
# The dimension is read from a known word rather than an arbitrary entry.
embed_dim = len(fasttext_emb["apple"])
weights_matrix = np.zeros((vocab_size,embed_dim))

words_found = 0
# Use the id stored in word_to_index as the row number, so a row always matches the
# id the tokenizer emits, independent of the dict's iteration order.
for word, i in word_to_index.items():
    try:
        weights_matrix[i] = fasttext_emb[word]
        words_found += 1
    except KeyError:
        # Words without a pretrained vector (including the special tokens, unless
        # the embedding file happens to contain them) get a random vector.
        weights_matrix[i] = np.random.normal(scale=0.6, size=(embed_dim))
weights_matrix = torch.FloatTensor(weights_matrix)

In [33]:
# Coverage report. The total is len(vocab), which excludes the two special tokens,
# while words_found was counted over every key of word_to_index.
print("Total words in vocab: {}".format(len(vocab)))
print("No. of words from vocab found in fastText: {}".format(words_found))

Total words in vocab: 595516
No. of words from vocab found in fastText: 470498


In [34]:
# Hyper-parameters; this dict is also stored in the checkpoint so the model can be rebuilt.
options = {
    'num_embeddings': len(word_to_index),
    'embedding_dim': weights_matrix.size(1),
    # Derived from the label map so the logits always match the width of the
    # multi-hot targets built by the collate function (previously hard-coded).
    'num_labels': len(category_to_index),
    'padding_idx': word_to_index['<PAD>'],
    'input_size': weights_matrix.size(1),
    'hidden_size': 128,
    'num_layers': 1,
}

model = LSTModel(options).to(current_device)
#model = StructuredSelfAttention(batch_size,64,64,1,emb_dim=100,vocab_size=len(vocab),use_pretrained_embeddings = True,embeddings=weights_matrix,n_classes = 44).to(current_device)


# Multi-label objective: an independent sigmoid + binary cross-entropy per class.
criterion = nn.BCEWithLogitsLoss()

# Only parameters that require gradients are handed to the optimizer.
model_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(model_parameters, lr=0.0005)


In [35]:
# Display the module structure.
model

LSTModel(
  (lookup): Embedding(595518, 300)
  (lstm): LSTM(300, 128, batch_first=True)
  (projection): Linear(in_features=128, out_features=44, bias=True)
)

In [1]:
# Train for 50 epochs; every 500 steps (including step 0 of each epoch) report the
# running loss, validate, and checkpoint the model whenever micro-F1 improves.
PATH = '/scratch/sa5154/Capstone/Models/LSTM_TFIDF.pth'
best_f_score_micro = -1
count = 0
for epoch_number in range(50):
    print("Running Epoch:{}".format(epoch_number + 1))
    avg_loss = -1
    train_loss_cache = 0
    model.train()

    for i, batch in enumerate(wiki_loaders['train']):
        # Validate() switches the model to eval mode, so re-enable training each step.
        model.train()
        optimizer.zero_grad()
        inp, target, attn = (tensor.to(current_device) for tensor in batch)

        logits = model(inp, attn)
        loss = criterion(logits, target)
        train_loss_cache += loss.item()

        loss.backward()
        optimizer.step()

        if i % 500 != 0:
            continue

        avg_loss = train_loss_cache/(i+1)
        print('Step {} avg train loss = {:.{prec}f}'.format(i, avg_loss, prec=4))

        f_score_micro = Validate(model)
        if f_score_micro <= best_f_score_micro:
            continue

        best_f_score_micro = f_score_micro
        checkpoint = {'state_dict': model.state_dict(), 'options': options}
        torch.save(checkpoint, PATH)
        print("Model saved!")

In [37]:
# Rebuild the model from the saved options and restore the best checkpoint.
PATH = '/scratch/sa5154/Capstone/Models/LSTM_TFIDF.pth'
# map_location lets a checkpoint written on a GPU be restored on a CPU-only host
# (and vice versa) instead of failing while deserialising CUDA tensors.
model_loaded = torch.load(PATH, map_location=current_device)
model = LSTModel(model_loaded['options']).to(current_device)
model.load_state_dict(model_loaded['state_dict'])
model.eval()

LSTModel(
  (lookup): Embedding(595518, 300)
  (lstm): LSTM(300, 128, batch_first=True)
  (projection): Linear(in_features=128, out_features=44, bias=True)
)

In [38]:
# Score the restored checkpoint; prints the metrics and returns micro-F1.
Validate(model)

Validation macro prec: 0.586680838325009, rec:0.5801327898279524, f_score:0.5737182960715755
Validation micro prec: 0.8146804522643449, rec:0.7873546426693158, f_score:0.8007845001782955
Validation loss = 0.0690


0.8007845001782955

In [46]:
def create_per_class_tables(loader, model, device, class_names, threshold=0.5):
    """
    Compute per-class evaluation metrics for a multi-label model.

    @param: loader - data loader yielding (data, labels, attn) batches
    @param: model - module called as model(data, attn) and returning logits
    @param: device - device the inputs are moved to before the forward pass
    @param: class_names - one name per class, in label-column order
    @param: threshold - sigmoid probability above which a class is predicted
    @return: DataFrame with one row per class: class_name, count (number of true
             positives-by-label in the data), TN, FN, TP, FP, precision, recall, f1
    """
    model.eval()
    outputs_list_nc = []
    true_list_nc = []
    with torch.no_grad():
        for data, labels, attn in loader:
            data_batch, label_batch, attn_batch = data.to(device), labels.float(), attn.to(device)
            logits = model(data_batch, attn_batch)
            outputs_bc = torch.sigmoid(logits)
            # np.float64 replaces the np.float alias, which NumPy 1.24 removed.
            outputs_bc = outputs_bc.detach().cpu().numpy().astype(np.float64)
            outputs_bc = (outputs_bc > threshold)
            outputs_list_nc.append(outputs_bc)
            true_list_nc.append(label_batch.detach().cpu().numpy().astype(np.float64))
    # to np.array
    outputs_list_nc = np.vstack(outputs_list_nc)
    true_list_nc = np.vstack(true_list_nc)

    # per class counts
    counts_c = true_list_nc.sum(axis=0)

    # per class confusion matrix; each 2x2 block is [[TN, FP], [FN, TP]], so after
    # flattening the columns are TN, FP, FN, TP
    confusion_matrix_c22 = multilabel_confusion_matrix(
        true_list_nc,
        outputs_list_nc,
    )
    confusion_matrix_c4 = confusion_matrix_c22.reshape(-1, 4)

    # per class precision, recall, f-score
    precision_c, recall_c, f1_c, _ = precision_recall_fscore_support(
        true_list_nc,
        outputs_list_nc,
        average=None
    )

    # combine all metrics in a dict
    per_class_metrics = {
        "class_name": class_names,
        "count": counts_c,
        "TN": confusion_matrix_c4[:,0],
        "FN": confusion_matrix_c4[:,2],
        "TP": confusion_matrix_c4[:,3],
        "FP": confusion_matrix_c4[:,1],
        "precision": precision_c,
        "recall": recall_c,
        "f1": f1_c
    }
    return pd.DataFrame(per_class_metrics)

In [49]:
# Per-class metrics on the validation split at a 0.5 threshold.
per_class_metrics = create_per_class_tables(wiki_loaders['val'], model, current_device, categories, threshold=0.5)

In [50]:
# Rank the classes from highest to lowest F1.
per_class_metrics.sort_values('f1', ascending=False)

Unnamed: 0,class_name,count,TN,FN,TP,FP,precision,recall,f1
7,Culture.Language and literature,3631.0,6167,175,3456,198,0.945813,0.951804,0.948799
13,Culture.Sports,1567.0,8206,87,1480,223,0.869055,0.94448,0.905199
23,Geography.Oceania,468.0,9479,57,411,49,0.893478,0.878205,0.885776
31,STEM.Biology,771.0,9181,153,618,44,0.933535,0.801556,0.862526
30,History_And_Society.Transportation,551.0,9339,69,482,106,0.819728,0.874773,0.846356
18,Geography.Asia,1399.0,8384,244,1155,213,0.844298,0.82559,0.834839
5,Culture.Games and toys,109.0,9873,22,87,14,0.861386,0.798165,0.828571
9,Culture.Music,435.0,9452,61,374,109,0.774327,0.85977,0.814815
41,STEM.Space,67.0,9908,10,57,21,0.730769,0.850746,0.786207
20,Geography.Europe,2168.0,7454,571,1597,374,0.810249,0.736624,0.771684
