In [2]:
import numpy as np
import model
import torch
from torch import nn, optim
from torch.autograd import Variable

import os,timeit
import pandas as pd
from sklearn import metrics 



In [10]:
# Directory prefix that is prepended to the test-set file names when they are loaded.
data_path = 'data/'

# Class index -> entity name. The position of each name in the list is its class index;
# the names are used to label per-class accuracies and the confusion-matrix axes.
entities_dict = dict(enumerate([
    "Rachel Green",
    "Ross Geller",
    "Chandler Bing",
    "Monica Geller",
    "Joey Tribbiani",
    "Phoebe Buffay",
    "Others",
    "None",
]))

def evaluate(mo, model_name = "", verbose = False):
    """Score a trained tagger on the saved test split.

    The test embeddings and label indexes are read from ``data_path``. Every
    sample is fed to ``mo`` on its own, the arg-max class of each token is
    compared with the ground truth, and the results are summarised.

    Parameters
    ----------
    mo : callable
        Model to evaluate. It is called with a tensor reshaped to
        ``(-1, 1, 25)`` and must return one row of class scores per token.
        If it is an ``nn.Module`` it is moved to the evaluation device and
        switched to eval mode (this mutates the module passed in).
    model_name : str
        Text appended to the "Evaluating" banner.
    verbose : bool
        When true, also print the per-entity accuracies and the two summary
        accuracies.

    Returns
    -------
    tuple
        ``(token_accuracy, avg_accuracy, accuracy_dict, confusion_mat,
        classification_rpt)`` where

        * ``token_accuracy`` is correct tokens / all tokens,
        * ``accuracy_dict`` maps entity name -> fraction of that entity's true
          tokens that were predicted correctly (confusion-matrix diagonal
          divided by the row sum); entities with no true tokens get ``0.0``,
        * ``avg_accuracy`` is the mean of those per-entity values over the
          entities that do occur in the test labels,
        * ``confusion_mat`` has one row/column per key of ``entities_dict``
          (rows: true class, columns: predicted class),
        * ``classification_rpt`` is sklearn's text report.
    """

    print("Evaluating" + model_name)

    # NOTE(review): if these files hold object arrays of variable-length
    # sequences, recent NumPy versions need allow_pickle=True here - confirm.
    # Test input embeddings
    test_input = np.load(data_path + 'test_input.npy')

    # Test labels in form indexes from entity map
    test_label_index = np.load(data_path + 'test_label_index.npy')

    # Using gpu if available else cpu
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # The inputs are moved to `device`, so the model has to live there too.
    # eval() disables training-only behaviour such as dropout.
    if isinstance(mo, nn.Module):
        mo.to(device)
        mo.eval()

    # Collected per-sample tensors; concatenated once after the loop instead
    # of re-allocating a growing tensor on every iteration.
    truth_chunks = []
    predicted_chunks = []

    start = timeit.default_timer()
    with torch.no_grad():  # no gradients are needed for evaluation
        for i in range(test_input.shape[0]):

            #input sample shape:  (3,25) -> (3,1,25)
            #3 words each of dim 25
            inp = torch.from_numpy(test_input[i].reshape((-1,1,25))).to(device)

            #truth value for input sample: tensor([7, 4, 7])
            #each value is the class index for the word
            truth = torch.from_numpy(test_label_index[i]).long()

            #predicted op shape: torch.Size([3, 8])
            out = mo(inp)

            #class with max score per token, brought back to the CPU so it can
            #be compared with `truth` and handed to sklearn
            predicted = torch.max(out,1)[1].cpu()

            assert truth.shape == predicted.shape, "**Shape Mismatch**"

            truth_chunks.append(truth)
            predicted_chunks.append(predicted)

    print("\nTotal time taken: %.4f seconds." % (timeit.default_timer() - start))

    y_correct = torch.cat(truth_chunks)
    y_predicted = torch.cat(predicted_chunks)

    # Token-level accuracy over the whole test set
    total_tokens = y_correct.numel()
    correct_tokens = torch.eq(y_correct, y_predicted).sum().item()
    token_accuracy = correct_tokens / total_tokens if total_tokens else 0.0

    y_true_np = y_correct.numpy()
    y_pred_np = y_predicted.numpy()

    # Pin the label set so that row/column k always belongs to the k-th entity,
    # even when an entity never appears in the test labels or the predictions.
    labels = sorted(entities_dict)
    confusion_mat = metrics.confusion_matrix(y_true_np, y_pred_np, labels=labels)
    classification_rpt = metrics.classification_report(y_true_np, y_pred_np)

    #calculating accuracy for each class
    accuracy_dict = {}
    supported_accuracies = []
    for row, label in enumerate(labels):
        #row `row` holds every token whose TRUE class is `label`
        total_true = confusion_mat[row].sum()
        if total_true == 0:
            # No true tokens of this class: report 0.0 and keep it out of the
            # average rather than dividing by zero.
            accuracy_dict[entities_dict[label]] = 0.0
            continue
        class_accuracy = round(confusion_mat[row][row] / total_true, 4)
        accuracy_dict[entities_dict[label]] = class_accuracy
        supported_accuracies.append(class_accuracy)

    if supported_accuracies:
        avg_accuracy = sum(supported_accuracies) / len(supported_accuracies)
    else:
        avg_accuracy = 0.0

    if verbose:
        print("\n*****Accuracy for each entity:*****")
        for k, v in accuracy_dict.items():
            print("{0:<20} {1}".format(k, v))

        print("\n{0:<40} {1:.4f}".format("Average accuracy per entity: ", avg_accuracy))
        print("{0:<40} {1:.4f}".format("Overall accuracy (considering tokens): ", token_accuracy))

    return token_accuracy, avg_accuracy, accuracy_dict, confusion_mat, classification_rpt


In [14]:

# Constructor arguments shared by every model that is instantiated for evaluation,
# in the order (input size, hidden size, number of output classes).
inp_dim, hidden_dim, n_classes = 25, 64, 8

def evaluate_models(model_type, trained_models, msg, models_dir = None):
    """Evaluate several saved checkpoints and keep the two best ones.

    Parameters
    ----------
    model_type : callable
        Model class; called as ``model_type(inp_dim, hidden_dim, n_classes)``.
    trained_models : iterable of str
        Checkpoint file names, e.g. ``31-BiLSTM_Loss_0.1387677234002888.pt``.
    msg : str
        Prefix for the banner printed by ``evaluate`` (the file name is appended).
    models_dir : str, optional
        Directory holding the checkpoints. When omitted, the notebook-level
        ``save_path`` is used, which keeps existing calls working.

    Returns
    -------
    (best_dict, best_avg_dict)
        Two dicts with the keys ``accuracy``, ``avg_accuracy``, ``model_name``,
        ``accuracy_dict`` and ``confusion_mat``: the checkpoint with the highest
        token accuracy and the one with the highest average per-entity
        accuracy. Ties keep the checkpoint that was evaluated first; if no
        checkpoint scores above zero the placeholder values are returned.
    """

    if models_dir is None:
        # Backward-compatible default: the directory configured in the notebook.
        models_dir = save_path

    def _placeholder():
        # Result returned when no checkpoint beats an accuracy of 0.
        return {"accuracy": 0,
                "avg_accuracy": 0,
                "model_name": "",
                "accuracy_dict": {},
                "confusion_mat": ""
               }

    best_dict = _placeholder()
    best_avg_dict = _placeholder()

    # t: 31-BiLSTM_Loss_0.1387677234002888.pt
    for t in trained_models:
        mo = model_type(inp_dim, hidden_dim, n_classes)

        # map_location lets checkpoints that were saved from a GPU load on a
        # CPU-only machine; load_state_dict copies into the model's own device.
        state = torch.load(os.path.join(models_dir, t), map_location="cpu")
        mo.load_state_dict(state)

        token_accuracy, avg_accuracy, accuracy_dict, confusion_mat, classification_rpt = evaluate(mo, msg + t)

        result = {"accuracy": token_accuracy,
                  "avg_accuracy": avg_accuracy,
                  "model_name": t,
                  "accuracy_dict": accuracy_dict,
                  "confusion_mat": confusion_mat
                 }

        if token_accuracy > best_dict["accuracy"]:
            best_dict = dict(result)

        if avg_accuracy > best_avg_dict["avg_accuracy"]:
            best_avg_dict = dict(result)

        print("-"*50, "\n")

    return best_dict, best_avg_dict


In [122]:
#Evaluate Bidirectional LSTM models

# Directory that holds the saved checkpoints
save_path = 'models/'

# os.listdir returns entries in arbitrary order; sorting makes the evaluation
# order (and therefore which checkpoint wins a tie) reproducible between runs.
trained_models = sorted(t for t in os.listdir(save_path) if "BiLSTM" in t)

best_dict, best_avg_dict = evaluate_models(model.BiLSTM, trained_models, " Bidirectional LSTM model: ")


Evaluating Bidirectional LSTM model: 1-BiLSTM_Loss_0.5018732744513754.pt

Total time taken: 61.1940 seconds.

*****Accuracy for each entity:*****
Rachel Green         0.2607
Ross Geller          0.2599
Chandler Bing        0.0
Monica Geller        0.0
Joey Tribbiani       0.0065
Phoebe Buffay        0.1262
Others               0.5672
None                 0.9931

Average accuracy per entity:             0.2767
Overall accuracy (considering tokens):   0.8762
-------------------------------------------------- 

Evaluating Bidirectional LSTM model: 11-BiLSTM_Loss_0.1801687417329021.pt

Total time taken: 59.9882 seconds.

*****Accuracy for each entity:*****
Rachel Green         0.687
Ross Geller          0.5873
Chandler Bing        0.5958
Monica Geller        0.6367
Joey Tribbiani       0.633
Phoebe Buffay        0.6485
Others               0.7955
None                 0.9902

Average accuracy per entity:             0.6967
Overall accuracy (considering tokens):   0.9402
--------------------

In [138]:
# Report the BiLSTM checkpoint with the highest average per-entity accuracy,
# then display its confusion matrix with entity names on both axes.
best_bilstm = best_avg_dict
entity_names = entities_dict.values()

print("***Best Average Accuracy Model BiLSTM:", best_bilstm["model_name"], "\n")

avg_line = "{0:<40} {1:.4f}".format("Average accuracy per entity: ", best_bilstm["avg_accuracy"])
token_line = "{0:<40} {1:.4f}\n".format("Overall accuracy (considering tokens): ", best_bilstm["accuracy"])
print(avg_line)
print(token_line)

pd.DataFrame(best_bilstm["confusion_mat"], index = entity_names, columns = entity_names)


***Best Average Accuracy Model BiLSTM: 41-BiLSTM_Loss_0.1258366622119277.pt 

Average accuracy per entity:             0.7141
Overall accuracy (considering tokens):   0.9423



Unnamed: 0,Rachel Green,Ross Geller,Chandler Bing,Monica Geller,Joey Tribbiani,Phoebe Buffay,Others,None
Rachel Green,747,29,14,10,21,16,198,16
Ross Geller,52,882,37,24,86,27,368,36
Chandler Bing,46,29,629,18,15,14,184,20
Monica Geller,45,24,28,581,27,9,151,13
Joey Tribbiani,55,51,19,7,590,20,160,19
Phoebe Buffay,41,26,20,17,14,540,136,14
Others,162,100,63,27,101,51,3433,356
None,15,17,6,13,12,15,430,50111


In [15]:
# Re-evaluate one specific BiLSTM checkpoint and print sklearn's per-class report.
save_path = 'models/'

mo = model.BiLSTM(inp_dim, hidden_dim, n_classes)
# map_location allows a checkpoint saved from a GPU to be loaded on a CPU-only machine.
mo.load_state_dict(torch.load(save_path + "41-BiLSTM_Loss_0.1258366622119277.pt", map_location="cpu"))
token_accuracy, avg_accuracy, accuracy_dict, confusion_mat, classification_rpt = evaluate(mo, " Bi-LSTM model: ")

print(classification_rpt)

Evaluating Bi-LSTM model: 

Total time taken: 62.8041 seconds.
             precision    recall  f1-score   support

          0       0.64      0.71      0.67      1051
          1       0.76      0.58      0.66      1512
          2       0.77      0.66      0.71       955
          3       0.83      0.66      0.74       878
          4       0.68      0.64      0.66       921
          5       0.78      0.67      0.72       808
          6       0.68      0.80      0.73      4293
          7       0.99      0.99      0.99     50619

avg / total       0.94      0.94      0.94     61037



In [132]:
#Evaluating Normal LSTM models

# os.listdir returns entries in arbitrary order; sorting makes the evaluation
# order (and therefore which checkpoint wins a tie) reproducible between runs.
trained_models = sorted(t for t in os.listdir(save_path) if "SimpleLSTM" in t)

best_dict_S, best_avg_dict_S = evaluate_models(model.SimpleLSTM, trained_models, " Normal LSTM model: ")


Evaluating Normal LSTM model: 1000-SimpleLSTM_FinalLoss_1.018389134275678e-05.pt

Total time taken: 32.7453 seconds.

*****Accuracy for each entity:*****
Rachel Green         0.4215
Ross Geller          0.2937
Chandler Bing        0.4335
Monica Geller        0.0911
Joey Tribbiani       0.0456
Phoebe Buffay        0.0347
Others               0.181
None                 0.9786

Average accuracy per entity:             0.3100
Overall accuracy (considering tokens):   0.8481
-------------------------------------------------- 

Evaluating Normal LSTM model: SimpleLSTM_FinalLoss_0.10881423137443184.pt

Total time taken: 30.8198 seconds.

*****Accuracy for each entity:*****
Rachel Green         0.7298
Ross Geller          0.75
Chandler Bing        0.7183
Monica Geller        0.705
Joey Tribbiani       0.7481
Phoebe Buffay        0.7116
Others               0.8185
None                 0.9942

Average accuracy per entity:             0.7719
Overall accuracy (considering tokens):   0.9553
--------

In [137]:
# Report the plain-LSTM checkpoint with the highest average per-entity accuracy,
# then display its confusion matrix with entity names on both axes.
best_lstm = best_avg_dict_S
axis_names = entities_dict.values()

print("***Best Average Accuracy Model LSTM:", best_lstm["model_name"], "\n")

avg_line_S = "{0:<40} {1:.4f}".format("Average accuracy per entity: ", best_lstm["avg_accuracy"])
token_line_S = "{0:<40} {1:.4f}\n".format("Overall accuracy (considering tokens): ", best_lstm["accuracy"])
print(avg_line_S)
print(token_line_S)

pd.DataFrame(best_lstm["confusion_mat"], index = axis_names, columns = axis_names)


***Best Average Accuracy Model LSTM: SimpleLSTM_FinalLoss_0.10881423137443184.pt 

Average accuracy per entity:             0.7719
Overall accuracy (considering tokens):   0.9553



Unnamed: 0,Rachel Green,Ross Geller,Chandler Bing,Monica Geller,Joey Tribbiani,Phoebe Buffay,Others,None
Rachel Green,767,71,8,22,39,10,114,20
Ross Geller,23,1134,28,20,91,15,178,23
Chandler Bing,25,102,686,16,25,10,79,12
Monica Geller,40,70,12,619,33,6,86,12
Joey Tribbiani,33,76,11,7,689,12,83,10
Phoebe Buffay,23,77,18,12,29,575,67,7
Others,98,231,32,24,115,38,3514,241
None,9,18,15,8,5,3,235,50326


In [17]:
# Re-evaluate one specific plain-LSTM checkpoint and print sklearn's per-class report.
save_path = 'models/'

mo = model.SimpleLSTM(inp_dim, hidden_dim, n_classes)
# map_location allows a checkpoint saved from a GPU to be loaded on a CPU-only machine.
mo.load_state_dict(torch.load(save_path + "SimpleLSTM_FinalLoss_0.10881423137443184.pt", map_location="cpu"))
token_accuracy, avg_accuracy, accuracy_dict, confusion_mat, classification_rpt = evaluate(mo, " LSTM model: ")

print(classification_rpt)

Evaluating LSTM model: 

Total time taken: 37.9662 seconds.
             precision    recall  f1-score   support

          0       0.75      0.73      0.74      1051
          1       0.64      0.75      0.69      1512
          2       0.85      0.72      0.78       955
          3       0.85      0.71      0.77       878
          4       0.67      0.75      0.71       921
          5       0.86      0.71      0.78       808
          6       0.81      0.82      0.81      4293
          7       0.99      0.99      0.99     50619

avg / total       0.96      0.96      0.96     61037

