In [1]:
import spacy
import numpy as np
import re
import itertools
from collections import Counter
import json
import pandas as pd
import torch
import dill as pickle
import time
from data_module.data_preprocessor import preprocess_question, get_label
from collections import Counter
from pandas_ml import ConfusionMatrix

## Load Test Dataset, Models, and Preprocessing Module

In [2]:
# Load the test dataset from CSV; later cells read its `text` and `label` columns.
df_test = pd.read_csv("ir_test_dataset.csv")

In [3]:
# Load the four serialized ensemble members, keeping the model1..model4 order
# (index 0 is the model later used for the single-model run).
models = [
    torch.load(checkpoint_path)
    for checkpoint_path in (
        'CNN_single_and_ensemble_learning_related/model1.model',
        'CNN_single_and_ensemble_learning_related/model2.model',
        'CNN_single_and_ensemble_learning_related/model3.model',
        'CNN_single_and_ensemble_learning_related/model4.model',
    )
]

In [4]:
# Restore the text and label vocabularies that are passed to the prediction helpers.
# NOTE(review): unpickling can execute arbitrary code; only load vocab files from a trusted source.
# `with` closes each file handle as soon as the object is loaded (the bare open() calls leaked them).
with open("CNN_single_and_ensemble_learning_related/text_vocab.pkl", "rb") as vocab_file:
    text_field = pickle.load(vocab_file)
with open("CNN_single_and_ensemble_learning_related/label_vocab.pkl", "rb") as vocab_file:
    label_field = pickle.load(vocab_file)

## Create functions for evaluation

In [5]:
def model_prediction(model, text_field, label_field, test_data, use_gpu=True):
    """Predict a label for every text in ``test_data`` with a single model.

    Args:
        model: Callable model. It is switched to evaluation mode once and then
            invoked on each preprocessed text.
        text_field: Text vocabulary object forwarded to ``preprocess_question``.
        label_field: Label vocabulary object forwarded to ``get_label``.
        test_data: Iterable of raw texts.
        use_gpu: Forwarded to ``preprocess_question``. Defaults to ``True``,
            the value that was previously hard-coded.

    Returns:
        tuple: ``(labels, avg_time)`` where ``labels`` is the list of values
        returned by ``get_label`` (in input order) and ``avg_time`` is the mean
        wall-clock seconds per text spent on preprocessing, the forward pass
        and label decoding. ``avg_time`` is NaN when ``test_data`` is empty.
    """
    res = []
    time_predictions = []
    # Evaluation mode does not change between texts, so set it once up front.
    model.eval()
    # Inference only: disabling autograd avoids building a graph per sample,
    # which would otherwise hold on to memory for every forward pass.
    with torch.no_grad():
        for raw_text in test_data:
            start_time = time.perf_counter()
            text = preprocess_question(raw_text, text_field, use_gpu=use_gpu)
            y = model(text)
            label_string = get_label(y, label_field)
            # NOTE(review): on GPU the timing is only accurate if get_label
            # forces the result back to the host — TODO confirm.
            time_predictions.append(time.perf_counter() - start_time)
            res.append(label_string)
            # Release the per-sample input before the next iteration (not timed).
            del text
            if use_gpu:
                torch.cuda.empty_cache()
    if not time_predictions:
        # Nothing was predicted; report NaN explicitly instead of letting
        # np.average emit a RuntimeWarning on an empty list.
        return res, float("nan")
    # return prediction result and avg time to predict a comment
    return res, np.average(time_predictions)

In [6]:
def most_voted(res_column_stack):
    """Return the most frequent label of each row, converted to ``int``.

    Args:
        res_column_stack: Iterable of rows; each row holds the labels that the
            individual models predicted for one sample.

    Returns:
        list[int]: One winning label per row. Ties are resolved however
        ``Counter.most_common`` orders equal counts.
    """
    def _winner(votes):
        # most_common(1) -> [(label, count)]; keep only the label.
        top_label = Counter(votes).most_common(1)[0][0]
        return int(top_label)

    return [_winner(votes) for votes in res_column_stack]

In [7]:
def ensemble_model_prediction(models, text_field, label_field, test_data):
    """Predict with every model and combine the results by majority vote.

    Args:
        models: Sequence of models, each passed to ``model_prediction``.
        text_field: Text vocabulary object forwarded to ``model_prediction``.
        label_field: Label vocabulary object forwarded to ``model_prediction``.
        test_data: Iterable of raw texts, evaluated once per model.

    Returns:
        tuple: ``(most_voted_res, avg_time_each_model)`` — the voted label for
        each text, and the average per-text prediction time of each model in
        the same order as ``models``.
    """
    per_model_outputs = [
        model_prediction(member, text_field, label_field, test_data)
        for member in models
    ]
    predictions = [labels for labels, _ in per_model_outputs]
    avg_time_each_model = [elapsed for _, elapsed in per_model_outputs]

    # One column per model, so each row collects all votes for one text.
    votes_per_text = np.column_stack(predictions)

    # return prediction result and avg time of each model to predict a comment
    return most_voted(votes_per_text), avg_time_each_model

## Run single model

In [8]:
# Baseline: run only the first loaded model over every test text; yields the
# predicted labels and the mean per-text prediction time.
single_res, avg_time = model_prediction(models[0], text_field, label_field, df_test.text.values)

  logloss = F.log_softmax(logit) # log of softmax


In [None]:
single_res # single-model predictions on the test dataset, one label per text

## Run ensemble model

In [9]:
# Run every loaded model on the test texts: `ensemble_res` holds the majority-vote
# label per text, `avg_time_each_model` the mean per-text time of each model.
ensemble_res, avg_time_each_model = ensemble_model_prediction(models, text_field, label_field, df_test.text.values)

  logloss = F.log_softmax(logit) # log of softmax


In [None]:
ensemble_res # ensemble (majority-vote) predictions on the test dataset, one int label per text

In [10]:
# Compare the test labels with the ensemble predictions.
# NOTE(review): assumes pandas_ml's ConfusionMatrix takes (actual, predicted) in that order — confirm.
confusion_matrix = ConfusionMatrix(df_test.label, ensemble_res)

In [11]:
confusion_matrix.print_stats() # label key: 1 = Positive, 0 = Neutral, -1 = Negative

Confusion Matrix:

Predicted  -1   0    1  __all__
Actual                         
-1         25   4   16       45
0           7  46   21       74
1           2  12  867      881
__all__    34  62  904     1000


Overall Statistics:

Accuracy: 0.938
95% CI: (0.9212200503389666, 0.9521381518866409)
No Information Rate: ToDo
P-Value [Acc > NIR]: 7.163076138174108e-05
Kappa: 0.686009176635031
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        -1          0          1
Population                                   1000       1000       1000
P: Condition positive                          45         74        881
N: Condition negative                         955        926        119
Test outcome positive                          34         62        904
Test outcome negative                         966        938         96
TP: True Positive                              25         46        867
TN: True Negative                             946    

## Calculate average prediction time

In [12]:
# Keep model 1's mean per-comment time from the test-set run before
# `avg_time_each_model` is reassigned by the train-set run further down.
avg_time_model_1 = avg_time_each_model[0]

In [13]:
# Load the training dataset from CSV; only its `text` column is used below, as extra input for timing.
df_train = pd.read_csv("ir_train_dataset.csv")

In [None]:
# Re-run the ensemble over the training texts to collect a second set of timings.
# NOTE(review): this rebinds `ensemble_res` and `avg_time_each_model`, replacing the
# test-set values; re-running the confusion-matrix cell after this one would pair
# test labels with train predictions.
ensemble_res, avg_time_each_model = ensemble_model_prediction(models, text_field, label_field, df_train.text.values)

  logloss = F.log_softmax(logit) # log of softmax


In [None]:
# Pool model 1's test-set and train-set timings into a single per-comment mean.
# Each partial mean is weighted by the number of comments it was measured on;
# a plain (a + b) / 2 is only the true per-comment average when both datasets
# have the same size.
# NOTE: this cell updates `avg_time_model_1` in place, so run it exactly once
# after the train-set ensemble run.
n_test, n_train = len(df_test), len(df_train)
avg_time_model_1 = (avg_time_model_1 * n_test + avg_time_each_model[0] * n_train) / (n_test + n_train)

In [None]:
avg_time_model_1 # average time (seconds, wall-clock) for model 1 to predict one comment

In [None]:
1/avg_time_model_1 # throughput of model 1: comments (records) predicted per second