# For Colab only

In [1]:
# !pip install scikit-learn-extra



In [2]:
# !pip install -q pyyaml h5py

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# %cd /content/drive/Othercomputers/My Laptop/Documents/UCA DSAI/Internship 2/Code/text-models/scripts_and_notebooks/

/content/drive/Othercomputers/My Laptop/Documents/UCA DSAI/Internship 2/Code/text-models/scripts_and_notebooks


# Evaluating the results of protorynet models

In [2]:
import pickle
import pandas as pd
import numpy as np
import re
import sys
import time
import myfunctions
sys.path.append('../src/protoryNet/')
from protoryNet import ProtoryNet

[nltk_data] Downloading package punkt to
[nltk_data]     /nfshome/students/cm007951/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Go to the project's main directory so the relative 'datasets/' and 'results/' paths used below resolve
%cd ..

/nfshome/students/cm007951/text-models


# Load data

To see how these datasets were created, see the `protorynet_prototype_initializations.ipynb` notebook.

In [4]:
# Loading the train and test splits (pickled objects stored inside `directory`)

directory =  'datasets/cornell_corpus/cornell_prepro_characters_70train_20val_10test/'

# Each file is opened in a context manager so its handle is closed as soon as
# the object has been read (a bare pickle.load(open(...)) leaves it open).
# NOTE(review): pickle can execute arbitrary code on load; only load files you produced yourself.
with open(directory + 'x_train', 'rb') as split_file:
    x_train = pickle.load(split_file)
with open(directory + 'x_test', 'rb') as split_file:
    x_test = pickle.load(split_file)
with open(directory + 'y_train', 'rb') as split_file:
    y_train = pickle.load(split_file)
with open(directory + 'y_test', 'rb') as split_file:
    y_test = pickle.load(split_file)

# Functions

In [5]:
def score_trajectory(protorynet_model, list_of_sentences):
    '''
    Score every sentence on its own with a protorynet model.

    Each sentence is wrapped in a one-element list, handed to the model's
    ``predict`` method, and the first element of what ``predict`` returns is kept.

    Inputs:
    protorynet_model: object exposing a ``predict`` method (a protorynet model)
    list_of_sentences: iterable of sentences (usually a list of prototypes)
    Returns:
    a list with one prediction per sentence, in the same order as the input
    '''
    return [protorynet_model.predict([sentence])[0] for sentence in list_of_sentences]

In [8]:
def eval_protorynet_after_training(x_train, y_train, x_test, y_test, model_directory, model_name, results_path): 
    '''
    Evaluate a trained protorynet model and save the evaluation results.

    The function rebuilds the model, loads its weights, evaluates it on the train and
    test sets, extracts the final prototypes together with their prediction scores,
    and pickles everything to ``results_path + 'eval' + model_name + '.pickle'``.

    Inputs:
        x_train: list of texts that belong to the train set. Each element of the list can contain many sentences.
        y_train: binary target variables of the train set
        x_test: list of texts that belong to the test set. Each element of the list can contain many sentences.
        y_test: binary target variables of the test set
        model_directory: directory where the model was saved
        model_name: name of the model. Without '.h5' at the end
        results_path: directory to save the results
    Returns:
        dict with the keys 'accuracy_train', 'accuracy_test', 'predictions_on_test',
        'final_prototypes' and 'final_prototypes_pred_scores'.
    Raises:
        ValueError: if the number of prototypes can be obtained neither from the
            training pickle nor from ``model_name``.
    '''

    # --------------------------
    # Data preprocessing
    # --------------------------

    # Guarantee target variable is integer
    y_train = [int(y) for y in y_train]
    y_test = [int(y) for y in y_test]

    # Split text into lists of sentences
    x_train = myfunctions.split_sentences(x_train)
    x_test = myfunctions.split_sentences(x_test)

    # Flatten the train texts into one list of sentences, dropping very short or
    # very long sentences since they behave as outliers.
    train_sentences = [sentence
                       for text in x_train
                       for sentence in text
                       if 5 < len(sentence) < 100]

    # --------------------------
    # Load model and its results
    # --------------------------

    model_path = model_directory + model_name

    # Number of prototypes
    try:
        # Preferred source: the information pickled during training.
        # NOTE(review): pickle can execute arbitrary code on load; only use trusted files.
        with open(model_path + '.pickle', 'rb') as train_info_file:
            train_info = pickle.load(train_info_file)
        number_prototypes = train_info['args'].number_prototypes
    except Exception as load_error:
        # Best-effort fallback: read it from the model's name ('...__10prototypes__...').
        # 'Exception' (not a bare except) so Ctrl-C / SystemExit still propagate.
        # At least one digit is required; otherwise a bare 'prototypes' substring
        # would match and int('') would fail with an obscure message.
        match = re.search(r'([0-9]+)prototypes', model_name)
        if match is None:
            raise ValueError(
                "Could not determine the number of prototypes for model '" + model_name + "'"
            ) from load_error
        number_prototypes = int(match.group(1))

    # Load model
    pNet_saved = ProtoryNet()
    # The zeros array is only a placeholder passed to createModel; the real weights are
    # loaded right after. NOTE(review): 512 is presumably the sentence-embedding size — confirm.
    model = pNet_saved.createModel(np.zeros((number_prototypes, 512)), number_prototypes)
    model.load_weights(model_path + '.h5')

    # Sentence embedding using the finetune embedder in the model
    start = time.time()
    train_sentences_embedded = pNet_saved.embed(train_sentences)
    print('Embedding time (minutes):', (time.time() - start) / 60)

    # --------------------------
    # Evaluate
    # --------------------------

    # Evaluate the model on training and testing data
    start = time.time()
    preds_train, accuracy_train = pNet_saved.evaluate(x_train, y_train)
    preds_test, accuracy_test = pNet_saved.evaluate(x_test, y_test)
    print('Evaluation time (minutes):', (time.time() - start) / 60)

    # --------------------------
    # Final prototypes
    # --------------------------
    start = time.time()

    # Final_prototypes
    final_prototypes = pNet_saved.showPrototypes(train_sentences, train_sentences_embedded, number_prototypes, printOutput=False, return_prototypes = True)

    # Prediction score of prototypes
    final_prototypes_pred_scores = score_trajectory(pNet_saved, list(final_prototypes.values()))

    print('Prototypes time (minutes):', (time.time() - start) / 60)

    # --------------------------
    # Save and return results
    # --------------------------

    results = {'accuracy_train': accuracy_train,
              'accuracy_test': accuracy_test,
              'predictions_on_test': preds_test,
              'final_prototypes': final_prototypes,
              'final_prototypes_pred_scores': final_prototypes_pred_scores
              }

    # The context manager guarantees the results file is flushed and closed.
    with open(results_path + 'eval' + model_name + '.pickle', 'wb') as results_file:
        pickle.dump(results, results_file)
    return results

In [36]:
def eval_examples(list_of_examples_text, x_train, model_directory, model_name, results_path):
    '''
    Compute protorynet predictions and matched prototypes for specific example texts.

    The function rebuilds the model, loads its weights and, for every example text,
    stores the model prediction, the prototypes returned by ``showTrajectory`` and the
    prediction score of each of those prototypes. The results are pickled to
    ``results_path + 'eval_ex' + model_name + '.pickle'``.

    Inputs:
        list_of_examples_text: a list of strings. Each string can contain several sentences.
        x_train: list of texts that belong to the train set. Each element of the list can contain many sentences.
        model_directory: directory where the model was saved
        model_name: name of the model. Without '.h5' at the end
        results_path: directory to save the results
    Returns:
        dict with the keys 'predictions', 'prototypes_matching' and
        'prototypes_matching_pred_scores' (one entry per example text).
    Raises:
        ValueError: if the number of prototypes can be obtained neither from the
            training pickle nor from ``model_name``.
    '''

    # --------------------------
    # Data preprocessing
    # --------------------------

    # Split text into lists of sentences
    x_train = myfunctions.split_sentences(x_train)
    list_of_examples_text = myfunctions.split_sentences(list_of_examples_text)

    # Flatten the train texts into one list of sentences, dropping very short or
    # very long sentences since they behave as outliers.
    train_sentences = [sentence
                       for text in x_train
                       for sentence in text
                       if 5 < len(sentence) < 100]

    # --------------------------
    # Load model and its results
    # --------------------------

    model_path = model_directory + model_name

    # Number of prototypes
    try:
        # Preferred source: the information pickled during training.
        # NOTE(review): pickle can execute arbitrary code on load; only use trusted files.
        with open(model_path + '.pickle', 'rb') as train_info_file:
            train_info = pickle.load(train_info_file)
        number_prototypes = train_info['args'].number_prototypes
    except Exception as load_error:
        # Best-effort fallback: read it from the model's name ('...__10prototypes__...').
        # 'Exception' (not a bare except) so Ctrl-C / SystemExit still propagate.
        # At least one digit is required; otherwise a bare 'prototypes' substring
        # would match and int('') would fail with an obscure message.
        match = re.search(r'([0-9]+)prototypes', model_name)
        if match is None:
            raise ValueError(
                "Could not determine the number of prototypes for model '" + model_name + "'"
            ) from load_error
        number_prototypes = int(match.group(1))

    # Load model
    pNet_saved = ProtoryNet()
    # The zeros array is only a placeholder passed to createModel; the real weights are
    # loaded right after. NOTE(review): 512 is presumably the sentence-embedding size — confirm.
    model = pNet_saved.createModel(np.zeros((number_prototypes, 512)), number_prototypes)
    model.load_weights(model_path + '.h5')

    start = time.time()
    # Sentence embedding using the finetune embedder in the model
    train_sentences_embedded = pNet_saved.embed(train_sentences)
    print('Embedding time (minutes):', (time.time() - start) / 60)

    # --------------------------
    # Final prototypes
    # --------------------------

    start = time.time()

    # NOTE(review): neither value below ends up in the returned results. They are kept
    # because it is not visible from here whether showPrototypes has side effects that
    # showTrajectory relies on — confirm before removing this (slow) step.
    final_prototypes = pNet_saved.showPrototypes(train_sentences, train_sentences_embedded, number_prototypes, printOutput=False, return_prototypes = True)

    # Prediction score of prototypes
    final_prototypes_pred_scores = score_trajectory(pNet_saved, list(final_prototypes.values()))

    print('Prototypes time (minutes):', (time.time() - start) / 60)

    # --------------------------
    # Predictions and prototypes for specific examples
    # --------------------------
    start = time.time()

    preds = []
    prototypes_matching = []
    prototypes_matching_pred_scores = []

    # Each example is a list of sentences (output of split_sentences above).
    for example in list_of_examples_text:
        preds.append(pNet_saved.predict(example)[0])
        matched_prototypes = pNet_saved.showTrajectory(example, train_sentences, train_sentences_embedded, number_prototypes)
        prototypes_matching.append(matched_prototypes)
        prototypes_matching_pred_scores.append(score_trajectory(pNet_saved, matched_prototypes))

    print('Preds and prototypes for sentences time (minutes):', (time.time() - start) / 60)

    # --------------------------
    # Save and return results
    # --------------------------

    results = {'predictions': preds,
               'prototypes_matching': prototypes_matching,
               'prototypes_matching_pred_scores': prototypes_matching_pred_scores}

    # The context manager guarantees the results file is flushed and closed.
    with open(results_path + 'eval_ex' + model_name + '.pickle', 'wb') as results_file:
        pickle.dump(results, results_file)
    return results

# Exact results for our models

In [17]:
# Names of the trained models to evaluate (file stems, without extension).
# They are grouped by number of prototypes; inside a group the only difference
# is the seed used to initialise the prototypes (16, 81, 128).
models = [
    # 10 prototypes
    'cornell_prepro_characters_70train_20val_10test__20epochs__10prototypes__randomtype_init__30000sample_size_sentences__16init_prototypes_seed',
    'cornell_prepro_characters_70train_20val_10test__20epochs__10prototypes__randomtype_init__30000sample_size_sentences__81init_prototypes_seed',
    'cornell_prepro_characters_70train_20val_10test__20epochs__10prototypes__randomtype_init__30000sample_size_sentences__128init_prototypes_seed',
    # 30 prototypes
    'cornell_prepro_characters_70train_20val_10test__20epochs__30prototypes__randomtype_init__30000sample_size_sentences__16init_prototypes_seed',
    'cornell_prepro_characters_70train_20val_10test__20epochs__30prototypes__randomtype_init__30000sample_size_sentences__81init_prototypes_seed',
    'cornell_prepro_characters_70train_20val_10test__20epochs__30prototypes__randomtype_init__30000sample_size_sentences__128init_prototypes_seed',
    # 50 prototypes
    'cornell_prepro_characters_70train_20val_10test__20epochs__50prototypes__randomtype_init__30000sample_size_sentences__16init_prototypes_seed',
    'cornell_prepro_characters_70train_20val_10test__20epochs__50prototypes__randomtype_init__30000sample_size_sentences__81init_prototypes_seed',
    'cornell_prepro_characters_70train_20val_10test__20epochs__50prototypes__randomtype_init__30000sample_size_sentences__128init_prototypes_seed',
]

# Short, readable label for each model, in the same order as `models`:
# '<number of prototypes> prototypes model <1..3>'.
models_short_name = [
    f'{prototype_count} prototypes model {model_rank}'
    for prototype_count in (10, 30, 50)
    for model_rank in (1, 2, 3)
]


In [10]:
# Evaluate every model after training.
# For each one this computes the train and test accuracy, the final prototypes
# and their prediction scores, and pickles them next to the model files.
for model_name in models:
    results = eval_protorynet_after_training(
        x_train=x_train,
        y_train=y_train,
        x_test=x_test,
        y_test=y_test,
        model_directory='results/protorynet_models/',
        model_name=model_name,
        results_path='results/protorynet_models/',
    )

[db] model.input =  KerasTensor(type_spec=TensorSpec(shape=(None,), dtype=tf.string, name='input_4'), name='input_4', description="created by layer 'input_4'")
[db] protoLayerName =  proto_layer
[db] protoLayer =  <protoryNet.ProtoryNet.createModel.<locals>.prototypeLayer object at 0x7fbbe896e850>
[db] protoLayer.output =  (<KerasTensor: shape=(1, None, 10) dtype=float32 (created by layer 'proto_layer')>, <KerasTensor: shape=(10, 512) dtype=float32 (created by layer 'proto_layer')>)
[db] distanceLayer.output =  KerasTensor(type_spec=TensorSpec(shape=(1, None, 10), dtype=tf.float32, name=None), name='distance_layer/PartitionedCall:0', description="created by layer 'distance_layer'")
Model: "custom_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None,)]                 0         
                                                                 
 keras_layer_3 (Keras

 lstm_5 (LSTM)               [(1, None, 128),          91648     
                              (1, 128),                          
                              (1, 128)]                          
                                                                 
 tf.__operators__.getitem_5   (1, 128)                 0         
 (SlicingOpLambda)                                               
                                                                 
 dense_5 (Dense)             (1, 1)                    129       
                                                                 
 tf.compat.v1.squeeze_5 (TFO  (1,)                     0         
 pLambda)                                                        
                                                                 
 model_20 (Functional)       ((1, None, 50),           256823424 
                              (50, 512))                         
                                                                 
 model_21 

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None,)]                 0         
                                                                 
 keras_layer_8 (KerasLayer)  (None, 512)               256797824 
                                                                 
 tf.expand_dims_8 (TFOpLambd  (1, None, 512)           0         
 a)                                                              
                                                                 
 proto_layer (prototypeLayer  ((1, None, 50),          25600     
 )                            (50, 512))                         
                                                                 
 distance_layer (distanceLay  (1, None, 50)            0         
 er)                                                             
                                                                 
 lstm_8 (L

In [22]:
results_path = 'results/protorynet_models/'

# Load results for each model

# zip() stops silently at the shorter list, so make sure both lists still line up.
assert len(models) == len(models_short_name), 'models and models_short_name must have the same length'

predictions_on_test = dict()
test_accuracy = []
for model_name, model_short in zip(models, models_short_name):

    # Files are opened in context managers so every handle is closed after reading.
    # NOTE(review): train_results is not used in this cell; it is still loaded in case
    # a later cell relies on it — drop the load if nothing does.
    with open(results_path + model_name + '.pickle', 'rb') as train_file:
        train_results = pickle.load(train_file)
    with open(results_path + 'eval' + model_name + '.pickle', 'rb') as eval_file:
        evaluation_results = pickle.load(eval_file)

    predictions_on_test[model_short] = evaluation_results['predictions_on_test']
    test_accuracy.append(evaluation_results['accuracy_test'])


# Add the labels and the original texts next to the per-model predictions
predictions_on_test['ground truth'] = y_test
predictions_on_test['text_with_punctuation'] = x_test

pd.DataFrame(predictions_on_test).to_csv(results_path + 'predictions_on_test.csv')

# Last expression of the cell: displayed as the test-accuracy summary table
pd.DataFrame({'model':models_short_name, 'test_accuracy':test_accuracy})

Unnamed: 0,model,test_accuracy
0,10 prototypes model 1,0.651452
1,10 prototypes model 2,0.701245
2,10 prototypes model 3,0.688797
3,30 prototypes model 1,0.556017
4,30 prototypes model 2,0.680498
5,30 prototypes model 3,0.692946
6,50 prototypes model 1,0.481328
7,50 prototypes model 2,0.53112
8,50 prototypes model 3,0.547718


In [29]:
# Index labels (in x_test) of the dialogues picked for a closer look
chosen_dialogues_idx = [2361, 1152, 1652, 73, 329, 499, 394, 1788]
# Explicit label-based selection of those dialogues
chosen_dialogues = x_test.loc[chosen_dialogues_idx]
chosen_dialogues

2361    So isn't it time you unwrapped your present? W...
1152    You know, all I ever wanted was to measure up ...
1652    Keep painting. Promise me. Just sit here, I'll...
73      Mrs. Strode, Michael Myers is here to kill his...
329     "There's no shame in getting a little therapy"...
499     There are many things that can exert control o...
394     Thanks George I have did it sound that bad? Ri...
1788    Why is it taking so long? If it isn't a baby w...
Name: text_with_punctuation, dtype: object

In [37]:
# Predictions and matched prototypes of every model for the chosen dialogues
for model_name in models:
    results = eval_examples(
        list_of_examples_text=chosen_dialogues,
        x_train=x_train,
        model_directory='results/protorynet_models/',
        model_name=model_name,
        results_path='results/protorynet_models/',
    )

[db] model.input =  KerasTensor(type_spec=TensorSpec(shape=(None,), dtype=tf.string, name='input_15'), name='input_15', description="created by layer 'input_15'")
[db] protoLayerName =  proto_layer
[db] protoLayer =  <protoryNet.ProtoryNet.createModel.<locals>.prototypeLayer object at 0x7fbc32a86e90>
[db] protoLayer.output =  (<KerasTensor: shape=(1, None, 10) dtype=float32 (created by layer 'proto_layer')>, <KerasTensor: shape=(10, 512) dtype=float32 (created by layer 'proto_layer')>)
[db] distanceLayer.output =  KerasTensor(type_spec=TensorSpec(shape=(1, None, 10), dtype=tf.float32, name=None), name='distance_layer/PartitionedCall:0', description="created by layer 'distance_layer'")
Model: "custom_model_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None,)]                 0         
                                                                 
 keras_layer_13 (

                                                                 
 lstm_15 (LSTM)              [(1, None, 128),          71168     
                              (1, 128),                          
                              (1, 128)]                          
                                                                 
 tf.__operators__.getitem_15  (1, 128)                 0         
  (SlicingOpLambda)                                              
                                                                 
 dense_15 (Dense)            (1, 1)                    129       
                                                                 
 tf.compat.v1.squeeze_15 (TF  (1,)                     0         
 OpLambda)                                                       
                                                                 
 model_60 (Functional)       ((1, None, 10),           256802944 
                              (10, 512))                         
          

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_20 (InputLayer)       [(None,)]                 0         
                                                                 
 keras_layer_18 (KerasLayer)  (None, 512)              256797824 
                                                                 
 tf.expand_dims_18 (TFOpLamb  (1, None, 512)           0         
 da)                                                             
                                                                 
 proto_layer (prototypeLayer  ((1, None, 30),          15360     
 )                            (30, 512))                         
                                                                 
 distance_layer (distanceLay  (1, None, 30)            0         
 er)                                                             
                                                                 
 lstm_18 (

 OpLambda)                                                       
                                                                 
 model_80 (Functional)       ((1, None, 50),           256823424 
                              (50, 512))                         
                                                                 
 model_81 (Functional)       (1, None, 50)             256823424 
                                                                 
Total params: 256,915,201
Trainable params: 256,915,201
Non-trainable params: 0
_________________________________________________________________
Embedding time (minutes): 0.6971542080243428
Prototypes time (minutes): 2.360746204853058
Preds and prototypes for sentences time (minutes): 0.9126195987065633
[db] model.input =  KerasTensor(type_spec=TensorSpec(shape=(None,), dtype=tf.string, name='input_23'), name='input_23', description="created by layer 'input_23'")
[db] protoLayerName =  proto_layer
[db] protoLayer =  <protoryNet.Pro