# Importing packages

In [1]:
import time
import argparse
import tensorflow as tf
import tensorflow_hub as hub
from sklearn_extra.cluster import KMedoids
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import pickle
import os
import sys
import myfunctions
import nltk
nltk.download('punkt')
sys.path.append('../src/protoryNet/')
from protoryNet import ProtoryNet

[nltk_data] Downloading package punkt to
[nltk_data]     /nfshome/students/cm007951/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [1]:
%cd ..

/nfshome/students/cm007951/protorynet


# Import datasets (Cornell corpus, following the original hotel example)

In [27]:
cornell_prepro_characters = pd.read_csv('datasets/cornell_corpus/cornell_prepro_characters.csv')
cornell_prepro_characters

Unnamed: 0.1,Unnamed: 0,characterID,movieID,character_name,gender,movie_title,movie_year,text_with_punctuation,text
0,0,u0,m0,BIANCA,F,10 things i hate about you,1999,They do not! I hope so. Let's go. Okay you're ...,They do not I hope so Lets go Okay youre gonna...
1,1,u100,m6,AMY,F,8mm,1999,She died in her sleep three days ago. It was i...,She died in her sleep three days ago It was in...
2,2,u1001,m65,PETE,M,from dusk till dawn,1996,Six-fifty. Knock yourself out. That's all that...,Sixfifty Knock yourself out Thats all thats be...
3,3,u1007,m66,BLONDELL,F,g.i. jane,1997,Wow Uh don't see it. There's no signature. But...,Wow Uh dont see it Theres no signature But han...
4,4,u1008,m66,C.O.,M,g.i. jane,1997,"Of course, but there's more Uh, V.I.P. securit...",Of course but theres more Uh VIP security arra...
...,...,...,...,...,...,...,...,...,...
2399,2399,u983,m64,ALICE,F,friday the 13th,2009,Maybe we should wait for Mr. Christy. The kill...,Maybe we should wait for Mr Christy The killer...
2400,2400,u985,m64,BILL,M,friday the 13th,2009,It's over twenty miles to the crossroads. Stev...,Its over twenty miles to the crossroads Stevel...
2401,2401,u989,m64,MARCIE,F,friday the 13th,2009,Gotta pee. You're lying on my bladder. Like wa...,Gotta pee Youre lying on my bladder Like waves...
2402,2402,u993,m64,STEVE,M,friday the 13th,2009,I've got to go to town and pick up the trailer...,Ive got to go to town and pick up the trailer ...


In [28]:
myfunctions.protorynet_dataset_format(directory = 'datasets/cornell_corpus/cornell_prepro_characters/', 
                          df = cornell_prepro_characters, 
                          text_variable = 'text_with_punctuation', 
                          label_variable = 'gender', 
                          reference_label = 'F', 
                          return_sets = False, 
                          test_size=0.20)

In [29]:
# Directory holding the pickled train/test splits (same path that was passed to
# myfunctions.protorynet_dataset_format above).
dir = "datasets/cornell_corpus/cornell_prepro_characters/"

# Read each pickled split once, then bind the notebook-level names.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
_splits = {}
for _split_name in ('x_train', 'y_train', 'x_test', 'y_test'):
    with open(dir + _split_name, 'rb') as fp:
        _splits[_split_name] = pickle.load(fp)

train_not_clean = _splits['x_train']
y_train = _splits['y_train']
test_not_clean = _splits['x_test']
y_test = _splits['y_test']

# Data preprocessing

In [30]:
def gen_sents(para):
    """Split every paragraph in ``para`` into sentences.

    Args:
        para: iterable of paragraph strings.

    Returns:
        A list with one entry per paragraph, in input order; each entry is
        the value returned by ``nltk.tokenize.sent_tokenize`` for that
        paragraph.
    """
    return [nltk.tokenize.sent_tokenize(paragraph) for paragraph in para]

train_noclean_sents = gen_sents(train_not_clean)
test_noclean_sents = gen_sents(test_not_clean)

In [31]:
# The sentence-split paragraphs are used as the model inputs
x_train, x_test = train_noclean_sents, test_noclean_sents

# Optional: coerce every label to int
y_train = list(map(int, y_train))
y_test = list(map(int, y_test))

In [32]:
#import Google Sentence encoder, to convert sentences into vector values
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model_sentence_encoder = hub.load(module_url)
print("module %s loaded" % module_url)

def embed(input):
    """Return the sentence-encoder output for ``input``.

    Thin wrapper: forwards ``input`` unchanged to the module stored in
    ``model_sentence_encoder`` and returns whatever that call returns.
    """
    embeddings = model_sentence_encoder(input)
    return embeddings

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [33]:
# Flatten the per-paragraph sentence lists into a single list of sentences
sample_sentences = [sentence for paragraph in train_noclean_sents for sentence in paragraph]

# Keep only sentences whose character length is strictly between 5 and 100.
# Very long sentences behave as outliers, so when using k-medoids they
# become their own cluster.
sample_sentences = [sentence for sentence in sample_sentences if 5 < len(sentence) < 100]

# Keep at most the first 30000 sentences to initialize the prototypes.
# This is done because k-medoids runs into memory issues when handling too many samples.
sample_sentences = sample_sentences[:30000]

# Compute the embeddings of the selected sentences
sample_sentences_embedded = embed(sample_sentences)

In [34]:
number_prototypes = 10

# Prototype initialization with k-medoids

In [51]:
# k_protos, vect_size = 10, 512 # 512 because the sentences are transformed into vectors of size 512
# Fit k-medoids with `number_prototypes` clusters (random_state fixed to 0) on the
# embedded sample sentences; the resulting cluster centers are later passed to
# pNet.createModel as the initial prototypes.
# NOTE(review): the captured output of this cell contains a fragment of a warning
# ("its corresponding cluster ({k})."); this may be related to the repeated initial
# prototypes noted further down — TODO confirm.
kmedoids = KMedoids(n_clusters = number_prototypes, random_state=0).fit(sample_sentences_embedded)
k_cents = kmedoids.cluster_centers_

  "its corresponding cluster ({k}).".format(k=k)


# Model training

In [52]:
pNet = ProtoryNet() 

In [53]:
model = pNet.createModel(k_cents)

[db] model.input =  KerasTensor(type_spec=TensorSpec(shape=(None,), dtype=tf.string, name='input_6'), name='input_6', description="created by layer 'input_6'")
[db] protoLayerName =  proto_layer
[db] protoLayer =  <protoryNet.ProtoryNet.createModel.<locals>.prototypeLayer object at 0x7f0345299f10>
[db] protoLayer.output =  (<KerasTensor: shape=(1, None, 10) dtype=float32 (created by layer 'proto_layer')>, <KerasTensor: shape=(10, 512) dtype=float32 (created by layer 'proto_layer')>)
[db] distanceLayer.output =  KerasTensor(type_spec=TensorSpec(shape=(1, None, 10), dtype=tf.float32, name=None), name='distance_layer/PartitionedCall:0', description="created by layer 'distance_layer'")
Model: "custom_model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None,)]                 0         
                                                                 
 keras_layer_5 (Keras

Note that the initial prototypes contain repetitions. 
Later in training the prototypes "don't move": they get stuck, which might be due to the poor initialization. 

In [55]:
# Retrieve the sample sentence reported for each initial (k-medoids) prototype;
# with return_prototypes=True the result is displayed below as a dict.
# Use `number_prototypes` (defined before the k-medoids cell): `nprototypes` is only
# assigned later, in the "Model testing" section, so referencing it here raises a
# NameError when the notebook is run top to bottom on a fresh kernel.
prototypes = pNet.showPrototypes(sample_sentences, sample_sentences_embedded, number_prototypes, printOutput=False, return_prototypes = True)
prototypes

{0: "I'll never live as a thief!",
 1: "I don't understand the allure of dehydrated food.",
 2: "I don't understand the allure of dehydrated food.",
 3: 'I believe there was a conspiracy, but not the government.',
 4: 'I believe there was a conspiracy, but not the government.',
 5: 'I believe there was a conspiracy, but not the government.',
 6: 'I believe there was a conspiracy, but not the government.',
 7: "I don't understand the allure of dehydrated food.",
 8: '"Mister Crowley, what\'s inside of your head " We\'re going to Jersey?',
 9: "I don't understand the allure of dehydrated food."}

In [13]:
# Train the k-medoids-initialized model for one epoch and time it.
# `execution_time` is the elapsed wall-clock time converted from seconds to minutes.
start_time = time.time()
pNet.train(x_train,y_train,x_test,y_test, epochs=1, saveModel=True, model_name="cornell_prepro_characters_1epoch_10proto")
execution_time = (time.time() - start_time) / 60
print(execution_time)

Epoch  0
i =   0
Evaluate on valid set:  0.5384615384615384
This is the best eval res, saving the model...
saving model now = 2022-07-21 16:18:19.346849
just saved
i =   50
i =   100
i =   150
i =   200
Evaluate on valid set:  0.5384615384615384
i =   250
i =   300
i =   350
i =   400
Evaluate on valid set:  0.5384615384615384
i =   450
i =   500
i =   550
i =   600
Evaluate on valid set:  0.5384615384615384
i =   650
i =   700
i =   750
i =   800
Evaluate on valid set:  0.5384615384615384
i =   850
i =   900
i =   950
i =   1000
Evaluate on valid set:  0.4594594594594595
i =   1050
i =   1100
i =   1150
i =   1200
Evaluate on valid set:  0.5384615384615384
i =   1250
i =   1300
i =   1350
i =   1400
Evaluate on valid set:  0.5384615384615384
i =   1450
i =   1500
i =   1550
i =   1600
Evaluate on valid set:  0.5384615384615384
i =   1650
i =   1700
i =   1750
i =   1800
Evaluate on valid set:  0.5384615384615384
i =   1850
i =   1900
21.75816112756729


# Model testing (from saved model)

In [62]:
# Number of prototypes of the saved model.
# NOTE(review): this duplicates `number_prototypes` (also 10) defined earlier; keep both in sync.
nprototypes = 10
# Weights file: the model name used at training time plus the '.h5' extension
model_path = 'cornell_prepro_characters_1epoch_10proto' + '.h5'

# Rebuild the architecture with an all-zero (nprototypes, 512) prototype matrix
# (512 = size of the sentence embeddings), then load the trained weights from disk.
# NOTE(review): the zeros are presumably placeholders replaced by load_weights — TODO confirm.
pNet_saved = ProtoryNet()
model = pNet_saved.createModel(np.zeros((nprototypes, 512)), nprototypes)
model.load_weights(model_path)

[db] model.input =  KerasTensor(type_spec=TensorSpec(shape=(None,), dtype=tf.string, name='input_8'), name='input_8', description="created by layer 'input_8'")
[db] protoLayerName =  proto_layer
[db] protoLayer =  <protoryNet.ProtoryNet.createModel.<locals>.prototypeLayer object at 0x7f0230f92450>
[db] protoLayer.output =  (<KerasTensor: shape=(1, None, 10) dtype=float32 (created by layer 'proto_layer')>, <KerasTensor: shape=(10, 512) dtype=float32 (created by layer 'proto_layer')>)
[db] distanceLayer.output =  KerasTensor(type_spec=TensorSpec(shape=(1, None, 10), dtype=tf.float32, name=None), name='distance_layer/PartitionedCall:0', description="created by layer 'distance_layer'")
Model: "custom_model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None,)]                 0         
                                                                 
 keras_layer_7 (Keras

In [63]:
sample_sentences_embedded = pNet_saved.embed(sample_sentences)

The evaluation results state that everything should be classified as "male speaker"; the test accuracy is low.

In [58]:
pNet_saved.evaluate(x_test, y_test)

([0.16879003,
  0.1690114,
  0.17131598,
  0.1670248,
  0.1720708,
  0.16736056,
  0.16735892,
  0.16768745,
  0.16962436,
  0.16529681,
  0.16732323,
  0.16539885,
  0.1772302,
  0.17029823,
  0.17064475,
  0.17424919,
  0.1658156,
  0.1680691,
  0.1672418,
  0.1697042,
  0.16963784,
  0.16880204,
  0.16838636,
  0.16708624,
  0.17060466,
  0.16377603,
  0.16790749,
  0.17006014,
  0.17070696,
  0.1720554,
  0.16768876,
  0.16449663,
  0.1716536,
  0.1657432,
  0.16846131,
  0.16925485,
  0.16597886,
  0.17240399,
  0.16415972,
  0.17035015,
  0.17562735,
  0.16948648,
  0.16914037,
  0.17246786,
  0.1700544,
  0.16699113,
  0.16964304,
  0.16687196,
  0.1683403,
  0.17166764,
  0.17375025,
  0.17128104,
  0.16944048,
  0.1641847,
  0.16845511,
  0.16951805,
  0.16427477,
  0.17182611,
  0.16751496,
  0.16588496,
  0.1632746,
  0.16923879,
  0.16346031,
  0.1727642,
  0.16724423,
  0.16664238,
  0.16605346,
  0.16996521,
  0.16535571,
  0.17591824,
  0.1658052,
  0.16526477,
  0.17029

The prototypes are redundant.

In [64]:
prototypes = pNet_saved.showPrototypes(sample_sentences, sample_sentences_embedded, nprototypes, printOutput=False, return_prototypes = True)
prototypes

{0: 'Heh heh heh.',
 1: 'No, no.',
 2: 'Oh no!',
 3: 'Alright.',
 4: 'Oh no.',
 5: 'Oh yeah?',
 6: 'Oh no.',
 7: "I'm sorry.",
 8: 'Oh no!',
 9: 'Oh no.'}

In [59]:
# testS = ["I'm a women",
#          "I was a waitress for 5 years"]
testS = x_test[1]
print(testS)
pNet_saved.predict(testS)

['My loyalty was never to my country.', 'Because it is my duty.', 'Yes, Mr. President.', "That's affirmative.", 'Roger.', 'Air Force One, acknowledged.', 'tNT.', 'EMERGENCY PARACHUTE LAUNCH RAMP.', 'About goddamn time.', 'Over the Black Sea.', 'I can probably get us to Turkey or Georgia.', 'Not even close.', "Hell, we can't even make Syria or Iraq.", "We've stopped dumping but we've only got about twenty minutes of fuel left.", 'Avionics compartment!', "It's the only place.", "You better get Zedeck down there fast Unless, of course, you'd rather be a martyr than a savior.", 'Well it worked.', 'why did they do that?', 'We checked the manifest.', 'Everyone was accounted for.', 'Nine.', 'Dead.', 'Sir, this plane carries the President of the United States.']


array([0.1690114], dtype=float32)

In [65]:
trajectory = pNet_saved.showTrajectory(testS, sample_sentences, sample_sentences_embedded)
print(trajectory)

['No, no.', 'Heh heh heh.', 'Oh yeah?', 'Oh yeah?', 'Alright.', 'No, no.', "I'm sorry.", "I'm sorry.", 'Heh heh heh.', 'Heh heh heh.', 'Heh heh heh.', 'No, no.', 'No, no.', 'Heh heh heh.', "I'm sorry.", 'Heh heh heh.', 'Heh heh heh.', 'Oh yeah?', 'Heh heh heh.', 'Heh heh heh.', 'Alright.', 'Alright.', 'Alright.', 'Heh heh heh.']


In [66]:
def score_trajectory(list_of_sentences):
    '''
    Score each sentence in isolation with the saved model.

    Every element of ``list_of_sentences`` (usually a list of prototypes) is
    wrapped in a one-element list and passed to ``pNet_saved.predict``; the
    first entry of each result is kept.

    Returns a list with one prediction per input sentence, in input order.
    '''
    return [pNet_saved.predict([sentence])[0] for sentence in list_of_sentences]

In [67]:
# predictions for the prototypes
score_trajectory(prototypes.values())

[0.17499548,
 0.17575051,
 0.17195183,
 0.1776307,
 0.17195183,
 0.179942,
 0.17195183,
 0.17362915,
 0.17195183,
 0.17195183]

In [68]:
score_trajectory(trajectory)

[0.17575051,
 0.17499548,
 0.179942,
 0.179942,
 0.1776307,
 0.17575051,
 0.17362915,
 0.17362915,
 0.17499548,
 0.17499548,
 0.17499548,
 0.17575051,
 0.17575051,
 0.17499548,
 0.17362915,
 0.17499548,
 0.17499548,
 0.179942,
 0.17499548,
 0.17499548,
 0.1776307,
 0.1776307,
 0.1776307,
 0.17499548]

**Note that all prototypes on their own would be classified as "male"; that is, the prediction for each prototype would be "male".**

# Prototype initialization at random

In [18]:
# k_protos, vect_size = 10, 512 # 512 because the sentences are transformed into vectors of size 512
# Pick `number_prototypes` distinct sample sentences at random and use their
# embeddings as the initial prototypes. A seeded generator is used so that the
# selection (and therefore the initial prototypes) is the same on every run;
# the unseeded np.random.choice gave different prototypes each time.
init_prototypes_rng = np.random.default_rng(0)
random_idx = init_prototypes_rng.choice(sample_sentences_embedded.shape[0], size = number_prototypes, replace=False)
k_cents_random = np.array(sample_sentences_embedded)[random_idx, :]

# Model training

In [19]:
pNet = ProtoryNet() 

In [20]:
model = pNet.createModel(k_cents_random)

[db] model.input =  KerasTensor(type_spec=TensorSpec(shape=(None,), dtype=tf.string, name='input_5'), name='input_5', description="created by layer 'input_5'")
[db] protoLayerName =  proto_layer
[db] protoLayer =  <protoryNet.ProtoryNet.createModel.<locals>.prototypeLayer object at 0x7f3c0741d090>
[db] protoLayer.output =  (<KerasTensor: shape=(1, None, 10) dtype=float32 (created by layer 'proto_layer')>, <KerasTensor: shape=(10, 512) dtype=float32 (created by layer 'proto_layer')>)
[db] distanceLayer.output =  KerasTensor(type_spec=TensorSpec(shape=(1, None, 10), dtype=tf.float32, name=None), name='distance_layer/PartitionedCall:0', description="created by layer 'distance_layer'")
Model: "custom_model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None,)]                 0         
                                                                 
 keras_layer_4 (Keras

With random selection it is less likely for prototypes to be the same.

In [15]:
prototypes = pNet.showPrototypes(sample_sentences, sample_sentences_embedded, number_prototypes, printOutput=False, return_prototypes = True)
prototypes

{0: 'Please will you believe me.',
 1: "Let them stay 'til morning.",
 2: "So, how'd a shrink ever get to be a priest?",
 3: "You can't ask me that!",
 4: "Hi, I'm Steven.",
 5: 'You definitely do.',
 6: 'On your piano, that is the swap.',
 7: 'Books.',
 8: 'Hello, Gabriel.',
 9: 'Is this like your old convent?'}

In [16]:
# Train the randomly-initialized model for one epoch and time it.
# `execution_time` is the elapsed wall-clock time converted from seconds to minutes.
# NOTE(review): `model_name` is identical to the one used for the k-medoids-initialized
# run earlier in this notebook, so with saveModel=True both runs presumably write to
# the same weights file — TODO confirm, and use distinct names if both models are needed.
start_time = time.time()
pNet.train(x_train,y_train,x_test,y_test, epochs=1, saveModel=True, model_name="cornell_prepro_characters_1epoch_10proto")
execution_time = (time.time() - start_time) / 60
print(execution_time)

Epoch  0
i =   0
Evaluate on valid set:  0.4386694386694387
This is the best eval res, saving the model...
saving model now = 2022-07-25 14:46:35.345064
just saved
i =   50
i =   100
i =   150
i =   200
Evaluate on valid set:  0.5467775467775468
This is the best eval res, saving the model...
saving model now = 2022-07-25 14:49:18.078429
just saved
i =   250
i =   300
i =   350
i =   400
Evaluate on valid set:  0.5384615384615384
i =   450
i =   500
i =   550
i =   600
Evaluate on valid set:  0.5384615384615384
i =   650
i =   700
i =   750
i =   800
Evaluate on valid set:  0.5987525987525988
This is the best eval res, saving the model...
saving model now = 2022-07-25 14:56:15.925498
just saved
i =   850
i =   900
i =   950
i =   1000
Evaluate on valid set:  0.4594594594594595
i =   1050
i =   1100
i =   1150
i =   1200
Evaluate on valid set:  0.5467775467775468
i =   1250
i =   1300
i =   1350
i =   1400
Evaluate on valid set:  0.5384615384615384
i =   1450
i =   1500
i =   1550
i =   

In [23]:
x_train[0:2]

[['Soon then.',
  "I'll save the last dance for you.",
  "They're running me ragged.",
  'Nothing but question day and night I love it!',
  "Come to dinner and 1'1 tell you all about it.",
  "There's a Bajoran band at the officer's mess.",
  "You can' t imagine them, Jean Luc.",
  "They're kids!",
  'All with advance degrees in xenobiology and out to conquer every disease in the quadrant.',
  'That was another time.',
  'But we do have one advantage.',
  'He needs your blood to live.',
  'He might come after you first.',
  "I can't be sure but the rate of decay seems to be accelerating.",
  'As a result the temporal sequencing was never activated.',
  'Remember, he was supposed to replace you at nearly your current age.',
  'He was engineered to skip thirty years of life.',
  'But since the RNA sequencing was never activated, his cellular structure has started to break down.',
  "He's dying.",
  'The more I studied his DNA the more confusing it got.',
  'Finally I could only come to on

In [22]:
start_time = time.time()
pNet.train(x_train,y_train,x_test,y_test, epochs=1, saveModel=True, model_name="cornell_prepro_characters_1epoch_10proto", returnValidationAccuracy = True)
execution_time = (time.time() - start_time) / 60
print(execution_time)

Epoch  0
i =   0


KeyboardInterrupt: 

# Model testing (from saved model)

In [18]:
# Weights file: the model name used at training time plus the '.h5' extension
model_path = 'cornell_prepro_characters_1epoch_10proto' + '.h5'

# Rebuild the architecture with an all-zero (number_prototypes, 512) prototype matrix
# (512 = size of the sentence embeddings), then load the trained weights from disk.
# NOTE(review): the zeros are presumably placeholders replaced by load_weights — TODO confirm.
pNet_saved = ProtoryNet()
model = pNet_saved.createModel(np.zeros((number_prototypes, 512)), number_prototypes)
model.load_weights(model_path)

[db] model.input =  KerasTensor(type_spec=TensorSpec(shape=(None,), dtype=tf.string, name='input_2'), name='input_2', description="created by layer 'input_2'")
[db] protoLayerName =  proto_layer
[db] protoLayer =  <protoryNet.ProtoryNet.createModel.<locals>.prototypeLayer object at 0x7fb419327250>
[db] protoLayer.output =  (<KerasTensor: shape=(1, None, 10) dtype=float32 (created by layer 'proto_layer')>, <KerasTensor: shape=(10, 512) dtype=float32 (created by layer 'proto_layer')>)
[db] distanceLayer.output =  KerasTensor(type_spec=TensorSpec(shape=(1, None, 10), dtype=tf.float32, name=None), name='distance_layer/PartitionedCall:0', description="created by layer 'distance_layer'")
Model: "custom_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None,)]                 0         
                                                                 
 keras_layer_1 (Keras

In [19]:
sample_sentences_embedded = pNet_saved.embed(sample_sentences)

The evaluation results state that everything should be classified as "male speaker"; the test accuracy is low.

In [20]:
pNet_saved.evaluate(x_test, y_test)

([0.4835634,
  0.48410264,
  0.48057833,
  0.49563274,
  0.4869299,
  0.47352034,
  0.5124197,
  0.48087862,
  0.48483503,
  0.47263864,
  0.4904226,
  0.49059698,
  0.4920593,
  0.4829499,
  0.49741504,
  0.49083006,
  0.48096627,
  0.5149506,
  0.48934457,
  0.4848144,
  0.500455,
  0.4836769,
  0.48949644,
  0.49015084,
  0.48976552,
  0.4954207,
  0.50848293,
  0.48579854,
  0.489393,
  0.47969067,
  0.49661282,
  0.50852746,
  0.47646448,
  0.48497608,
  0.49140352,
  0.49369612,
  0.5086437,
  0.4758523,
  0.49391797,
  0.48972082,
  0.4938262,
  0.48292345,
  0.49029905,
  0.4869105,
  0.49913675,
  0.47586304,
  0.48828253,
  0.4774784,
  0.48700586,
  0.48327342,
  0.48628867,
  0.4840986,
  0.4875245,
  0.5060973,
  0.5041454,
  0.48440757,
  0.49492946,
  0.4766115,
  0.4848245,
  0.4867894,
  0.4956751,
  0.48220205,
  0.5126782,
  0.47674713,
  0.50812817,
  0.48352152,
  0.49053317,
  0.4983942,
  0.4880965,
  0.49172395,
  0.5020753,
  0.511954,
  0.50016737,
  0.4877936

Some of the prototypes changed from the originals

In [22]:
prototypes = pNet_saved.showPrototypes(sample_sentences, sample_sentences_embedded, number_prototypes, printOutput=False, return_prototypes = True)
prototypes

{0: 'Please will you believe me.',
 1: "Let them stay 'til morning.",
 2: "So, how'd a shrink ever get to be a priest?",
 3: "You can't ask me that!",
 4: "Hi, I'm Steven.",
 5: 'You definitely do.',
 6: 'On your piano, that is the swap.',
 7: 'Books.',
 8: 'Hello, Gabriel.',
 9: 'Is this like your old convent?'}

In [24]:
testS = x_test[1]
print(testS)
pNet_saved.predict(testS)

['My loyalty was never to my country.', 'Because it is my duty.', 'Yes, Mr. President.', "That's affirmative.", 'Roger.', 'Air Force One, acknowledged.', 'tNT.', 'EMERGENCY PARACHUTE LAUNCH RAMP.', 'About goddamn time.', 'Over the Black Sea.', 'I can probably get us to Turkey or Georgia.', 'Not even close.', "Hell, we can't even make Syria or Iraq.", "We've stopped dumping but we've only got about twenty minutes of fuel left.", 'Avionics compartment!', "It's the only place.", "You better get Zedeck down there fast Unless, of course, you'd rather be a martyr than a savior.", 'Well it worked.', 'why did they do that?', 'We checked the manifest.', 'Everyone was accounted for.', 'Nine.', 'Dead.', 'Sir, this plane carries the President of the United States.']


array([0.48410264], dtype=float32)

In [25]:
trajectory = pNet_saved.showTrajectory(testS, sample_sentences, sample_sentences_embedded)
trajectory

["You can't ask me that!",
 "You can't ask me that!",
 "Hi, I'm Steven.",
 'You definitely do.',
 'Hello, Gabriel.',
 'Books.',
 'Books.',
 'Books.',
 'Hello, Gabriel.',
 'Books.',
 "Let them stay 'til morning.",
 "You can't ask me that!",
 "Let them stay 'til morning.",
 "Let them stay 'til morning.",
 'Books.',
 "You can't ask me that!",
 "So, how'd a shrink ever get to be a priest?",
 'You definitely do.',
 "You can't ask me that!",
 'On your piano, that is the swap.',
 'You definitely do.',
 'Books.',
 'Books.',
 'On your piano, that is the swap.']

In [26]:
def score_trajectory(list_of_sentences):
    '''
    Return the saved model's prediction for each sentence taken on its own.

    Each element of ``list_of_sentences`` (usually a list of prototypes) is
    sent to ``pNet_saved.predict`` as a one-element list, and the first entry
    of the returned value is collected.

    The result is a list with one prediction per input sentence, in input order.
    '''
    # NOTE: same logic as the `score_trajectory` defined earlier in this notebook.
    return [pNet_saved.predict([sentence])[0] for sentence in list_of_sentences]

In [27]:
score_trajectory(prototypes.values())

[0.5000223,
 0.5024166,
 0.50159353,
 0.5137318,
 0.50853634,
 0.49461144,
 0.5110517,
 0.50164145,
 0.5008188,
 0.5145239]

In [28]:
score_trajectory(trajectory)

[0.5137318,
 0.5137318,
 0.50853634,
 0.49461144,
 0.5008188,
 0.50164145,
 0.50164145,
 0.50164145,
 0.5008188,
 0.50164145,
 0.5024166,
 0.5137318,
 0.5024166,
 0.5024166,
 0.50164145,
 0.5137318,
 0.50159353,
 0.49461144,
 0.5137318,
 0.5110517,
 0.49461144,
 0.50164145,
 0.50164145,
 0.5110517]

# Estimation of training time using leave-one-group-out

In [112]:
# Number of days to train models using leave-one-group-out
# `execution_time` is the duration in minutes of a one-epoch training run, as set by
# whichever timing cell was executed last. It is scaled by the number of epochs and
# groups, then converted minutes -> hours (/ 60) -> days (/ 24).
groups = 10
epochs = 20
execution_time * epochs * groups / 60 / 24

3.3425393065920574

# Exploring

In [3]:
# Load dataset
cornell_prepro_characters = pd.read_csv('datasets/cornell_corpus/cornell_prepro_characters.csv')

# Split data: X is the text with punctuation, y is 1 where gender == 'F' and 0 otherwise
X = cornell_prepro_characters['text_with_punctuation']
y = np.array(cornell_prepro_characters['gender'] == 'F').astype(int)

X_train, X_val, X_test, y_train, y_val, y_test = myfunctions.balanced_split_train_val_test(X, y, train_split = 0.7, val_split = 0.2, test_split = 0.1, random_seed = 32)

# Saving to pickle format
directory =  'datasets/cornell_corpus/cornell_prepro_characters_70train_20val_10test/'
# Create the output directory if it is missing; otherwise open(..., 'wb') raises
# FileNotFoundError on a fresh checkout.
os.makedirs(directory, exist_ok=True)

# One pickle file per split, named after the split
splits_to_save = {
    'x_train': X_train,
    'x_val': X_val,
    'x_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}
for file_name, split in splits_to_save.items():
    with open(directory + file_name, 'wb') as f:
        pickle.dump(split, f)

In [2]:
!python code/train_protorynet.py --dataset_path=datasets/cornell_corpus/cornell_prepro_characters_70train_20val_10test/ --results_path=results/protorynet_models/ --results_prefix=cornell_prepro_characters_70train_20val_10test --epochs=2 --number_prototypes=10 --type_init=random --sample_size_sentences=20000 --init_prototypes_seed=16

[nltk_data] Downloading package punkt to
[nltk_data]     /nfshome/students/cm007951/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Traceback (most recent call last):
  File "code/train_protorynet.py", line 52, in <module>
    sample_size_sentences = arg.sample_size_sentences
NameError: name 'arg' is not defined
