In [1]:
import gzip
import pickle
import numpy as np
import h5py
import argparse
import sys
import re
import codecs
from itertools import izip

In [2]:
# Helper functions

def convert_to_words(indices, indices_to_word):
    """Render a sequence of vocabulary indices as one space-separated string.

    Args:
        indices: iterable of keys to look up, in output order.
        indices_to_word: lookup table (anything subscriptable by those keys)
            giving the word string for each index.

    Returns:
        The looked-up words joined with single spaces; an empty string when
        `indices` is empty.

    Raises:
        Whatever `indices_to_word[...]` raises for an unknown index
        (KeyError for a plain dict).
    """
    words = []
    for ind in indices:
        words.append(indices_to_word[ind])
    return ' '.join(words)

In [3]:
directory = '../data/MovieTriple/'


def _load_pickle(filename):
    """Unpickle and return the object stored in `directory + filename`.

    The file is opened in binary mode ('rb'): pickle streams are byte data,
    and reading them through a text-mode handle can corrupt them on platforms
    that translate newlines and fails outright on Python 3.

    NOTE(review): pickle.load can execute arbitrary code embedded in the file;
    only use this on data files obtained from a trusted source.
    """
    with open(directory + filename, 'rb') as f:
        return pickle.load(f)


# Loading all the possible files into memory
train_set = _load_pickle('Training.triples.pkl')
valid_set = _load_pickle('Validation.triples.pkl')
test_set = _load_pickle('Test.triples.pkl')

# Vocabulary table and the two pre-trained embedding objects.
word_mappings = _load_pickle('Training.dict.pkl')
emb_wordvec = _load_pickle('Word2Vec_WordEmb.pkl')
emb_mt = _load_pickle('MT_WordEmb.pkl')

In [4]:
# Training.dict.pkl: dictionary with 10000 words extracted from the training
# set (Training_Shuffled_Dataset.txt). These terms represent 97.97% of the
# entire training set.

# Each entry is a tuple whose first two fields are the word and its index.
# Not entirely sure what the other two numbers represent in the word index
# table -- maybe they correspond to counts in the training set (TODO confirm).

vocab_size = len(word_mappings)
print(vocab_size)
word_mappings[:5]

10003


[('raining', 4959, 53, 48),
 ('writings', 9977, 18, 15),
 ('yellow', 2155, 175, 142),
 ('four', 341, 2299, 2081),
 ('prices', 5660, 43, 40)]

In [5]:
# Walk the vocabulary table once and build both lookup directions.
# Field 0 of every entry is the word, field 1 is its index.

indices_to_word = {}  # index -> word
word_to_indices = {}  # word -> index
for word_ex in word_mappings:
    token, index = word_ex[0], word_ex[1]
    indices_to_word[index] = token
    word_to_indices[token] = index

In [6]:
# It looks like the '</s> <s>' pair denotes a change of speaker.
# We want to break out the first two utterances and then generate the
# third one as output.
line = convert_to_words(train_set[0], indices_to_word)
print(line)

# For now we join the first two utterances and assume that the encoder will
# figure it out from the '</s> <s>' marker left between them.
# Afterwards, we can think about ways to incorporate the three utterances.
line = line.split('</s> <s>')
context = '</s> <s>'.join([line[0], line[1]])
output = line[2]

# So our input would be
print(context)
# And our output would be
print(output)

# Matrices with that format are generated next for the rest of the data.
# Everything will be padded at the end with the index 10003.

<s> you lied to me so many times -- </s> <s> reggie -- trust me once more -- please . </s> <s> can i really believe you this time , <person> ? </s>
<s> you lied to me so many times -- </s> <s> reggie -- trust me once more -- please . 
 can i really believe you this time , <person> ? </s>


In [7]:
# Index pair that separates two utterances: end-of-sentence followed by
# start-of-sentence.
pattern = [word_to_indices['</s>'], word_to_indices['<s>']]

# Walk the first training example from its end so that the first match found
# is the LAST separator, i.e. the boundary before the final utterance.
first_example = train_set[0]
for ind in reversed(range(len(first_example))):
    if pattern == first_example[ind:ind + 2]:
        break_pt = ind
        break

# Everything before the separator is the context; everything after the
# two-token separator is the output.
context = first_example[:break_pt]
output = first_example[break_pt + 2:]

for tokens in (context, output):
    print(convert_to_words(tokens, indices_to_word))

<s> you lied to me so many times -- </s> <s> reggie -- trust me once more -- please .
can i really believe you this time , <person> ? </s>


In [8]:
# Apply the last-separator split to every training example, then right-pad
# all contexts and all outputs to a common length so they stack into 2-D arrays.
# Relies on `pattern` (the '</s>', '<s>' index pair) defined in the cell above.

PADDING = 10003  # index appended as filler after the real tokens
full_context = []
full_output = []
max_len_context = 0
max_len_output = 0

for i, example in enumerate(train_set):
    # Scan from the end so the match is the LAST separator in the example.
    for ind in range(len(example))[::-1]:
        if pattern == example[ind:ind + 2]:
            break_pt = ind
            break
    else:
        # Without this guard a separator-less example would silently be split
        # at the break point left over from the previous example.
        raise ValueError(
            "No '</s> <s>' separator found in training example %d" % i)

    context = example[:break_pt]
    output = example[break_pt + 2:]

    # Track the longest sequence on each side to know how far to pad.
    max_len_context = max(max_len_context, len(context))
    max_len_output = max(max_len_output, len(output))

    full_context.append(context)
    full_output.append(output)

# Add padding to all contexts and outputs
for i in range(len(full_context)):
    full_context[i] = full_context[i] + [PADDING] * (max_len_context - len(full_context[i]))
    full_output[i] = full_output[i] + [PADDING] * (max_len_output - len(full_output[i]))

full_context = np.array(full_context)
full_output = np.array(full_output)

print(full_context.shape)
print(full_output.shape)

(196308, 1500)
(196308, 1483)


In [9]:
# Per the original note, the embeddings map to the generated word dictionary.
# For each embedding object, print it in full and then the shape of its
# first element.
for emb in (emb_wordvec, emb_mt):
    print(emb)
    print(emb[0].shape)

[array([[  6.87465274e-04,  -4.67660745e-03,   9.75181500e-03, ...,
          6.63852702e-03,   1.36801081e-02,   1.22574021e-02],
       [ -1.38690870e-03,  -4.40222603e-03,  -4.37635566e-03, ...,
         -7.12008740e-03,  -8.75116355e-03,   1.79811661e-03],
       [  1.16901258e-02,  -6.77742948e-03,  -1.96688132e-03, ...,
          7.17334130e-03,  -8.84545310e-03,   7.13838460e-03],
       ..., 
       [  1.82406867e-01,  -9.06357709e-01,   8.65739462e-01, ...,
          5.20825236e-01,   8.90028319e-01,   1.24629413e+00],
       [  1.05338908e-02,  -8.79297778e-01,  -1.47780843e+00, ...,
          1.16881122e+00,  -3.22146225e-01,   2.51425509e+00],
       [ -4.78275664e-01,   4.13101999e-01,   2.49392604e-01, ...,
         -1.25812961e+00,  -1.46552975e+00,   4.08299012e-01]]), array([[ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0