In [1]:
import os, re, sys, time, json, codecs, copy
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from inspect import getargspec

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec

from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
os.chdir("/Users/meif/Desktop/SI 630 NLP/Project/")

from Code.Input_functions import *

% matplotlib inline 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Initialization
EMBEDDING_WORD = 'Embedding/glove.6B/glove.6B.100d.txt'  # 'Embedding/GoogleNews-vectors-negative300.bin'
EMBEDDING_CHAR = 'Embedding/char2vecLearned25'
TRAIN_DATA_FILE = 'Dataset/cnn/questions/training'
VAL_DATA_FILE = 'Dataset/cnn/questions/validation'
TEST_DATA_FILE = 'Dataset/cnn/questions/test'

N_TRAININGPOINTS = 100000
MAX_NUM_WORDS = 20000 # for filtering

MAX_SEQUENCE_LENGTH_NEWS = 300 # median ~ 650
MAX_SEQUENCE_LENGTH_QUES = 46 # max ~ 46
EMBEDDING_DIM_WORD = 100
EMBEDDING_DIM_CHAR = 25
EMBEDDING_DIM = EMBEDDING_DIM_WORD + EMBEDDING_DIM_CHAR

UNK_WORD = "<UNK_WORD>"
UNK_CHAR = "^"
UNK_ENTITY = "<UNK_ENTITY>"

# 0. Helper Functions

In [148]:
# load_dataset(data_file, name, first=100000000, remove_stopwords=False, stem_words=False, remove_punc=False, keep_period=True)


# 1. Load Dataset

In [3]:
# Load Dataset
start_time = time.time()

print("Loading datasets")
'''
datasets = {"news":[], "questions":[], "answers":[]}
entities = [(news, questions, answer, entities)]
'''
entities = defaultdict(list)
trainsets, entities["train"] = load_dataset(TRAIN_DATA_FILE, "train", N_TRAININGPOINTS, remove_stopwords=False, stem_words=False, remove_punc=False)
valsets, entities["val"] = load_dataset(VAL_DATA_FILE, "val", N_TRAININGPOINTS//10, remove_stopwords=False, stem_words=False, remove_punc=False)
testsets, entities["test"] = load_dataset(TEST_DATA_FILE, "test", N_TRAININGPOINTS//10, remove_stopwords=False, stem_words=False, remove_punc=False)
print("Found {} questions in trainset".format(len(trainsets["answers"]))) # 380298
print("Found {} questions in valset".format(len(valsets["answers"]))) # 3924
print("Found {} questions in testset".format(len(testsets["answers"]))) # 3198

print("{} sec".format(time.time() - start_time)) # 545sec for all, 68sec for 50000

Loading datasets
Finished 1000 questions in train
Finished 2000 questions in train
Finished 3000 questions in train
Finished 4000 questions in train
Finished 5000 questions in train
Finished 6000 questions in train
Finished 7000 questions in train
Finished 8000 questions in train
Finished 9000 questions in train
Finished 10000 questions in train
Finished 11000 questions in train
Finished 12000 questions in train
Finished 13000 questions in train
Finished 14000 questions in train
Finished 15000 questions in train
Finished 16000 questions in train
Finished 17000 questions in train
Finished 18000 questions in train
Finished 19000 questions in train
Finished 20000 questions in train
Finished 21000 questions in train
Finished 22000 questions in train
Finished 23000 questions in train
Finished 24000 questions in train
Finished 25000 questions in train
Finished 26000 questions in train
Finished 27000 questions in train
Finished 28000 questions in train
Finished 29000 questions in train
Finish

In [4]:
entity_switch_dict = {"train":{}, "val":{}, "test":{}}
for idx, datasets in [("train", trainsets), ("val", valsets), ("test", testsets)]:
    for k in range(len(datasets["news"])):
        entity_switch_dict[idx][str(k)] = {}
        lst = datasets["news"][k] + " " + datasets["questions"][k]
        entity_lst = [i for i in lst.split() if i.startswith("@entity")]
        count = 0
        for ent in entity_lst:
            if ent not in entity_switch_dict[idx][str(k)]:
                entity_switch_dict[idx][str(k)][ent] = "@entity{}".format(count)
                count += 1

In [5]:
trainsets_reidx = copy.deepcopy(trainsets)
valsets_reidx = copy.deepcopy(valsets)
testsets_reidx = copy.deepcopy(testsets)
for idx, datasets, datasets_reidx in [("train", trainsets, trainsets_reidx), ("val", valsets, valsets_reidx), ("test", testsets, testsets_reidx)]:
    for k in range(len(datasets["news"])):
        datasets_reidx["news"][k] = " ".join([i if not i.startswith("@entity") else entity_switch_dict[idx][str(k)][i] for i in datasets["news"][k].split()])
        datasets_reidx["questions"][k] = " ".join([i if not i.startswith("@entity") else entity_switch_dict[idx][str(k)][i] for i in datasets["questions"][k].split()])
        datasets_reidx["answers"][k] = entity_switch_dict[idx][str(k)][datasets["answers"][k]]

In [6]:
count = 0
for i in range(len(trainsets_reidx['news'])):
    if trainsets_reidx["answers"][i] in trainsets_reidx["news"][i].split()[:300]:
        count += 1
print("{0:.2f}% of trainsets have answers".format(count/len(trainsets_reidx['news'])*100))
print("Total unique tokens in the trainset: {}".format(len(Counter([j for i in trainsets_reidx["news"] for j in i.split()]))))
print("Total unique chars in the trainset: {}".format(len(Counter([k for i in trainsets_reidx["news"] for j in i.split() for k in j]))))
print("Median news length: {}".format(np.median([len(trainsets_reidx["news"][i].split()) for i in range(len(trainsets_reidx['news']))])))

87.35% of trainsets have answers
Total unique tokens in the trainset: 82190
Total unique chars in the trainset: 188
Median news length: 699.0


In [7]:
count = 0
for i in range(len(valsets_reidx['news'])):
    if valsets_reidx["answers"][i] in valsets_reidx["news"][i].split()[:300]:
        count += 1
print("{0:.2f}% of valsets have answers".format(count/len(valsets_reidx['news'])*100))
print("Total unique tokens in the valset: {}".format(len(Counter([j for i in valsets_reidx["news"] for j in i.split()]))))
print("Total unique chars in the valset: {}".format(len(Counter([k for i in valsets_reidx["news"] for j in i.split() for k in j]))))
print("Median news length: {}".format(np.median([len(valsets_reidx["news"][i].split()) for i in range(len(valsets_reidx['news']))])))

91.21% of valsets have answers
Total unique tokens in the valset: 23273
Total unique chars in the valset: 77
Median news length: 702.5


In [8]:
count = 0
for i in range(len(testsets_reidx['news'])):
    if testsets_reidx["answers"][i] in testsets_reidx["news"][i].split()[:300]:
        count += 1
print("{0:.2f}% of testsets have answers".format(count/len(testsets_reidx['news'])*100))
print("Total unique tokens in the testset: {}".format(len(Counter([j for i in testsets_reidx["news"] for j in i.split()]))))
print("Total unique chars in the testset: {}".format(len(Counter([k for i in testsets_reidx["news"] for j in i.split() for k in j]))))
print("Median news length: {}".format(np.median([len(testsets_reidx["news"][i].split()) for i in range(len(testsets_reidx['news']))])))

93.56% of testsets have answers
Total unique tokens in the testset: 22271
Total unique chars in the testset: 77
Median news length: 650.0


In [6]:
remove_idx_train = []
for i in range(len(trainsets_reidx['news'])):
    if trainsets_reidx["answers"][i] not in trainsets_reidx["news"][i].split()[:MAX_SEQUENCE_LENGTH_NEWS]:
        remove_idx_train.append(i)
        
remove_idx_val = []
for i in range(len(valsets_reidx['news'])):
    if valsets_reidx["answers"][i] not in valsets_reidx["news"][i].split()[:MAX_SEQUENCE_LENGTH_NEWS]:
        remove_idx_val.append(i)
        
remove_idx_test = []
for i in range(len(testsets_reidx['news'])):
    if testsets_reidx["answers"][i] not in testsets_reidx["news"][i].split()[:MAX_SEQUENCE_LENGTH_NEWS]:
        remove_idx_test.append(i)

In [7]:
trainsets_reidx['news'] = [v for i,v in enumerate(trainsets_reidx['news']) if i not in remove_idx_train] 
trainsets_reidx['questions'] = [v for i,v in enumerate(trainsets_reidx['questions']) if i not in remove_idx_train] 
trainsets_reidx['answers'] = [v for i,v in enumerate(trainsets_reidx['answers']) if i not in remove_idx_train]
entities["train"] = [v for i,v in enumerate(entities["train"]) if i not in remove_idx_train]

valsets_reidx['news'] = [v for i,v in enumerate(valsets_reidx['news']) if i not in remove_idx_val] 
valsets_reidx['questions'] = [v for i,v in enumerate(valsets_reidx['questions']) if i not in remove_idx_val] 
valsets_reidx['answers'] = [v for i,v in enumerate(valsets_reidx['answers']) if i not in remove_idx_val]
entities["val"] = [v for i,v in enumerate(entities["val"]) if i not in remove_idx_val]

testsets_reidx['news'] = [v for i,v in enumerate(testsets_reidx['news']) if i not in remove_idx_test] 
testsets_reidx['questions'] = [v for i,v in enumerate(testsets_reidx['questions']) if i not in remove_idx_test] 
testsets_reidx['answers'] = [v for i,v in enumerate(testsets_reidx['answers']) if i not in remove_idx_test]
entities["test"] = [v for i,v in enumerate(entities["test"]) if i not in remove_idx_test]


In [11]:
count = 0
for i in range(len(trainsets_reidx['news'])):
    if trainsets_reidx["answers"][i] in trainsets_reidx["news"][i].split()[:300]:
        count += 1
print("{0:.2f}% of trainsets have answers".format(count/len(trainsets_reidx['news'])*100))
print("Total unique tokens in the trainset: {}".format(len(Counter([j for i in trainsets_reidx["news"] for j in i.split()]))))
print("Total unique chars in the trainset: {}".format(len(Counter([k for i in trainsets_reidx["news"] for j in i.split() for k in j]))))
print("Median news length: {}".format(np.median([len(trainsets_reidx["news"][i].split()) for i in range(len(trainsets_reidx['news']))])))

100.00% of trainsets have answers
Total unique tokens in the trainset: 77641
Total unique chars in the trainset: 180
Median news length: 658.0


In [12]:
# test
print(testsets_reidx["questions"][0])
print(testsets_reidx["news"][0])

property experts say @placeholder investment in @entity1 is set to grow
( @entity0 ) sophisticated , glamorous and spacious - - when the super - rich go house - hunting they are searching for something special . real estate in @entity1 swankier suburbs can catch a buyers eye . @entity2 , @entity3 and @entity4 have long been the stomping ground of the elite - - and are now welcoming a new wave of @entity5 investors . " the @entity6 who are coming into @entity1 now are @entity6 who themselves have worked for their money , " explains @entity7 , a @entity8 - @entity9 wealth manager based in @entity1 . " they have grown in industry and are actually part of the exciting story of the @entity5 renaissance , " she continues . " it bringing to @entity1 the best of the continent . " these investors are having a considerable impact on @entity1 property market and they mainly come from just six countries : @entity9 , @entity10 , @entity11 , @entity12 , @entity13 and @entity14 . of these , @entity9 

# 2. Word-level Embedding

## 2.0 Handle OOV

In [400]:
def getOODict():
    texts = trainsets_reidx["news"] + trainsets_reidx["questions"]

    # count all words and entities
    count_all = Counter([j for i in texts for j in i.split()])
    count_entity, count_word = [], []
    for i in count_all.keys():
        if i.startswith("@entity"):
            count_entity.append((i, count_all[i]))
        else:
            if len(i) == 1 and re.search(r"[^A-z0-9 \"\'\.\?\{\}\(\)\[\]:;!~@#$%&*<>,/+\-=_]", i):
                pass
            else:
                count_word.append((i, count_all[i]))

    # OOV, OOE, OOC
    
    notOOE = Counter(dict(count_entity)).most_common(100000000)
    notOOE = [i[0] for i in notOOE if i[1] >= 2]

    notOOV = Counter(dict(count_word)).most_common(MAX_NUM_WORDS)
    notOOV = [i[0] for i in notOOV]

    notOOC = list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 \"'.?{}()[]:;!~@#$%&*<>,/+\-=_")
        
    OODict = {"notOOV":notOOV, "notOOE":notOOE, "notOOC":notOOC}

    return OODict

In [401]:
def getOOV(datasets, _type):
    
    # count all words and entities
    count_all = Counter([j for i in datasets[_type] for j in i.split()])
    count_entity, count_word, OOC = [], [], []
    for i in count_all.keys():
        if i.startswith("@entity"):
            count_entity.append(i)
        else:
            if len(i) == 1 and re.search(r"[^A-z0-9 \"\'\.\?\{\}\(\)\[\]:;!~@#$%&*<>,/+\-=_]", i):
                OOC.append(i)
            else:
                count_word.append(i)
    
    # OOV, OOE, OOC
    OODict = getOODict()
    
    OOE = set(count_entity) - set(OODict["notOOE"])
    OOV = set(count_word) - set(OODict["notOOV"])
    OOC = set([i for i in OOC if i != "|"])
            
    return OOV, OOE, OOC

In [402]:
def handelOOV(datasets):
    
    texts = {"news":[], "questions":[]}
    
    for _type in ["news", "questions"]:
        print('Replacing {}'.format(_type))    
        OOV, OOE, OOC = getOOV(datasets, _type)
        
        for i in range(len(datasets[_type])//1000+1):
#             start_time = time.time()
            print(i)
            
            text = datasets[_type][i*1000:(i+1)*1000]
#             text = [re.sub(r"( {} )".format(" | ".join(list(OOV))), " {} ".format(UNK_WORD), txt) for txt in text]
#             text = [re.sub(r"( {} )".format(" | ".join(list(OOC))), " {} ".format(UNK_CHAR), txt) for txt in text]
#             text = [re.sub(r"( {} )".format(" | ".join(list(OOE))), " {} ".format(UNK_ENTITY), txt) for txt in text]
            for txt in text:
                txt_lst = txt.split()
                for w in range(len(txt_lst)):
                    if txt_lst[w] in OOC:
                        txt_lst[w] = UNK_CHAR
                    elif txt_lst[w] in OOE:
                        txt_lst[w] = UNK_ENTITY
                    elif txt_lst[w] in OOV:
                        txt_lst[w] = UNK_WORD
                texts[_type].append(" ".join(txt_lst))
        
#             print("{} sec".format(time.time() - start_time)) # 1600sec x 3
            
    return texts

In [8]:
handel_OOV = False

if handel_OOV:
    trainsets_OOV = handelOOV(trainsets_reidx)
    valsets_OOV = handelOOV(valsets_reidx)
    testsets_OOV = handelOOV(testsets_reidx)

    with open("Dataset/GRU/{0}_reidx/trainsets_OOV{0}_reidx.npy".format(N_TRAININGPOINTS), "w") as f:
        json.dump(trainsets_OOV, f)
    with open("Dataset/GRU/{0}_reidx/valsets_OOV{0}_reidx.npy".format(N_TRAININGPOINTS), "w") as f:
        json.dump(valsets_OOV, f)
    with open("Dataset/GRU/{0}_reidx/testsets_OOV{0}_reidx.npy".format(N_TRAININGPOINTS), "w") as f:
        json.dump(testsets_OOV, f)
else:
    with open("Dataset/GRU/{0}_reidx/trainsets_OOV{0}_reidx.npy".format(N_TRAININGPOINTS), "r") as f:
        trainsets_OOV = json.load(f)
    with open("Dataset/GRU/{0}_reidx/valsets_OOV{0}_reidx.npy".format(N_TRAININGPOINTS), "r") as f:
        valsets_OOV = json.load(f)
    with open("Dataset/GRU/{0}_reidx/testsets_OOV{0}_reidx.npy".format(N_TRAININGPOINTS), "r") as f:
        testsets_OOV = json.load(f)

# Chop into Sentences

## 2.1 Texts to Sequences (Indexing)

In [9]:
start_time = time.time()

print('Transforming texts to sequences - Word Level')

tokenizer_word = Tokenizer(filters='', lower=False, split=" ", char_level=False)
tokenizer_word.fit_on_texts(trainsets_OOV["news"] + trainsets_OOV["questions"])

trainSeqNews_word = tokenizer_word.texts_to_sequences(trainsets_OOV["news"])
trainSeqQues_word = tokenizer_word.texts_to_sequences(trainsets_OOV["questions"])

valSeqNews_word = tokenizer_word.texts_to_sequences(valsets_OOV["news"])
valSeqQues_word = tokenizer_word.texts_to_sequences(valsets_OOV["questions"])

testSeqNews_word = tokenizer_word.texts_to_sequences(testsets_OOV["news"])
testSeqQues_word = tokenizer_word.texts_to_sequences(testsets_OOV["questions"])

word_counts = tokenizer_word.word_counts
word_index = tokenizer_word.word_index
print('Found {} unique tokens'.format(len(word_index)))
print("Median News Length: {}".format(np.median(np.array([len(i.split()) for i in trainsets_OOV["news"]] + [len(i.split()) for i in valsets_OOV["news"]]))))
print("Max Question Length: {}".format(np.max(np.array([len(i.split()) for i in trainsets_OOV["questions"]] + [len(i.split()) for i in valsets_OOV["questions"]]))))

print("{} sec".format(time.time() - start_time)) # 60sec

Transforming texts to sequences - Word Level
Found 20340 unique tokens
Median News Length: 659.0
Max Question Length: 46
119.04841995239258 sec


## 2.2 Load Word Embedding

In [10]:
start_time = time.time()

print('Indexing word vectors')

# embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDING_WORD, binary=True) # a word:vec dictionary
# print('Found {} word vectors of word2vec'.format(len(embeddings_index.vocab)))

embeddings_index = {}
f = open(EMBEDDING_WORD)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

print("{} sec".format(time.time() - start_time)) # 60sec

Indexing word vectors
Found 400000 word vectors.
17.413054943084717 sec


In [406]:
print(cosine_similarity(embeddings_index["news"].reshape(1,-1), embeddings_index["cnn"].reshape(1,-1)))
print(cosine_similarity(embeddings_index["news"].reshape(1,-1), embeddings_index["journal"].reshape(1,-1)))

[[0.7162254]]
[[0.5718423]]


In [11]:
print('Preparing embedding matrix - word')

nb_words = len(word_index) + 1

embedding_word_matrix = np.zeros((nb_words, EMBEDDING_DIM_WORD))
for word, i in word_index.items():
    if word in embeddings_index and word != UNK_WORD:
        embedding_word_matrix[i] = embeddings_index[word]
    elif word.startswith("@entity"):
        num = int(word[7:])
        embedding_word_matrix[i] = np.array([0.1] * 100)
        if num < 100:
            embedding_word_matrix[i][num] = 1
        elif num < 200:
            embedding_word_matrix[i][99] = 1
            embedding_word_matrix[i][num%100] = 1
        else:
            embedding_word_matrix[i][99] = 1
            embedding_word_matrix[i][98] = 1
            embedding_word_matrix[i][num%100] = 1
print('Null word embeddings: {}'.format(np.sum(np.sum(embedding_word_matrix, axis=1) == 0)))
print('Embedding shape: {}'.format(embedding_word_matrix.shape))

Preparing embedding matrix - word
Null word embeddings: 17
Embedding shape: (20341, 100)


# 3. Char-level Embedding

## 3.1 Texts to Sequences (Indexing)

In [12]:
start_time = time.time()

print('Transforming texts to sequences - Character Level')

texts = trainsets_reidx["news"] + trainsets_reidx["questions"]
texts = [re.sub(r"[^A-z0-9 \"\'\.\?\{\}\(\)\[\]:;!~@#$%&*<>,/+\-=_]", "^", text) for text in texts]

tokenizer_char = Tokenizer(filters='', lower=False, split=" ", char_level=True)
tokenizer_char.fit_on_texts(texts)

trainSeqNews_char = tokenizer_char.texts_to_sequences(trainsets_reidx["news"])
trainSeqQues_char = tokenizer_char.texts_to_sequences(trainsets_reidx["questions"])

valSeqNews_char = tokenizer_char.texts_to_sequences(valsets_reidx["news"])
valSeqQues_char = tokenizer_char.texts_to_sequences(valsets_reidx["questions"])

testSeqNews_char = tokenizer_char.texts_to_sequences(testsets_reidx["news"])
testSeqQues_char = tokenizer_char.texts_to_sequences(testsets_reidx["questions"])

char_index = tokenizer_char.word_index
char_counts = tokenizer_char.word_counts
print('Found {} unique tokens'.format(len(char_index)))
print("{} sec".format(time.time() - start_time)) # 109sec

Transforming texts to sequences - Character Level
Found 67 unique tokens
272.4809920787811 sec


## 3.2 Generate / Load Char Embedding

In [13]:
start_time = time.time()

chars = [list(re.sub(r"[^A-z0-9 \"\'\.\?\{\}\(\)\[\]:;!~@#$%&*<>,/+\-=_]", "^", text)) for text in texts]
# char2vecLearned = Word2Vec(chars, size=EMBEDDING_DIM_CHAR, min_count=5)
# char2vecLearned.save(EMBEDDING_CHAR)
char2vecLearned = Word2Vec.load(EMBEDDING_CHAR).wv
print('Found {} word vectors of word2vec'.format(len(char2vecLearned.vocab)))

print("{} sec".format(time.time() - start_time)) # 201sec

Found 64 word vectors of word2vec
106.17054200172424 sec


In [37]:
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(char2vecLearned["("].reshape(1,-1), char2vecLearned[")"].reshape(1,-1)))
print(cosine_similarity(char2vecLearned["?"].reshape(1,-1), char2vecLearned["!"].reshape(1,-1)))
print(cosine_similarity(char2vecLearned[","].reshape(1,-1), char2vecLearned["."].reshape(1,-1)))
print(cosine_similarity(char2vecLearned["+"].reshape(1,-1), char2vecLearned["-"].reshape(1,-1)))

[[0.8475163]]
[[0.833086]]
[[0.8597956]]
[[-0.1458748]]


In [14]:
print('Preparing embedding matrix - char')

nb_chars = len(char_index) + 1

embedding_char_matrix = np.zeros((nb_chars, EMBEDDING_DIM_CHAR))
for char, i in char_index.items():
    if char in char2vecLearned.vocab and char != UNK_CHAR:
        embedding_char_matrix[i] = char2vecLearned.word_vec(char)
print('Null char embeddings: {}'.format(np.sum(np.sum(embedding_char_matrix, axis=1) == 0)))
print('Embedding shape: {}'.format(embedding_char_matrix.shape))

Preparing embedding matrix - char
Null char embeddings: 5
Embedding shape: (68, 25)


# 4. Input and Labels -- Passage

## 4.1 Pad Word Sequences as Input

In [15]:
print('Padding sequences')

N_train = pad_sequences(trainSeqNews_word, maxlen=MAX_SEQUENCE_LENGTH_NEWS, truncating="post")
Q_train = pad_sequences(trainSeqQues_word, maxlen=MAX_SEQUENCE_LENGTH_QUES, truncating="post")
print('Shape of news tensor:', N_train.shape)
print('Shape of questions tensor:', Q_train.shape)

N_val = pad_sequences(valSeqNews_word, maxlen=MAX_SEQUENCE_LENGTH_NEWS, truncating="post")
Q_val = pad_sequences(valSeqQues_word, maxlen=MAX_SEQUENCE_LENGTH_QUES, truncating="post")

N_test = pad_sequences(testSeqNews_word, maxlen=MAX_SEQUENCE_LENGTH_NEWS, truncating="post")
Q_test = pad_sequences(testSeqQues_word, maxlen=MAX_SEQUENCE_LENGTH_QUES, truncating="post")

np.save("Dataset/GRU/{0}_reidx/N_train{0}_reidx.npy".format(N_TRAININGPOINTS), N_train)
np.save("Dataset/GRU/{0}_reidx/Q_train{0}_reidx.npy".format(N_TRAININGPOINTS), Q_train)
np.save("Dataset/GRU/{0}_reidx/N_val{0}_reidx.npy".format(N_TRAININGPOINTS), N_val)
np.save("Dataset/GRU/{0}_reidx/Q_val{0}_reidx.npy".format(N_TRAININGPOINTS), Q_val)
np.save("Dataset/GRU/{0}_reidx/N_test{0}_reidx.npy".format(N_TRAININGPOINTS), N_test)
np.save("Dataset/GRU/{0}_reidx/Q_test{0}_reidx.npy".format(N_TRAININGPOINTS), Q_test)

Padding sequences
Shape of news tensor: (87355, 300)
Shape of questions tensor: (87355, 46)


## 4.2 Pad Char Sequences as Input

In [416]:
max([len(" ".join(i.split()[:300])) for i in trainsets_reidx["news"]])

1958

In [417]:
max([len(" ".join(i.split()[:300])) for i in trainsets_reidx["questions"]])

263

In [418]:
print('Padding sequences')

N_train_char = pad_sequences(trainSeqNews_char, maxlen=1960, truncating="post")
Q_train_char = pad_sequences(trainSeqQues_char, maxlen=270, truncating="post")
print('Shape of news tensor:', N_train_char.shape)
print('Shape of questions tensor:', Q_train_char.shape)

N_val_char = pad_sequences(valSeqNews_char, maxlen=1960, truncating="post")
Q_val_char = pad_sequences(valSeqQues_char, maxlen=270, truncating="post")

N_test_char = pad_sequences(testSeqNews_char, maxlen=1960, truncating="post")
Q_test_char = pad_sequences(testSeqQues_char, maxlen=270, truncating="post")

np.save("Dataset/GRU/{0}_reidx/N_train_char{0}_reidx.npy".format(N_TRAININGPOINTS), N_train_char)
np.save("Dataset/GRU/{0}_reidx/Q_train_char{0}_reidx.npy".format(N_TRAININGPOINTS), Q_train_char)
np.save("Dataset/GRU/{0}_reidx/N_val_char{0}_reidx.npy".format(N_TRAININGPOINTS), N_val_char)
np.save("Dataset/GRU/{0}_reidx/Q_val_char{0}_reidx.npy".format(N_TRAININGPOINTS), Q_val_char)
np.save("Dataset/GRU/{0}_reidx/N_test_char{0}_reidx.npy".format(N_TRAININGPOINTS), N_test_char)
np.save("Dataset/GRU/{0}_reidx/Q_test_char{0}_reidx.npy".format(N_TRAININGPOINTS), Q_test_char)

Padding sequences
Shape of news tensor: (87355, 1960)
Shape of questions tensor: (87355, 270)


## 4.2 Input Word Embedding

In [21]:
print('Preparing input embedding')

nb_words = len(word_index) + 1
sorted_word_index = sorted(word_index.items(), key=lambda x:x[1])

embedding_input_matrix = np.zeros((nb_words, EMBEDDING_DIM))

for word, wordcode in word_index.items():

    if word in [UNK_WORD, UNK_CHAR, UNK_ENTITY]:
        continue
    
    # get word embedding
    word_level_embedding = embedding_word_matrix[wordcode]

    # get char embedding
    char_level_embedding =[]
    for char in word:
        try:
            charcode = char_index[char]
        except:
            charcode = char_index[UNK_CHAR]
        char_level_embedding.append(embedding_char_matrix[charcode])
    char_level_embedding = np.mean(np.array(char_level_embedding), axis=0)
    
    # combine word and char embedding
    embedding_input_matrix[wordcode] = np.concatenate((word_level_embedding, char_level_embedding)) # (325,)

print('Null word embeddings: {}'.format(np.sum(np.sum(embedding_input_matrix, axis=1) == 0)))
print('Embedding shape: {}'.format(embedding_input_matrix.shape))
np.save("Dataset/GRU/{0}_reidx/embedding_input_matrix{0}_reidx.npy".format(N_TRAININGPOINTS), embedding_input_matrix)

Preparing input embedding
Null word embeddings: 4
Embedding shape: (20341, 125)


In [388]:
cosine_similarity(embedding_input_matrix[word_index["@entity0"]].reshape(1,-1),
                  embedding_input_matrix[word_index["@entity1"]].reshape(1,-1))

array([[0.9238118]])

## 4.3 Output Index

In [269]:
max_a = []
for i in trainsets_reidx["news"] + valsets_reidx["news"] + testsets_reidx["news"]:
    a = list(set([int(j[7:]) for j in i.split()[:300] if j.startswith("@entity")]))
    max_a += a
print(max(max_a))
a = {word_index[i]:i for i in word_index if i.startswith("@entity") and int(i[7:]) > 100}
print([j for i in N_train for j in i if j in a])
print(max([int(i[7:]) for i in trainsets_reidx["answers"]]))

100
[]
60


In [17]:
entity_index = ["@entity{}".format(i) for i in range(101)] + [UNK_ENTITY]
entity_index = {w: index for (index, w) in enumerate(entity_index)}
print('Found unique entity: {}'.format(len(entity_index)))

Found unique entity: 102


In [18]:
y_train_multi = np.array([entity_index[trainsets_reidx["answers"][i]] for i in range(len(trainsets_reidx["answers"]))])
y_val_multi = np.array([entity_index[valsets_reidx["answers"][i]] for i in range(len(valsets_reidx["answers"]))])
y_test_multi = np.array([entity_index[testsets_reidx["answers"][i]] for i in range(len(testsets_reidx["answers"]))])
y_train_multi = to_categorical(y_train_multi, num_classes=len(entity_index))
y_val_multi = to_categorical(y_val_multi, num_classes=len(entity_index))
y_test_multi = to_categorical(y_test_multi, num_classes=len(entity_index))

print('y_train labels: {}'.format(len(y_train_multi)))
print('y_val labels: {}'.format(len(y_val_multi)))
print('y_test labels: {}'.format(len(y_test_multi)))
np.save("Dataset/GRU/{0}_reidx/y_train_multi{0}_reidx.npy".format(N_TRAININGPOINTS), y_train_multi)
np.save("Dataset/GRU/{0}_reidx/y_val_multi{0}_reidx.npy".format(N_TRAININGPOINTS), y_val_multi)
np.save("Dataset/GRU/{0}_reidx/y_test_multi{0}_reidx.npy".format(N_TRAININGPOINTS), y_test_multi)

y_train labels: 87355
y_val labels: 3579
y_test labels: 2992


In [19]:
with open("Dataset/GRU/{0}_reidx/word_index{0}_reidx.json".format(N_TRAININGPOINTS), "w") as f:
    json.dump(word_index, f)

In [20]:
with open("Dataset/GRU/{0}_reidx/entity_index{0}_reidx.json".format(N_TRAININGPOINTS), "w") as f:
    json.dump(entity_index, f)

In [227]:
y_train_bi = []
for i in range(len(News_train_word)):
    y = []
    for j in News_train_word[i]:
        if j == word_index[trainsets_reidx["answers"][i]]:
            y.append(1)
        else:
            y.append(0)
    y_train_bi.append(y)
y_train_bi = np.array(y_train_bi)

In [228]:
y_val_bi = []
for i in range(len(News_val_word)):
    y = []
    for j in News_val_word[i]:
        if j == word_index[valsets_reidx["answers"][i]]:
            y.append(1)
        else:
            y.append(0)
    y_val_bi.append(y)
y_val_bi = np.array(y_val_bi)

In [229]:
y_test_bi = []
for i in range(len(News_test_word)):
    y = []
    for j in News_test_word[i]:
        if j == word_index[testsets_reidx["answers"][i]]:
            y.append(1)
        else:
            y.append(0)
    y_test_bi.append(y)
y_test_bi = np.array(y_test_bi)

In [230]:
print('y_train labels: {}'.format(len(y_train_bi)))
print('y_val labels: {}'.format(len(y_val_bi)))
print('y_test labels: {}'.format(len(y_test_bi)))
np.save("Dataset/GRU/{0}_reidx/y_train_bi{0}_reidx.npy".format(N_TRAININGPOINTS), y_train_bi)
np.save("Dataset/GRU/{0}_reidx/y_val_bi{0}_reidx.npy".format(N_TRAININGPOINTS), y_val_bi)
np.save("Dataset/GRU/{0}_reidx/y_test_bi{0}_reidx.npy".format(N_TRAININGPOINTS), y_test_bi)

y_train labels: 8716
y_val labels: 3579
y_test labels: 2992


In [231]:
cc = []
for i in range(len(News_train_word)):
    cc.append(Counter(News_train_word[i])[word_index[trainsets_reidx["answers"][i]]])

In [22]:
word_index_reverse = {code:i for i,code in word_index.items()}
entity_index_word = [code for i, code in word_index.items() if i.startswith('@entity') or i == UNK_ENTITY]

In [35]:
option_input = np.zeros((len(N_train), len(entity_index)))
count=0
for record in N_train:
    for code in record:
        if code in entity_index_word:
            word = word_index_reverse[code]
            entity = entity_index[word]
            option_input[count][entity] = code
    count+=1
    if count%1000 == 0:
        print('load' + str(count))
np.save("Dataset/GRU/100000_reidx/option_input_train100000_reidx.npy", option_input)

load1000
load2000
load3000
load4000
load5000
load6000
load7000
load8000
load9000
load10000
load11000
load12000
load13000
load14000
load15000
load16000
load17000
load18000
load19000
load20000
load21000
load22000
load23000
load24000
load25000
load26000
load27000
load28000
load29000
load30000
load31000
load32000
load33000
load34000
load35000
load36000
load37000
load38000
load39000
load40000
load41000
load42000
load43000
load44000
load45000
load46000
load47000
load48000
load49000
load50000
load51000
load52000
load53000
load54000
load55000
load56000
load57000
load58000
load59000
load60000
load61000
load62000
load63000
load64000
load65000
load66000
load67000
load68000
load69000
load70000
load71000
load72000
load73000
load74000
load75000
load76000
load77000
load78000
load79000
load80000
load81000
load82000
load83000
load84000
load85000
load86000
load87000


In [31]:
option_input[0]

array([34., 14., 18., 22., 25.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [308]:
a = np.random.rand(*option_input.shape) * option_input

In [309]:
a[0]

array([0.        , 0.31804787, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.58736229, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.0796733 ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [317]:
207/len(valsets_reidx["answers"])

0.22697368421052633