In [1]:
import numpy as np
from __future__ import division

# Path of the pre-trained GloVe embedding file that is passed to loadGloVe below.
filename = 'data/glove.6B.300d.txt' 
# (GloVe data set downloaded from: https://nlp.stanford.edu/projects/glove/)


def loadGloVe(filename):
    """Read a GloVe text file into parallel vocabulary and vector lists.

    Every non-empty line is split on single spaces: the first field is the
    token, the remaining fields are its vector components.

    Args:
        filename: Path of the GloVe text file.

    Returns:
        A ``(vocab, embd)`` tuple. ``vocab`` is the list of tokens and ``embd``
        the list of their components (still strings), both in file order.
    """
    vocab = []
    embd = []
    # Explicit UTF-8 keeps decoding independent of the platform default, and
    # the context manager closes the file even if parsing raises.
    with open(filename, 'r', encoding='utf-8') as glove_file:
        # Iterate lazily rather than via readlines() so the raw text is not
        # held in memory alongside the parsed rows.
        for line in glove_file:
            row = line.strip().split(' ')
            if row == ['']:
                # Blank line (e.g. trailing newline): it would otherwise add
                # an empty token with an empty vector.
                continue
            vocab.append(row[0])
            embd.append(row[1:])
    print('GloVe Loaded.')
    return vocab, embd

# Pre-trained GloVe embedding
vocab,embd = loadGloVe(filename)

# Parse the string components straight into float32. Going through
# np.asarray(embd) first would materialise a much larger intermediate array
# of strings before the cast.
embedding = np.asarray(embd, dtype=np.float32)

word_vec_dim = len(embd[0]) # word_vec_dim = dimension of each word vector

GloVe Loaded.


In [2]:
from nltk.corpus import wordnet as wn
def get_synonymn(word):
    """Return a WordNet lemma name that can stand in for ``word``.

    Only the first synset returned by ``wn.synsets(word)`` is consulted.

    Args:
        word: Token to look up.

    Returns:
        The first lemma name of that synset that differs from ``word``, or
        ``None`` when there is no synset for ``word`` or the synset lists no
        other lemma name.
    """
    synsets = wn.synsets(word)
    if not synsets:
        # No synset at all: report "no synonym" the same way as the
        # no-other-lemma case instead of raising IndexError.
        return None
    for name in synsets[0].lemma_names():
        if name != word:
            return name
    return None

In [11]:
import csv
import nltk as nlp
from nltk import word_tokenize
import string
import re


summaries = []
texts = []

# newline='' hands newline processing to the csv module, which it needs in
# order to parse quoted fields that contain line breaks correctly.
with open('data/data.csv', 'r', encoding="ISO-8859-1", newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        # The column values are tokenized as-is; no other cleaning happens here.
        summaries.append(word_tokenize(row['Summaries']))
        texts.append(word_tokenize(row['Text']))


In [12]:
import random

# Pick one example at random and show its tokenized text and summary.
index = random.randint(0, len(texts) - 1)
sample_text = texts[index]
sample_summary = summaries[index]

print("SAMPLE CLEANED & TOKENIZED TEXT: \n\n" + str(sample_text))
print("\nSAMPLE CLEANED & TOKENIZED SUMMARY: \n\n" + str(sample_summary))

SAMPLE CLEANED & TOKENIZED TEXT: 

['Continental', "'may", 'run', 'out', 'of', 'cash', "'"]

SAMPLE CLEANED & TOKENIZED SUMMARY: 

['Shares', 'in', 'Continental', 'Airlines', 'have', 'tumbled', 'after', 'the', 'firm', 'warned', 'it', 'could', 'run', 'out', 'of', 'cash.In', 'a', 'filing', 'to', 'US', 'regulators', 'the', 'airline', 'warned', 'of', '``', 'inadequate', 'liquidity', "''", 'if', 'it', 'fails', 'to', 'reduce', 'wage', 'costs', 'by', '$', '500m', 'by', 'the', 'end', 'of', 'February', '.', 'Continental', 'also', 'said', 'that', ',', 'if', 'it', 'did', 'not', 'make', 'any', 'cuts', ',', 'it', 'expects', 'to', 'lose', '``', 'hundreds', 'of', 'millions', 'of', 'dollars', "''", 'in', '2005', 'in', 'current', 'market', 'conditions', '.', 'Failure', 'to', 'make', 'cutbacks', 'may', 'also', 'push', 'it', 'to', 'reduce', 'its', 'fleet', ',', 'the', 'group', 'said', '.', 'Shares', 'in', 'the', 'fifth', 'biggest', 'US', 'carrier', 'had', 'fallen', '6.87', '%', 'on', 'the', 'news', 'to',

In [13]:
def np_nearest_neighbour(x):
    """Return the row of ``embedding`` with the highest cosine similarity to ``x``.

    Rows whose similarity is undefined because a norm is zero are ranked
    last; otherwise the NaN produced by 0/0 would win the argmax.

    Args:
        x: Vector with one component per embedding column (expected 1-D).

    Returns:
        The best-matching row of ``embedding``. If every similarity is
        undefined (e.g. ``x`` is all zeros) the first row is returned.
    """
    x = np.asarray(x)

    # Matrix-vector product: the dot product of x with every embedding row,
    # without building the full element-wise product first.
    xdoty = np.dot(embedding, x)

    xlen = np.sqrt(np.sum(np.square(x), 0))
    ylen = np.sqrt(np.sum(np.square(embedding), 1))
    xlenylen = np.multiply(xlen, ylen)

    # Divide only where the denominator is non-zero, then push the undefined
    # entries to -inf so they can never be selected over a real similarity.
    defined = xlenylen > 0
    safe_denominator = xlenylen.copy()
    safe_denominator[~defined] = 1
    cosine_similarities = np.divide(xdoty, safe_denominator)
    cosine_similarities[~defined] = -np.inf

    return embedding[np.argmax(cosine_similarities)]
    


def word2vec(word):
    """Convert a word into its vector representation.

    Lookup order: the word itself, then the synonym proposed by
    ``get_synonymn``, and finally the 'unk' entry.

    Args:
        word: Token to embed.

    Returns:
        The matching row of ``embedding``.
    """
    if word in vocab:
        return embedding[vocab.index(word)]

    try:
        synonymn = get_synonymn(word)
    except Exception:
        # The synonym lookup is best-effort: any ordinary failure means
        # "no synonym". KeyboardInterrupt/SystemExit are deliberately not
        # caught, so a long preprocessing run can still be interrupted.
        synonymn = None

    if synonymn is not None and synonymn in vocab:
        return embedding[vocab.index(synonymn)]
    return embedding[vocab.index('unk')]

def vec2word(vec):
    """Convert a vector representation into the word it represents.

    A row of ``embedding`` that equals ``vec`` exactly is preferred. When
    there is none, the vector is replaced by its nearest neighbour from
    ``np_nearest_neighbour`` and the lookup is repeated.

    Args:
        vec: Vector (array-like) with one component per embedding column.

    Returns:
        The entry of ``vocab`` at the index of the matching embedding row.
    """
    vec = np.asarray(vec)
    # One vectorised comparison against every row. This replaces the
    # per-row loop over xrange, which does not exist on Python 3.
    row_matches = np.all(embedding == vec, axis=1)
    if row_matches.any():
        # argmax on a boolean array gives the index of the first True.
        return vocab[int(np.argmax(row_matches))]
    return vec2word(np_nearest_neighbour(vec))


In [14]:
# Sanity check: display the vector that word2vec returns for 'unk'.
word = "unk"
header = "Vector representation of '" + str(word) + "':\n"
print(header)
word_vector = word2vec(word)
print(word_vector)

Vector representation of 'unk':

[  3.00709993e-01  -4.68670011e-01  -2.06169993e-01  -8.09780002e-01
  -2.38890007e-01   2.43290007e-01   1.65379997e-02  -3.56869996e-02
  -2.23059997e-01   9.51889992e-01  -3.22730005e-01   2.19799995e-01
  -6.75240010e-02  -3.72200012e-01  -3.97179991e-01  -4.38609987e-01
   1.19670004e-01  -2.99640000e-01   2.84369998e-02  -8.75440016e-02
   1.65690005e-01  -4.94509995e-01  -6.20109975e-01  -1.65739998e-01
  -9.72179994e-02  -9.94739980e-02  -8.03069994e-02  -3.93379986e-01
  -2.41950005e-01   3.20230007e-01  -5.33200026e-01  -4.01840001e-01
  -6.71350002e-01  -7.85610005e-02   5.55459976e-01   2.99970001e-01
  -9.96500030e-02  -6.70350015e-01   1.26690000e-01  -1.86179996e-01
  -6.26209974e-02   4.52899992e-01   3.92650008e-01   2.41209999e-01
  -4.14739996e-01  -6.18900001e-01  -1.04120001e-01  -3.10429990e-01
  -6.67880010e-03  -8.32480013e-01   6.51499987e-01   9.01809990e-01
   2.41459999e-02  -7.07660019e-02  -3.95799994e-01  -3.64870012e-01
 

In [15]:
# Cap the number of examples so that the following steps run faster.

MAXIMUM_DATA_NUM = 50000

# Keep only the leading MAXIMUM_DATA_NUM entries of each list.
texts = texts[:MAXIMUM_DATA_NUM]
summaries = summaries[:MAXIMUM_DATA_NUM]

In [16]:
# Restricted vocabulary: every distinct token of `texts` that has a GloVe
# entry, in order of first appearance, with its vector at the same index.
vocab_limit = []
embd_limit = []

# Hash-based membership tests keep this pass linear; scanning the lists
# `vocab` and `vocab_limit` for every token would make it quadratic.
glove_words = set(vocab)
collected_words = set()

for text in texts:
    for word in text:
        if word in glove_words and word not in collected_words:
            collected_words.add(word)
            vocab_limit.append(word)
            embd_limit.append(word2vec(word))

In [17]:
# Extend the restricted vocabulary with the tokens of `summaries` that have a
# GloVe entry and are not collected yet. Sets give constant-time membership
# tests instead of scanning the `vocab` and `vocab_limit` lists per token.
glove_words = set(vocab)
collected_words = set(vocab_limit)

for summary in summaries:
    for word in summary:
        if word in glove_words and word not in collected_words:
            collected_words.add(word)
            vocab_limit.append(word)
            embd_limit.append(word2vec(word))


In [None]:
# Ensure the 'eos' and 'unk' entries exist in the restricted vocabulary.
for special_token in ('eos', 'unk'):
    if special_token not in vocab_limit:
        vocab_limit.append(special_token)
        embd_limit.append(word2vec(special_token))

# All-zero vector used as the embedding of the padding token.
null_vector = np.zeros([word_vec_dim])

# Guarded like the entries above so that re-running this cell does not add a
# second '<PAD>' entry.
if '<PAD>' not in vocab_limit:
    vocab_limit.append('<PAD>')
    embd_limit.append(null_vector)

In [None]:
# One float32 array per summary: a vector for each token, followed by the
# vector of 'eos'.
vec_summaries = []

for summary in summaries:
    token_vectors = [word2vec(token) for token in summary]
    token_vectors.append(word2vec('eos'))
    vec_summaries.append(np.asarray(token_vectors).astype(np.float32))

In [None]:
# One float32 array per text, holding a vector for each of its tokens.
vec_texts = []

for text in texts:
    token_vectors = [word2vec(token) for token in text]
    vec_texts.append(np.asarray(token_vectors).astype(np.float32))

In [None]:
# Persist the processed data: each object is pickled to its own file in the
# working directory.

import pickle

pickle_targets = (
    ('vocab_limit', vocab_limit),
    ('embd_limit', embd_limit),
    ('vec_summaries', vec_summaries),
    ('vec_texts', vec_texts),
)
for target_path, payload in pickle_targets:
    with open(target_path, 'wb') as fp:
        pickle.dump(payload, fp)
