In [None]:
import pandas as pd
import numpy

In [None]:
import spacy

In [None]:
# Load the spaCy pipeline registered under the name "en"; used by tokenize().
# NOTE(review): short model names like "en" look spaCy-version specific — confirm
# this still resolves with the installed spaCy.
nlp = spacy.load("en")

In [None]:
def tokenize(x):
    """Run the module-level ``nlp`` pipeline on the text ``x`` and return its result.

    The call passes ``parse``, ``tag`` and ``entity`` all set to ``False``.
    NOTE(review): these keyword flags look like an older spaCy call signature —
    confirm they are accepted by the installed version.
    """
    switched_off = {"parse": False, "tag": False, "entity": False}
    return nlp(x, **switched_off)


In [None]:
# Relative path of the training text file that the rest of the notebook reads.
data_file = 'data/wikitext-2/train.txt'

In [None]:
def load_file(file_name):
    """Lazily yield one tokenized document per line of ``file_name``.

    Args:
        file_name: path of the text file to read.

    Yields:
        The result of ``tokenize(line)`` for each line, in file order.

    This is a generator: the file is opened on first iteration and the
    result can be consumed only once.
    """
    # Open the path that was passed in (previously the global `data_file`
    # was opened and the argument was silently ignored).
    with open(file_name) as handle:
        for line in handle:
            yield tokenize(line)
        

In [None]:
# load_file is a generator function, so nothing is read yet; `f` is a one-shot
# iterator and must be re-created before it can be iterated a second time.
f = load_file(data_file)

In [None]:
def get_tokens(file):
    """Count how often each token text occurs.

    Args:
        file: iterable of sentences, each an iterable of objects exposing a
            ``.text`` attribute.

    Returns:
        dict mapping token text to its number of occurrences.
    """
    tokens = {}
    for sent in file:
        for token in sent:
            text = token.text
            # dict.get handles the first occurrence explicitly; the previous
            # bare `except:` also swallowed unrelated errors (and interrupts).
            tokens[text] = tokens.get(text, 0) + 1
    return tokens

In [None]:
%%time 
# Consumes the generator `f`; re-running this cell without re-creating `f`
# iterates an exhausted generator and produces an empty dict.
tokens = get_tokens(f)

In [None]:
import math
def norm_probs(tokens):
    """Convert raw counts into rounded natural-log relative frequencies.

    Args:
        tokens: dict mapping a key to its count.

    Returns:
        dict with the same keys; each value is ``log(count / total)`` rounded
        to 4 decimal places, where ``total`` is the sum of all counts.
    """
    total = sum(tokens.values())
    log_probs = {}
    for key, count in tokens.items():
        share = count / float(total)
        log_probs[key] = round(math.log(share), 4)
    return log_probs
def sort_probs(probs):
    """Return the ``(key, value)`` pairs of ``probs`` ordered by value, largest first.

    Pairs with equal values keep the order in which the dict yields them.
    """
    def _value_of(pair):
        return pair[1]

    ranked = list(probs.items())
    ranked.sort(key=_value_of, reverse=True)
    return ranked

In [None]:
# Log relative frequency per token text, then the same data as a list of
# (token, log-frequency) pairs sorted from most to least frequent.
probs = norm_probs(tokens)
sorted_probs = sort_probs(probs)

In [None]:
import nltk

In [None]:
def get_ngrams(tokens, min_length = 5, max_N=4):
    """Yield ``(token, splits)`` for every token of at least ``min_length`` characters.

    Args:
        tokens: iterable of ``(token, score)`` pairs; only the token string
            is used, the second element is ignored.
        min_length: shortest token (in characters) that is emitted.
        max_N: largest n-gram size; sizes ``2..max_N`` inclusive are produced.

    Yields:
        ``(token, splits)`` where ``splits`` holds one list per n-gram size
        (in increasing size), each listing the token's contiguous character
        n-grams from left to right. A size larger than the token yields an
        empty list.
    """
    for token, _ in tokens:
        if len(token) < min_length:
            continue
        # Slicing produces exactly the substrings that joining the character
        # tuples from nltk.ngrams did, without the list/tuple/join round-trip.
        splits = [
            [token[start:start + size] for start in range(len(token) - size + 1)]
            for size in range(2, max_N + 1)
        ]
        yield token, splits

In [None]:
def get_pos_encoded(token_len,splits):
    """Sum, per n-gram, its relative positions across all split lists.

    Args:
        token_len: length used as the divisor for every position.
        splits: iterable of n-gram lists (one list per n-gram size).

    Returns:
        dict mapping each n-gram to the sum of ``position / token_len`` over
        all of its occurrences, where ``position`` is its index within the
        list it appears in.
    """
    weights = {}
    for group in splits:
        for offset, piece in enumerate(group):
            # Start from 0 on first sight, then accumulate the relative offset.
            weights[piece] = weights.get(piece, 0) + offset/float(token_len)
    return weights
        

In [None]:
%%time
# For every token long enough to be split, store its
# {n-gram: summed relative position} dictionary keyed by the token text.
all_splits = {}
for token, splits in get_ngrams(sorted_probs):
    all_splits[token] = get_pos_encoded(len(token), splits)
   

In [None]:
%%time
# Build one small frame per word. reset_index() turns the {n-gram: weight}
# Series into two columns ('index' = n-gram, 0 = weight); the word is then
# inserted as the first column. The frames are concatenated in the next cell.
result = []
for key, value in all_splits.items():
    row = pd.Series(value).reset_index()
    row.insert(0, 'word', key)
    result.append(row)
    

In [None]:
# Stack the per-word frames into one long table. This rebinds `result` from a
# list to a DataFrame, so the cell is not re-runnable without first re-running
# the cell that builds the list.
result = pd.concat(result, ignore_index=True)

In [None]:
# Display the long (word, n-gram, weight) table.
result

In [None]:
# Number of rows per n-gram. Each (word, n-gram) pair occurs once in `result`
# (the per-word data came from dict keys), so this is the number of words
# containing each n-gram.
split_counts = result['index'].value_counts()

In [None]:
# Normalise the counts separately for each n-gram length: every count is
# divided by (total count for that length * the length itself), then the
# per-length pieces are stitched back together.
# NOTE: this overwrites `split_counts`, so re-running the cell normalises the
# already-normalised values again.
lens = split_counts.index.str.len().unique()
splits_by_len = [split_counts[split_counts.index.str.len() == x] for x in lens]
# `lens` is in order of first appearance, not sorted, so each group is paired
# with its actual length instead of assuming list position i means length i+2.
splits_by_len = [x / (x.sum() * n) for n, x in zip(lens, splits_by_len)]
split_counts = pd.concat(splits_by_len)

In [None]:
#split_counts = split_counts / split_counts.sum()

In [None]:
# Candidate pruning thresholds: selected quantiles of the normalised scores.
split_quants = split_counts.quantile([0.25, 0.4, 0.6, 0.75, 0.85, 0.95])

In [None]:
# Keep only n-grams whose normalised score is strictly above the 0.6 quantile.
pruned_splits = split_counts[split_counts > split_quants[0.6]]

In [None]:
# How many n-grams survive the pruning.
pruned_splits.count()

In [None]:
# Shorthand for the n-gram column of `result`; used by a later inspection cell.
t = result['index']

In [None]:
# Distribution of n-gram lengths among the surviving (distinct) n-grams.
pruned_splits.index.str.len().value_counts()

In [None]:
# Same length distribution, but counted over the rows of `result` that use a
# surviving n-gram (i.e. weighted by how many words contain each n-gram).
t[result['index'].isin(pruned_splits.index)].str.len().value_counts()

In [None]:
# Rows of the long table whose n-gram survived pruning.
pruned_result = result[result['index'].isin(pruned_splits.index)]

In [None]:
# Number of distinct words left after pruning and number of surviving n-grams;
# the last line displays both.
word_count = len(pruned_result['word'].unique())
s_count = pruned_splits.count()
word_count, s_count

In [None]:
%%time 
# Reshape the long table to wide form: one row per word, one column per
# n-gram, missing combinations filled with 0. No `values=` is given, so the
# column labels are expected to be two-level (value column, n-gram) — later
# cells read the n-gram from level 1. NOTE(review): confirm on the installed
# pandas version.
piv = pd.pivot_table(pruned_result, index=['word'], columns=['index']).fillna(0)

In [None]:
# Plain array view of the pivot: rows follow piv.index (words), columns follow
# piv.columns (n-grams).
word_split_m = piv.values

In [None]:
%%time
# Reduce each word's n-gram row to 500 components with truncated SVD
# (fixed random_state for repeatable results).
# NOTE(review): presumably requires more than 500 n-gram columns — confirm
# against the pruned vocabulary size.
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=500, n_iter=7, random_state=42)
word_split_red = svd.fit_transform(word_split_m)

In [None]:
# Sanity check: (number of words, number of SVD components).
word_split_red.shape

In [None]:
# One row per n-gram column of word_split_m, one column per SVD component.
# Allocated zero-filled (the raw ndarray constructor returns uninitialised
# memory); the rows are filled in by a later cell.
split_m = numpy.zeros((word_split_m.shape[1], word_split_red.shape[1]))

In [None]:
# Peek at the first column label of the pivot to see its structure.
piv.columns[0]

In [None]:
%%time
# NOTE(review): `a` is not used anywhere in the visible notebook.
a = word_split_m[0]
# For every n-gram column: sum the reduced vectors of all words whose weight
# for that n-gram is > 0, then scale the sum to unit length.
for i in range(split_m.shape[0]):
    split_v = word_split_red[word_split_m[:,i] > 0].sum(axis=0)
    # Guard against dividing by zero when no word contributes (or the sum cancels out).
    if numpy.linalg.norm(split_v) > 0:
        split_v = split_v / numpy.linalg.norm(split_v)
    split_m[i] = split_v

In [None]:
def split2index(x):
    """Return the column position in ``piv`` whose level-1 label equals ``x``.

    Uses ``argmax`` on a boolean mask, so the result is 0 when ``x`` is not
    present at all — callers cannot tell "missing" from "first column".
    """
    mask = piv.columns.get_level_values(1) == x
    return mask.argmax()


def index2split(x):
    """Return the full column label of ``piv`` at position ``x``."""
    return piv.columns[x]


def word2vector(e):
    """Return the reduced vector of word ``e`` (row of ``word_split_red``).

    Like ``split2index``, an unknown word silently maps to row 0.
    """
    row = (piv.index == e).argmax()
    return word_split_red[row]


def split2vector(e):
    """Return the row of ``split_m`` belonging to n-gram ``e``."""
    return split_m[split2index(e)]


def index2word(x):
    """Return the word stored at row position ``x`` of ``piv``."""
    return piv.index[x]

In [None]:
# Pick one n-gram by column position, grab its vector and print its label.
# NOTE(review): 6652 is a hard-coded position that depends on the current
# data and pruning — confirm it is still in range / still the intended n-gram.
ind = 6652
w = split_m[ind]
splt = index2split(ind)
print(splt)

In [None]:
# Positions of the 5 rows of split_m with the largest dot product with `w`,
# best match first. The filling loop scales non-zero rows to unit length, so
# for those rows the dot product equals cosine similarity.
w.dot(split_m.T).argsort()[-5:][::-1]

In [None]:
# Look up the label at a specific column position.
# NOTE(review): 13410 looks like a value copied from an earlier run's output —
# confirm it is still meaningful for the current data.
index2split(13410)

In [None]:
# Pick a word by row position.
# NOTE(review): the next cell immediately reassigns `e`, so this value is
# never used if the cells run in order.
e = piv.index[3001]

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.spatial.distance` has been loaded.
import scipy.spatial.distance
# Rank the n-grams of one word by cosine distance between the word's reduced
# vector and each n-gram's vector (smallest distance first).
e = 'eating'
evec = word2vector(e)
# If the word did not survive pruning this selection is empty and
# `distances` ends up as an empty list.
possible_splits = pruned_result[pruned_result['word'] == e]['index']
distances = sorted(((p, scipy.spatial.distance.cosine(evec, split2vector(p))) for p in possible_splits),key=lambda x:x[1])

In [None]:
# Show the (n-gram, cosine distance) pairs, closest first.
distances

In [None]:
# Column position of the n-gram 'Edu'. split2index uses argmax on a boolean
# mask, so a result of 0 can also mean "not found".
split2index('Edu')