In [2]:
import json, nltk, io, pickle
import numpy as np
from itertools import chain
from collections import defaultdict as dd

In [11]:
# Debug-only word set: the second whitespace-separated field of each of the
# first 20000 lines of the `freqs` file. A different dictionary is used in
# practice.

words = set()
with io.open('/pio/data/data/reddit_sample/freqs', 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f):
        if line_no == 20000:
            break
        words.add(line.split()[1])

### Preprocess singles and make a dictionary

In [24]:
# Tokenize the single-utterance corpus with NLTK: `single` holds one token
# list per input line.
with io.open('/pio/data/data/reddit_sample/utterances.single.shuffled', 'r', encoding='utf-8') as f:
    single = [nltk.word_tokenize(line) for line in f]

In [32]:
# Corpus-wide token counts; a token that has not been seen yet starts at 0.
ws = dd(int)
for w in chain.from_iterable(single):
    ws[w] += 1

In [44]:
# (word, count) pairs ordered from most to least frequent. The sort is stable,
# so words with equal counts keep the order in which `ws.items()` yields them.
ws_sorted = sorted(ws.items(), key=lambda item: item[1], reverse=True)

In [67]:
# Vocabulary list: three special tokens followed by the 20000 most frequent
# words. The list order defines the ids: '<unk>' -> 0, '<s>' -> 1, '</s>' -> 2.
# A comprehension is used instead of `zip(*...)[0]`, which only works on
# Python 2 and raises IndexError when `ws_sorted` is empty.
voc = ['<unk>', '<s>', '</s>'] + [w for w, count in ws_sorted[:20000]]

i_to_w = voc                                  # id -> word (list index is the id)
w_to_i = {w: i for i, w in enumerate(voc)}    # word -> id
voc = set(voc)                                # rebound to a set for membership tests

In [77]:
# Persist the id -> word list. Pickle data is a byte stream, so the file is
# opened in binary mode; text mode ('w') fails on Python 3 and is unsafe on
# platforms that translate newlines.
with open('/pio/data/data/reddit_sample/wordlist.pkl', 'wb') as f:
    pickle.dump(i_to_w, f)

In [68]:
# Encode every sentence as word ids wrapped in 1 ... 2 (the list positions of
# '<s>' and '</s>' in the vocabulary); words missing from `w_to_i` become 0,
# the position of '<unk>'.
single_cut = []
for s in single:
    ids = [1]
    ids.extend(w_to_i.get(w, 0) for w in s)
    ids.append(2)
    single_cut.append(ids)

In [75]:
# Persist the encoded single utterances. Binary mode is required for pickle
# output (text mode fails on Python 3).
with open('/pio/data/data/reddit_sample/utterances.single.shuffled.pkl', 'wb') as f:
    pickle.dump(single_cut, f)

In [106]:
# Occurrence count per word id over the encoded corpus (including the
# sentence-boundary ids added during encoding).
# Sized from the id -> word list rather than from `voc`: `voc` has been rebound
# to a set, whose length would be smaller than the id range if the list ever
# contained a duplicate entry.
freqs = [0] * len(i_to_w)

for s in single_cut:
    for w in s:
        freqs[w] += 1

In [109]:
# Turn the raw counts into relative frequencies by dividing by the total
# number of ids in the encoded corpus.
# Note: re-running this cell without re-running the counting cell divides the
# already normalised array a second time.
token_total = sum(len(s) for s in single_cut)
freqs = np.array(freqs, dtype=np.float32)
freqs /= token_total

In [115]:
# Persist the relative-frequency array. Binary mode is required for pickle
# output (text mode fails on Python 3).
with open('/pio/data/data/reddit_sample/freqs.pkl', 'wb') as f:
    pickle.dump(freqs, f)

### Preprocess pairs

In [85]:
# Read the pair corpus: each line holds tab-separated utterances, and every
# utterance is tokenized with NLTK.
# A list comprehension is used instead of map(): on Python 3 map() returns a
# one-shot iterator, so each stored pair could only be traversed once.
pairs = []
with io.open('/pio/data/data/reddit_sample/utterances.pairs.shuffled', 'r', encoding='utf-8') as f:
    for line in f:
        pairs.append([nltk.word_tokenize(part) for part in line.split('\t')])

In [102]:
# Encode every utterance of every pair as word ids wrapped in 1 ... 2; words
# missing from `w_to_i` become 0.
pairs_cut = []
for s in pairs:
    encoded = []
    for u in s:
        encoded.append([1] + [w_to_i.get(w, 0) for w in u] + [2])
    pairs_cut.append(encoded)

In [104]:
# Persist the encoded pairs. Binary mode is required for pickle output (text
# mode fails on Python 3).
with open('/pio/data/data/reddit_sample/utterances.pairs.shuffled.pkl', 'wb') as f:
    pickle.dump(pairs_cut, f)

In [105]:
# Sanity check: display the first encoded pair.
pairs_cut[0]

[[1, 342, 2], [1, 28, 40, 1918, 4, 2]]

### Make a train/test split

In [126]:
# Hold out one tenth of the single utterances for testing. Floor division
# keeps the count an integer on Python 3 as well, so it can be used as a
# slice bound.
test_count = len(single_cut) // 10

In [127]:
# The last `test_count` sentences form the test set, the rest the train set.
# The split uses an explicit index instead of negative slicing: with a zero
# hold-out, `x[:-0]` is empty and `x[-0:]` is the whole list, which would swap
# the two sets.
split_at = len(single_cut) - int(test_count)
train, test = single_cut[:split_at], single_cut[split_at:]

In [130]:
# Persist the train and test portions of the single utterances. Binary mode is
# required for pickle output (text mode fails on Python 3).
with open('/pio/data/data/reddit_sample/utterances.single.shuffled.train.pkl', 'wb') as f:
    pickle.dump(train, f)

with open('/pio/data/data/reddit_sample/utterances.single.shuffled.test.pkl', 'wb') as f:
    pickle.dump(test, f)

### Same with pairs

In [132]:
# Hold out one twentieth of the pairs for testing. Floor division keeps the
# count an integer on Python 3 as well, so it can be used as a slice bound.
test_count_pairs = len(pairs_cut) // 20

In [133]:
# The last `test_count_pairs` pairs form the test set, the rest the train set.
# The split uses an explicit index instead of negative slicing: with a zero
# hold-out, `x[:-0]` is empty and `x[-0:]` is the whole list, which would swap
# the two sets.
split_at_pairs = len(pairs_cut) - int(test_count_pairs)
train_pairs, test_pairs = pairs_cut[:split_at_pairs], pairs_cut[split_at_pairs:]

In [136]:
# Persist the train and test portions of the pairs. Binary mode is required
# for pickle output (text mode fails on Python 3).
with open('/pio/data/data/reddit_sample/utterances.pairs.shuffled.train.pkl', 'wb') as f:
    pickle.dump(train_pairs, f)

with open('/pio/data/data/reddit_sample/utterances.pairs.shuffled.test.pkl', 'wb') as f:
    pickle.dump(test_pairs, f)

# RedditV3

In [14]:
# Load the v3 pair corpus. Every line must hold exactly two tab-separated
# utterances (the unpacking raises ValueError otherwise). Both are lower-cased
# and tokenized with NLTK; `v3_words` collects every distinct token seen.
v3_all = []
v3_words = set()

with io.open('/pio/data/data/reddit_sample/v3/pairsv3.uniq.censored', encoding='utf8') as f:
    for line in f:
        first, second = line.split('\t')
        tokens_pair = [nltk.word_tokenize(first.lower()),
                       nltk.word_tokenize(second.lower())]
        v3_words.update(*tokens_pair)
        v3_all.append(tokens_pair)

In [15]:
# Number of distinct lower-cased tokens in the v3 corpus.
len(v3_words)

32493

In [16]:
# Spot-check a single tokenized pair.
v3_all[241442]

[[u'when',
  u'do',
  u'you',
  u'think',
  u'we',
  u'will',
  u'see',
  u'a',
  u'fight',
  u'of',
  u'this',
  u'magnitude',
  u'again',
  u'?'],
 [u'hopefully', u',', u'never', u'.']]

In [6]:
# GloVe vocabulary (id -> word) and its inverse mapping (word -> id).
# NOTE(review): the file has a .pkl extension, so np.load presumably falls
# back to unpickling it. Only load this from a trusted source, and confirm
# whether newer NumPy versions need allow_pickle=True here.
glove_words = np.load('/pio/data/data/glove_vec/6B/glove/glove.6B.wordlist.pkl')
glove_w_to_i = {word: idx for idx, word in enumerate(glove_words)}

In [90]:
# Check which word sits at index 400002 of the GloVe word list.
glove_words[400002]

u'<s>'

In [19]:
# Sample one tenth of the v3 pairs (by position, without replacement) for the
# test set.
# Floor division keeps the sample size an integer on Python 3 as well. A
# locally seeded generator makes the split reproducible across kernel
# restarts without touching NumPy's global random state.
test_count = len(v3_all) // 10
inds = np.random.RandomState(0).choice(len(v3_all), size=test_count, replace=False)

In [20]:
def get_ns_for_s(s, w_to_i=None, bos=400002, eos=400003, unk=0):
    """Encode a tokenized sentence as a list of word ids with boundary markers.

    Parameters
    ----------
    s : iterable of str
        Tokens of one sentence.
    w_to_i : mapping, optional
        Word -> id lookup. Defaults to the notebook-level `glove_w_to_i`.
    bos : int, optional
        Id placed before the sentence. The default 400002 is the index of
        '<s>' in the GloVe word list.
    eos : int, optional
        Id placed after the sentence. NOTE(review): 400003 is presumably the
        index of '</s>' -- confirm against the word list.
    unk : int, optional
        Id used for tokens missing from `w_to_i`. NOTE(review): confirm that
        index 0 of the word list really is the unknown-word entry.

    Returns
    -------
    list of int
        `[bos, id_1, ..., id_n, eos]`; an empty sentence yields `[bos, eos]`.
    """
    if w_to_i is None:
        # Resolved at call time so the notebook-level table can be (re)built
        # after this function has been defined.
        w_to_i = glove_w_to_i
    return [bos] + [w_to_i.get(w, unk) for w in s] + [eos]

In [21]:
# Encode both utterances of every v3 pair as id lists via get_ns_for_s.
v3_num = [[get_ns_for_s(s1), get_ns_for_s(s2)] for s1, s2 in v3_all]

In [23]:
# Route every encoded pair to the test set when its position was sampled into
# `inds`, otherwise to the train set. enumerate() replaces the Python-2-only
# xrange() plus indexing.
inds = set(inds)  # set gives constant-time membership checks
test_set = []
train_set = []

for i, pair in enumerate(v3_num):
    if i in inds:
        test_set.append(pair)
    else:
        train_set.append(pair)

In [27]:
# Persist the train and test portions of the encoded v3 pairs. Binary mode is
# required for pickle output (text mode fails on Python 3).
with open('/pio/data/data/reddit_sample/v3/pairsv3.uniq.censored.glove6B.train.pkl', 'wb') as f:
    pickle.dump(train_set, f)

with open('/pio/data/data/reddit_sample/v3/pairsv3.uniq.censored.glove6B.test.pkl', 'wb') as f:
    pickle.dump(test_set, f)

In [37]:
# Flat list of every word id in the encoded v3 corpus, with the first and
# last id of each sentence (the boundary markers) stripped.
a = list(chain.from_iterable(s[1:-1] for d in v3_num for s in d))

In [39]:
# Convert the flat id list to a NumPy array for the unique/histogram cells.
a = np.array(a)

In [41]:
# Distinct word ids that occur in the corpus.
b = np.unique(a)

In [40]:
# Histogram of all word-id occurrences (token-level distribution over ids).
np.histogram(a)

(array([4334973,   53231,   17589,   10620,    3574,    3230,    2162,
           1825,    1143,    1467]),
 array([      0. ,   39972.6,   79945.2,  119917.8,  159890.4,  199863. ,
         239835.6,  279808.2,  319780.8,  359753.4,  399726. ]))

In [42]:
# Histogram of the distinct word ids (type-level distribution over ids).
np.histogram(b)

(array([19092,  3558,  1302,   603,   382,   262,   192,   173,   143,   149]),
 array([      0. ,   39972.6,   79945.2,  119917.8,  159890.4,  199863. ,
         239835.6,  279808.2,  319780.8,  359753.4,  399726. ]))

In [44]:
# Longest encoded sentence in the v3 corpus, boundary ids included.
max(len(s) for d in v3_num for s in d)

26