In [99]:
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
%matplotlib inline

In [100]:
FREQ_THRESHOLD = 10 # only keep "words" occurring in at least this many training sequences
REPLACE_MISSING_DICT_VALS_WITH='-7777777' # sentinel dictionary key; its index stands in for words missing from the dictionary
USE_TEST_SET_FOR_TR = True # if True, the test sequences are added to the training data before encoding

In [101]:
# Every artifact produced by this notebook is written below 'out/',
# so create that directory on first run.
if not os.path.isdir('out/'):
    os.makedirs('out/')

In [102]:
# Load the raw data; both frames are read through their 'Id' and 'Sequence'
# columns by the cells below.
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [103]:
# calculate frequencies of words

# word -> number of training sequences the word occurs in
freqs = {}

for x in train_df.Sequence:
    # set() counts a word at most once per sequence; sorted() visits the words
    # in the same (lexicographic) order np.unique did, so the insertion order of
    # `freqs` is deterministic, and the keys stay plain `str` objects instead of
    # NumPy string scalars.
    for w in sorted(set(x.split(','))):
        # dict.get works on Python 2 and 3 (dict.has_key was removed in Python 3)
        freqs[w] = freqs.get(w, 0) + 1

In [104]:
# create dictionary (word -> ix), keeping only words that occur in at least
# FREQ_THRESHOLD training sequences
dictionary = {}
i = 1 # start at 1: index 0 is the value used to pad sequences later in this notebook

# iterating the dict directly yields its keys on both Python 2 and 3
# (dict.iterkeys() no longer exists in Python 3)
for w in freqs:
    if freqs[w] >= FREQ_THRESHOLD:
        dictionary[w] = i
        i += 1

In [105]:
# add sentinel value for missing dict values
# The sentinel must not collide with a real word. Note that re-running this
# cell without rebuilding `dictionary` trips the assertion, because the
# sentinel is then already present.
assert REPLACE_MISSING_DICT_VALS_WITH not in dictionary, \
    'sentinel %r is already a dictionary key' % (REPLACE_MISSING_DICT_VALS_WITH,)
# The sentinel gets the next free index; an empty dictionary (no word reached
# FREQ_THRESHOLD) would make max() raise, so fall back to 0 in that case.
dictionary[REPLACE_MISSING_DICT_VALS_WITH] = 1 + (max(dictionary.values()) if dictionary else 0)

In [106]:
# np.save stores a dict as a pickled 0-d object array; on recent NumPy versions
# it is read back with np.load(path, allow_pickle=True).item()
np.save('out/dictionary@tr=%d.npy'%(FREQ_THRESHOLD), dictionary)
# a single pre-formatted argument prints the same text on Python 2 and 3
print('Dictionary size: %d' % len(dictionary))

Dictionary size: 13822


In [107]:
# Sanity check: no Id may appear twice, neither within one set nor across the
# training and test sets, because the encoded sequences are keyed by Id.
train_ids = train_df['Id'].values
test_ids = test_df['Id'].values
assert len(np.unique(np.concatenate((train_ids, test_ids)))) == len(train_ids) + len(test_ids)

In [108]:
def gen_encoded_sequences(df, dictionary, extract_last=False, fill_dict_misses_with=None):
    """Encode every comma-separated sequence in `df` as a list of dictionary indexes.

    Parameters
    ----------
    df : DataFrame
        Must provide an 'Id' column and a 'Sequence' column whose values are
        comma-separated strings of words.
    dictionary : dict
        Mapping word -> index.
    extract_last : bool
        If True, the last encoded item of every kept sequence is removed from
        the sequence and stored in `last_items` instead.
    fill_dict_misses_with : index or None
        Index substituted for words that are not in `dictionary`. If None,
        a sequence containing any such word is dropped entirely.

    Returns
    -------
    sequences : dict
        id -> list of indexes (without the last item when `extract_last`).
    last_items : dict
        id -> index of the removed last item (empty unless `extract_last`).
    max_len : int
        Length of the longest kept sequence, measured after the optional
        removal of the last item; -1 if no sequence was kept.
    """
    max_len = -1
    sequences = {} # id -> list of ix
    last_items = {} # id -> ix

    # Only two columns are needed, so zip them instead of paying for
    # df.iterrows(), which builds a Series object for every row.
    for seq_id, raw_seq in tqdm(zip(df['Id'], df['Sequence']), total=len(df)):
        encoded = []

        for w in raw_seq.split(','):
            # `in` works on Python 2 and 3 (dict.has_key was removed in Python 3)
            if w in dictionary:
                encoded.append(dictionary[w])
            elif fill_dict_misses_with is not None:
                encoded.append(fill_dict_misses_with)
            else:
                # unknown word and no replacement configured: drop the sequence
                encoded = None
                break

        if encoded is None:
            continue

        if extract_last:
            last_items[seq_id] = encoded.pop()

        sequences[seq_id] = encoded
        max_len = max(max_len, len(encoded))

    return sequences, last_items, max_len

In [109]:
# Optionally add the test sequences to the training data.
if USE_TEST_SET_FOR_TR:
    # pd.concat replaces DataFrame.append, which was deprecated and then
    # removed in pandas 2.0; like append, it keeps the original row labels.
    combined_tr_df = pd.concat([train_df, test_df])
else:
    combined_tr_df = train_df

# No fill value is passed, so sequences containing out-of-dictionary words are
# dropped; extract_last=True splits off the final item of each kept sequence.
train_e_seq, train_e_last, train_max_len = gen_encoded_sequences(
    combined_tr_df, dictionary=dictionary, extract_last=True)

len(train_e_seq)

227690it [00:19, 11465.33it/s]


101851

In [110]:
# Encode the test set. Every test sequence is kept: words that are not in the
# dictionary are replaced by the sentinel's index instead of dropping the
# sequence, and no last item is split off.
test_e_seq, test_e_last, test_max_len = gen_encoded_sequences(
    test_df,
    dictionary=dictionary,
    extract_last=False,
    fill_dict_misses_with=dictionary[REPLACE_MISSING_DICT_VALS_WITH],
)

len(test_e_seq)

113845it [00:10, 11009.79it/s]


113845

In [111]:
# pad sequences to max_len
max_len = max(train_max_len, test_max_len)

# print(...) with a single argument behaves the same on Python 2 and 3.
# NOTE(review): the sleep presumably lets the printed value appear before the
# tqdm progress bars start writing -- confirm before removing it.
print(max_len); time.sleep(1)


def _left_pad_in_place(e_seq, target_len):
    """Replace every sequence in `e_seq` (id -> list of ix) by an int32 array
    of length `target_len`, padded on the left with zeros."""
    # iterate over a snapshot of the keys because the values are reassigned
    # inside the loop (dict.iteritems() no longer exists in Python 3)
    for seq_id in tqdm(list(e_seq)):
        seq = np.array(e_seq[seq_id], dtype=np.int32)
        e_seq[seq_id] = np.pad(seq, (target_len - len(seq), 0),
                               'constant', constant_values=0)


_left_pad_in_place(train_e_seq, max_len)
_left_pad_in_place(test_e_seq, max_len)

347


101851it [00:02, 40799.56it/s]
113845it [00:02, 42138.14it/s]


In [112]:
# Persist the encoded data; the frequency threshold is part of each file name.
# All three objects are dicts keyed by sequence Id:
#   train_e_seq / test_e_seq : id -> zero-padded int32 array of dictionary indexes
#   train_e_last             : id -> dictionary index of the item removed from the end of the sequence
np.save('out/train_e_seq@tr=%d.npy'%(FREQ_THRESHOLD), train_e_seq)
np.save('out/train_e_last@tr=%d.npy'%(FREQ_THRESHOLD), train_e_last)
np.save('out/test_e_seq@tr=%d.npy'%(FREQ_THRESHOLD), test_e_seq)