In [2]:
import os, re, pickle
import pandas as pd
import numpy as np
import datetime as dt
from tensorflow.contrib import learn


def stripXML(t):
    '''Strips XML and comments from text'''
    if len(t) == 0:
        return False
    elif t[0] in  ['<', '(']:
        return False
    else:
        return True
    
def onlyLetters(t):
    '''Keeps only letters in any language'''
    try: 
        words = re.findall(r'[^\W_0-9]+',t.decode('utf8'),re.U)
        return " ".join([w for w in words])
    except: 
        return ""


def makeRecord(t, lang, pad="<PAD>", n=70):
    '''Takes sentence and adds a comma,
    the language and a newline -> CSV record, 
    pads the sentence with <PAD> word to get to 70 words'''
    lang = lang.replace('/', '')
    try:
        sentence = onlyLetters(t).encode('utf-8')
    except UnicodeDecodeError:
        return ""
    length = len(sentence.split(' '))
    padwords = " " + " ".join([pad for i in range(n - length + 1)])
    sentence = sentence + padwords
    record = sentence + ", {0}\n".format(lang)
    return record


def splitNWords(t, n=70):
    sentence = onlyLetters(t).encode('utf-8')
    words = sentence.split(' ')
    sentences = [" ".join(words[i:i+n]) for i in range(0, len(words), n)]
    return sentences


### Little utility script to merge all the separate files into one big CSV per language


- Strip out XML tags and non letters
- Had to deal with cyrillic alphabet and non-ascii
- Split into senteces of 70 words or less and then padded with `<PAD>` for batching

In [2]:
datadir = './clean_txt_data/'
txtdir = './raw_txt/'
already_done = [d.split('_')[-1].split('.')[0] for d in os.listdir(datadir) if '.csv' in d]
langs = [d for d in os.listdir(txtdir) if os.path.isdir(txtdir + d) and d not in already_done]

In [3]:
for lang in langs:
    out_file = open(datadir + '/cleaned_data_{}.csv'.format(lang), 'a')
    lang = lang + '/'
    txt_files = os.listdir(txtdir + lang)
    print "Starting {}, {} files to read.".format(lang, len(txt_files))
    for txt in txt_files:
        #Read original text
        with open(txtdir + lang + txt, 'r') as f:
            txt = f.read()
        #Split into smaller 70 word sentences & flatten it
        txt_list = [splitNWords(t) for t in txt.split('\n') if stripXML(t)]
        txt_list = [item for sublist in txt_list for item in sublist]
        #Create CSV record: pad to 70 words, add language Split on newline
        txt_list = [makeRecord(t, lang) for t in txt_list]
        #Write to en file
        out_file.writelines(txt_list)

### Merge all languages into one large file
- Allows us to create a vocab and then split into smaller stratified sets

In [4]:
langs = [d for d in os.listdir(txtdir) if os.path.isdir(txtdir + d)]
colnames = ['txt', 'lang']
df = pd.DataFrame({}, columns=colnames)
for lang in langs:
    lang_df = pd.read_csv('./clean_txt_data/cleaned_data_{}.csv'.format(lang), names=colnames)
    df = pd.concat([df, lang_df])

    del(lang_df)
df.to_csv('./clean_txt_data/cleaned_data_all.csv', index=False)

In [3]:
df = pd.read_csv('./clean_txt_data/cleaned_data_all.csv')
print "Total sentences: {}".format(len(df))

Total sentences: 13891597


In [4]:
df.sample(5).head()

Unnamed: 0,txt,lang
9317166,Gerb pirmininke viskas ko norėčiau paprašyti t...,lt
3068382,Ψήφισα υπέρ της έκθεσης του κ Falbr διότι εστι...,el
1786811,Angesichts eines Vorschlags der auch die itali...,de
10142989,Ik ben van mening dat het van groot belang is ...,nl
1829904,folgenden Unsicherheit für den Sektor <PAD> <P...,de


### Create a vocab index
- Shuffle the data set
- split into sets of 400k - 500k records

In [5]:
#Create arrays of X and y, removed df from memory
x_text = df.txt.values
y = pd.get_dummies(df.lang).values

In [9]:
print pd.get_dummies(df.lang).columns.values

[' bg' ' cs' ' da' ' de' ' el' ' en' ' es' ' et' ' fi' ' fr' ' hu' ' it'
 ' lt' ' lv' ' nl' ' pl' ' pt' ' ro' ' sk' ' sl' ' sv']


In [7]:
#Create tf "vocab" and then map each word to the vocab index
start = dt.datetime.now()
max_document_length = max([len(t.split(" ")) for t in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Write vocabulary - will need to load it during 
vocab_path = './misc/20160113.vocab'
vocab_processor.save('./misc/20160113.vocab')
del x_text #save memory since we now have labels and data as arrays.
print "Finished vocab mapping, took: {}".format(dt.datetime.now() - start)

Finished vocab mapping, took: 0:48:23.797404


In [11]:
# Randomly shuffle data before saving so that each pickle file has a random subset. 
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x = x[shuffle_indices]
y = y[shuffle_indices]

### Create subsets of the Full DF
- Each is roughly 600000 rows
- Save each as its own file with x,y,path to vocab file

In [44]:
def store_data(filename, x, y, filepath='./mapped_data/', 
               vocab_path=vocab_path):
    #Stores subset of data as a pickle file
    dataset = {'x':x, 'y':y, 'vocab_path': vocab_path}
    with open(filepath + filename, 'wb') as f:
        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    print "Saved to " + filepath + filename
    return

In [46]:
filesize = 600000 #rows in each file
for i in xrange(23):
    if i == 22:
        #Open ended on the last chunk
        y_sub = y[i*filesize:]
        x_sub = x[i*filesize:]
    else:
        y_sub = y[i*filesize:(i+1)*filesize]
        x_sub = x[i*filesize:(i+1)*filesize]
    #Save as pickle
    fname = 'mapped_data_shuffled_{}.pkl'.format(i)
    store_data(fname, x_sub, y_sub)        


Saved to ./mapped_data/mapped_data_shuffled_0.pkl
Saved to ./mapped_data/mapped_data_shuffled_1.pkl
Saved to ./mapped_data/mapped_data_shuffled_2.pkl
Saved to ./mapped_data/mapped_data_shuffled_3.pkl
Saved to ./mapped_data/mapped_data_shuffled_4.pkl
Saved to ./mapped_data/mapped_data_shuffled_5.pkl
Saved to ./mapped_data/mapped_data_shuffled_6.pkl
Saved to ./mapped_data/mapped_data_shuffled_7.pkl
Saved to ./mapped_data/mapped_data_shuffled_8.pkl
Saved to ./mapped_data/mapped_data_shuffled_9.pkl
Saved to ./mapped_data/mapped_data_shuffled_10.pkl
Saved to ./mapped_data/mapped_data_shuffled_11.pkl
Saved to ./mapped_data/mapped_data_shuffled_12.pkl
Saved to ./mapped_data/mapped_data_shuffled_13.pkl
Saved to ./mapped_data/mapped_data_shuffled_14.pkl
Saved to ./mapped_data/mapped_data_shuffled_15.pkl
Saved to ./mapped_data/mapped_data_shuffled_16.pkl
Saved to ./mapped_data/mapped_data_shuffled_17.pkl
Saved to ./mapped_data/mapped_data_shuffled_18.pkl
Saved to ./mapped_data/mapped_data_shuffl