In [22]:
import os, re
import pandas as pd

def stripXML(t):
    '''Tell whether a line of text should be kept.

    Despite the name, nothing is removed from ``t``: this is a filter
    predicate.  It returns False for an empty line and for a line whose
    first character is '<' (markup) or '(' (parenthesised remark), and
    True for anything else.  Only the very first character is inspected,
    so leading whitespace is not skipped.
    '''
    # Guard: an empty line has no first character to inspect.
    if len(t) == 0:
        return False
    return t[0] not in ('<', '(')
    
def onlyLetters(t):
    '''Return the letter-only words of ``t`` joined by single spaces.

    A "word" is a maximal run of Unicode word characters that are not
    an underscore or one of the ASCII digits 0-9; punctuation, markup
    characters and whitespace act as separators and are dropped.

    Parameters:
        t: UTF-8 encoded byte string, or already-decoded text.

    Returns:
        The cleaned text, or "" when ``t`` is not valid UTF-8, is not
        string-like at all, or contains no letters.
    '''
    try:
        # Only byte strings need decoding.  Calling .decode() on text that is
        # already decoded either fails outright or round-trips through ASCII,
        # which used to turn such input into "".
        text = t.decode('utf8') if isinstance(t, (bytes, bytearray)) else t
        words = re.findall(r'[^\W_0-9]+', text, re.U)
    except (UnicodeError, TypeError):
        # Best effort: undecodable bytes or non-string input give an empty
        # result.  Unrelated errors (e.g. KeyboardInterrupt) are no longer
        # swallowed as they were by the bare except.
        return ""
    return " ".join(words)


def makeRecord(t, lang, pad="<PAD>", n=70):
    '''Build one "sentence, lang" CSV line from a piece of text.

    The text is reduced to letters with onlyLetters(), padded with
    ``pad`` tokens, and followed by ", ", the language code (with any
    '/' removed) and a newline.

    Parameters:
        t: text to clean and pad.
        lang: language code; may carry a '/' (e.g. 'bg/'), which is removed.
        pad: padding token.
        n: padding target used in the pad-count formula below.

    Returns:
        The CSV line, or "" if encoding the cleaned text raises
        UnicodeDecodeError.

    NOTE(review): the number of pad tokens is ``n - words + 1``, where
    ``words`` is the length of the cleaned text split on single spaces
    (an empty text counts as 1).  A text of 1..n words therefore ends
    up with n + 1 space-separated tokens, not n; confirm with the
    consumer of these files before changing it.
    '''
    label = lang.replace('/', '')
    try:
        cleaned = onlyLetters(t).encode('utf-8')
    except UnicodeDecodeError:
        return ""
    word_count = len(cleaned.split(' '))
    # A non-positive multiplier yields no pad tokens, leaving only the
    # single separating space after the sentence.
    padding = " " + " ".join([pad] * (n - word_count + 1))
    return cleaned + padding + ", {0}\n".format(label)


def splitNWords(t, n=70):
    '''Clean ``t`` with onlyLetters() and cut it into chunks of at most ``n`` words.

    Words are obtained by splitting the cleaned, UTF-8 encoded text on
    single spaces.  Returns a list of space-joined chunks in original
    order; every chunk has ``n`` words except possibly the last.  Text
    with no letters yields a list holding one empty string.
    '''
    tokens = onlyLetters(t).encode('utf-8').split(' ')
    chunks = []
    for start in range(0, len(tokens), n):
        chunks.append(" ".join(tokens[start:start + n]))
    return chunks


### Little utility script to merge all the separate files into one big CSV per language


- Strip out XML tags and non-letter characters
- Had to deal with the Cyrillic alphabet and other non-ASCII text
- Split into sentences of 70 words or fewer, then padded with `<PAD>` for batching

In [23]:
# Output folder for the per-language CSVs and input folder holding one
# sub-directory of text files per language.
datadir = './short_clean_data/'
txtdir = './txt/'

# Language codes that already have a CSV in datadir: the code is the part of
# the file name after the last '_' and before the first '.'.
already_done = []
for d in os.listdir(datadir):
    if '.csv' in d:
        already_done.append(d.split('_')[-1].split('.')[0])

# Language sub-directories of txtdir that still need to be processed.
langs = []
for d in os.listdir(txtdir):
    if os.path.isdir(txtdir + d) and d not in already_done:
        langs.append(d)

In [25]:
# Convert every language folder into one CSV of padded "sentence, lang" lines.
for lang in langs:
    lang_dir = lang + '/'
    # The output is opened in append mode, so re-running this cell for a
    # language adds to an existing file.  The with-block guarantees the file is
    # flushed and closed once the language is finished (it was never closed
    # before, leaking one handle per language).
    with open(os.path.join(datadir, 'cleaned_data_{}.csv'.format(lang)), 'a') as out_file:
        txt_files = os.listdir(txtdir + lang_dir)
        print("Starting {}, {} files to read.".format(lang_dir, len(txt_files)))
        for txt_name in txt_files:
            # Read the original text
            with open(txtdir + lang_dir + txt_name, 'r') as f:
                raw_text = f.read()
            # Keep only the lines accepted by stripXML, cut each into chunks of
            # at most 70 words, and turn every chunk into a padded CSV line
            # tagged with the language.
            records = [
                makeRecord(chunk, lang)
                for line in raw_text.split('\n') if stripXML(line)
                for chunk in splitNWords(line)
            ]
            out_file.writelines(records)

Starting bg/, 6586 files to read.
Starting cs/, 8842 files to read.
Starting da/, 9373 files to read.
Starting de/, 9224 files to read.
Starting el/, 9271 files to read.
Starting en/, 9672 files to read.
Starting es/, 9433 files to read.
Starting et/, 8819 files to read.
Starting fi/, 9335 files to read.
Starting fr/, 9450 files to read.
Starting hu/, 8763 files to read.
Starting it/, 9486 files to read.
Starting lt/, 8819 files to read.
Starting lv/, 8787 files to read.
Starting nl/, 9433 files to read.
Starting pl/, 8821 files to read.
Starting pt/, 9434 files to read.
Starting ro/, 6576 files to read.
Starting sk/, 8804 files to read.
Starting sl/, 8742 files to read.
Starting sv/, 9402 files to read.


In [26]:
# Merge every per-language CSV into one combined CSV.
langs = [d for d in os.listdir(txtdir) if os.path.isdir(txtdir + d)]
colnames = ['txt', 'lang']

# Read each language file once and concatenate in a single call; calling
# pd.concat inside the loop re-copies all previously loaded rows every time.
lang_frames = [
    pd.read_csv(os.path.join(datadir, 'cleaned_data_{}.csv'.format(lang)), names=colnames)
    for lang in langs
]
# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no language folder was found.
df = pd.concat(lang_frames) if lang_frames else pd.DataFrame({}, columns=colnames)

df.to_csv(os.path.join(datadir, 'cleaned_data_all.csv'), index=False)

In [20]:
# Number of rows currently held in `df`.
len(df)

13891602

In [21]:
# Preview the first rows of `df` to eyeball the padded text and language columns.
df.head()

Unnamed: 0,txt,lang
0,Състав на Парламента вж протоколи<PAD> <PAD> <...,bg
1,Одобряване на протокола от предишното заседани...,bg
2,Състав на Парламента вж протоколи<PAD> <PAD> <...,bg
3,Проверка на пълномощията вж протоколи<PAD> <PA...,bg
4,Внасяне на документи вж протоколи<PAD> <PAD> <...,bg
