## Clean data files and save them on disk

In [None]:

def clean_data(old_file_address, new_file_address):
    
    import string
    from unicodedata import normalize
    import re
    
    remove_punct_map = dict.fromkeys(map(ord, string.punctuation)) # thank you Reed!
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    
    with open(old_file_address, 'r', encoding='utf-8') as old_file, open(new_file_address, 'w', encoding='utf-8') as clean_file:
        for line in old_file:
            
            # normalize all Unicode characters to ASCII (maybe not nesessary)
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            
            # remove non-printable characters
            line = re_print.sub('', line)
            
            # normalize to lowercase, remove punctuation and write to a file
            clean_file.write(line.lower().translate(remove_punct_map))


In [None]:
clean_data('europarl-v7esp.txt', 'clean_spanish.txt')

In [None]:
clean_data('europarl-v7eng.txt', 'clean_english.txt')

## Data Observation with MRJob

In [None]:
# Data contains millions of rows, so I used MRJob to handle it and test locally
# Use boilerplate to count the number of words and lines

from mrjob.job import MRJob
import string

remove_punct_map = dict.fromkeys(map(ord, string.punctuation)) #thank you Reed

class TextOverview(MRJob):

    def mapper(self, _, line):
        
        line = line.translate(remove_punct_map)
        line = line.lower()
        
        yield "total_words", len(line.split())
        yield "lines", 1

    def reducer(self, key, values):
        yield key, sum(values)


if __name__ == '__main__':
    TextOverview.run()
    
#python engl-span.py clean_english.txt
#python engl-span.py clean_spanish.txt

In [None]:
# Count a number of unique words in each dataset:

from mrjob.job import MRJob
import string

remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

class TextOverview(MRJob):

    def mapper(self, _, line):

        line = line.translate(remove_punct_map)
        line = line.lower()
        for word in line.split():
            yield 'words', word

    def reducer(self, key, values):
        yield key, len(set(values))
    

if __name__ == '__main__':
    TextOverview.run()

#python engl-span.py clean_english.txt
#python engl-span.py clean_spanish.txt

***Output for english file:***
- "total_words"   48 978 039
- "lines" 1 965 734
- "unique_words" 133 052

***Output for spanish file:***
- "total_words"   51 505 465
- "lines" 1 965 734
- "unique_words" 192 038

## Prepare datafiles

In [None]:
# save combined clean txt file on disk just in case
with open('clean_english.txt', 'r') as eng, open('clean_spanish.txt', 'r') as span, open('combined.txt','w') as comb:
    
        for eline, sline in zip(eng, span):
            eline = eline.rstrip()
            sline = sline.rstrip()
            
            comb.write(f'{eline}\t{sline}\n')


In [None]:
def to_pairs(doc):
    file = open(doc, mode='r', encoding='utf-8')
    text = file.read()
    file.close()
    lines = text.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

In [None]:
# merge both files in one
pairs = to_pairs('combined.txt')

In [None]:
pairs[0]

In [7]:
def create_array(eng_file_address, esp_file_address):
    
    from numpy import array
    
    
    all_pairs = []
    with open(eng_file_address, 'r') as eng, open(esp_file_address, 'r') as span:
    
        for eline, sline in zip(eng, span):
            eline = eline.rstrip()
            sline = sline.rstrip()

            all_pairs.append([eline, sline])
        
    return array(all_pairs)

In [8]:
pairs_array = create_array('clean_english.txt', 'clean_spanish.txt')

In [111]:
for x in range(0,5):
    print(f'{x}. {pairs_array[x,0]} : {pairs_array[x,1]}')

0. resumption of the session : reanudacion del periodo de sesiones
1. i declare resumed the session of the european parliament adjourned on friday 17 december 1999 and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period : declaro reanudado el periodo de sesiones del parlamento europeo interrumpido el viernes 17 de diciembre pasado y reitero a sus senorias mi deseo de que hayan tenido unas buenas vacaciones
2. although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful : como todos han podido comprobar el gran efecto del ano 2000 no se ha producido en cambio los ciudadanos de varios de nuestros paises han sido victimas de catastrofes naturales verdaderamente terribles
3. you have requested a debate on this subject in the course of the next few days during this partsession : sus senorias han solicitado un deb

## Save data to plk file

#don't need this step
def save_data(pairs, filename):

    from sklearn.externals import joblib
    
    joblib.dump(pairs, filename) 
    
    print(f'Saved: {filename}')

save_data(pairs_array, 'english-spanish.pkl') #64Gb wow!!

## Split dataset

In [12]:
#split into train and test data and save to the file
def split_data(data, number_of_rows):
    
    from sklearn.externals import joblib
    from numpy.random import rand
    from numpy.random import shuffle
    
    shuffle(data)
    dataset = data[:number_of_rows, :]
    shuffle(dataset)
    
    unique = {}
    words = []
    for pair in dataset:
        for sentence in pair:
            for word in sentence.split(' '):
                words.append(word)
                unique[word] = 1
                    
    print(f'a number of unique words - {len(unique)}')
    print(f'a total number of words - {len(words)}')
    
    train, test = dataset[:(int(number_of_rows*0.8))], dataset[(int(number_of_rows*0.8)):]
    
    joblib.dump(dataset, 'dataset.pkl')
    joblib.dump(train, 'train.pkl')
    joblib.dump(test, 'test.pkl') 
    
    return('done')
                    

In [13]:
split_data(pairs_array, 5000)

a number of unique words - 20477
a total number of words - 257345


'done'