In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
import numpy

In [2]:
#load doc into memory
def load_doc(filename):
    file = open(filename,mode='rt',encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [3]:
# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

### Now clean each sentence: 

    Remove all non-printable characters.
    Remove all punctuation characters.
    Normalize all Unicode characters to ASCII (e.g. Latin characters).
    Normalize the case to lowercase.
    Remove any remaining tokens that are not alphabetic.


In [4]:
def clean_pairs(lines):
    cleaned = list()
    #using regex for character filtering
    re_print = re.compile('[^%s]' %re.escape(string.printable))
    
    #prepare translation table for removing punctuation
    table = str.maketrans('','',string.punctuation)
    
    for pair in lines:
        clean_pair = list()
        for line in pair:
            #normalize unicode char
            line = normalize('NFD',line).encode('ascii','ignore')
            line = line.decode('UTF-8')
            #tokenize on white space
            line = line.split()
            #now convert it to lowercase
            line = [word.lower() for word in line]
            #remove punctuation from each token
            line = [word.translate(table) for word in line]
            #remove non-printable characters from each line
            line = [re_print.sub('',w) for w in line]
            #remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            
            #store as string
            clean_pair.append(''.join(line))
        cleaned.append(clean_pair)
    return numpy.array(cleaned)

In [5]:
#save the cleaned data to a file
def save_cleaned_data(sentences,filename):
    dump(sentences,open(filename,'wb'))
    print('Saved: %s' %filename)

In [6]:
#calling functions
#load dataset
filename = 'deu.txt'
doc = load_doc(filename)
#split it into english-german pairs
pairs = to_pairs(doc)
#clean sentences
clean_pairs = clean_pairs(pairs)
#save it into a file
save_cleaned_data(clean_pairs,'english-german.pkl')

Saved: english-german.pkl


In [7]:
clean_pairs.shape  #152,820 words #2 columns (1 for each lang)

(152820, 2)

In [8]:
clean_pairs

array([['hi', 'hallo'],
       ['hi', 'grugott'],
       ['run', 'lauf'],
       ...,
       ['ifsomeonewhodoesntknowyourbackgroundsaysthatyousoundlikeanativespeakeritmeanstheyprobablynoticedsomethingaboutyourspeakingthatmadethemrealizeyouwerentanativespeakerinotherwordsyoudontreallysoundlikeanativespeaker',
        'wennjemandderdeineherkunftnichtkenntsagtdassduwieeinmuttersprachlersprichstbedeutetdasdassmanwahrscheinlichetwasandeinersprechweisebemerkthatdaserkennenliedassdukeinmuttersprachlerbistmitanderenwortenduhorstdichnichtwirklichwieeinmuttersprachleran'],
       ['ifsomeonewhodoesntknowyourbackgroundsaysthatyousoundlikeanativespeakeritmeanstheyprobablynoticedsomethingaboutyourspeakingthatmadethemrealizeyouwerentanativespeakerinotherwordsyoudontreallysoundlikeanativespeaker',
        'wennjemandfremdesdirsagtdassdudichwieeinmuttersprachleranhorstbedeutetdaswahrscheinlicherhatetwasandeinemsprechenbemerktdassdichalsnichtmuttersprachlerverratenhatmitanderenwortenduhorstdichnichtwir

In [9]:
for i in range(100):
    print('[%s] ==> [%s]' %(clean_pairs[i][0],clean_pairs[i][1]))

[hi] ==> [hallo]
[hi] ==> [grugott]
[run] ==> [lauf]
[wow] ==> [potzdonner]
[wow] ==> [donnerwetter]
[fire] ==> [feuer]
[help] ==> [hilfe]
[help] ==> [zuhulf]
[stop] ==> [stopp]
[wait] ==> [warte]
[hello] ==> [hallo]
[itry] ==> [ichprobierees]
[iwon] ==> [ichhabgewonnen]
[iwon] ==> [ichhabegewonnen]
[smile] ==> [lacheln]
[cheers] ==> [zumwohl]
[freeze] ==> [keinebewegung]
[freeze] ==> [stehenbleiben]
[gotit] ==> [verstanden]
[gotit] ==> [einverstanden]
[heran] ==> [errannte]
[heran] ==> [erlief]
[hopin] ==> [machmit]
[hugme] ==> [druckmich]
[hugme] ==> [nimmmichindenarm]
[hugme] ==> [umarmemich]
[ifell] ==> [ichfiel]
[ifell] ==> [ichfielhin]
[ifell] ==> [ichsturzte]
[ifell] ==> [ichbinhingefallen]
[ifell] ==> [ichbingesturzt]
[iknow] ==> [ichwei]
[ilied] ==> [ichhabegelogen]
[ilost] ==> [ichhabeverloren]
[im] ==> [ichbinjahrealt]
[im] ==> [ichbin]
[imok] ==> [mirgehtsgut]
[imok] ==> [esgehtmirgut]
[noway] ==> [unmoglich]
[noway] ==> [dasgibtsdochnicht]
[noway] ==> [ausgeschlossen]
[now

### Reduce the dataset to 10k examples 
9k for training and 1k for testing

In [10]:
from pickle import load,dump
from numpy.random import rand,shuffle

#load the clean dataset
def load_clean_dataset(filename):
    return load(open(filename,'rb'))

#for saving a list of clean sentences to file
def save_clean_data(sentences,filename):
    dump(sentences,open(filename,'wb'))
    print('Saved: %s' %filename)
    
raw_dataset = load_clean_dataset('english-german.pkl')

#reducing dataset size to 10k
n_sentences = 10000
dataset = raw_dataset[:n_sentences,:]

#random shuffle
shuffle(dataset)

#split it into train(9k) and test(1k) data
train,test = dataset[:9000],dataset[9000:]

#save it 
save_clean_data(dataset,'english-german-both.pkl')
save_clean_data(train,'english-german-train.pkl')
save_clean_data(test,'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl
