In [67]:
import string
import math
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
from pickle import load
from numpy.random import rand
from numpy.random import shuffle

In [3]:
# load doc into memory
def load_doc(filename):
    """Read an entire UTF-8 text file into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete contents of the file as a single string.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

# split a loaded document into sentences
def to_pairs(doc):
    """Break a document into lines and each line into tab-separated fields.

    Args:
        doc: Text in which records are separated by newlines and fields
            within a record by tab characters.

    Returns:
        A list with one entry per line; each entry is the list of fields
        obtained by splitting that line on tabs.
    """
    rows = []
    # Leading/trailing whitespace is stripped from the whole document first,
    # so a trailing newline does not produce an extra empty record.
    for row in doc.strip().split('\n'):
        rows.append(row.split('\t'))
    return rows

# clean a list of lines
def clean_pairs(lines):
    """Normalise every sentence in a list of sentence groups.

    Each sentence is reduced to ASCII, split on whitespace, lowercased,
    stripped of punctuation and non-printable characters, and filtered so
    that only purely alphabetic tokens remain.

    Args:
        lines: Iterable of sentence groups (e.g. [source, target] pairs),
            each an iterable of strings.

    Returns:
        A numpy array built from the cleaned groups, each sentence being its
        surviving tokens joined by single spaces.
    """
    # Matches any character that is not in string.printable.
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)

    def _scrub(text):
        # Decompose accented characters, then drop whatever is not ASCII.
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # isalpha() rejects tokens containing digits as well as tokens
            # left empty after the removals above.
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[_scrub(text) for text in pair] for pair in lines])

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Object to serialise with pickle.
        filename: Path of the output file; it is opened in binary write mode.
    """
    # Close the handle deterministically so the data is flushed to disk
    # before the confirmation is printed.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [4]:
# load dataset
filename = 'spa.txt'
doc = load_doc(filename)
# split into english-spanish pairs
pairs = to_pairs(doc)
# clean sentences; the result gets its own name so that the clean_pairs()
# function is not overwritten and this cell can be run more than once
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'english-spanish.pkl')
# spot check rows 900-999, clamped so a shorter dataset does not raise IndexError
for i in range(900, min(1000, len(cleaned_pairs))):
    print('[%s] => [%s]' % (cleaned_pairs[i, 0], cleaned_pairs[i, 1]))

Saved: english-spanish.pkl
[be careful] => [ten cuidado]
[be careful] => [se cuidadoso]
[be content] => [estate contento]
[be on time] => [llega a tiempo]
[be on time] => [llegue a tiempo]
[be patient] => [sea paciente]
[be serious] => [se serio]
[birds sing] => [los pajaros cantan]
[birds sing] => [los pajaros estan cantando]
[bring food] => [traed comida]
[bring help] => [traed ayuda]
[bring wine] => [trae vino]
[can i come] => [puedo ir]
[can i come] => [puedo venir]
[can i come] => [puedo acercarme]
[can i help] => [puedo ayudar]
[can i stay] => [me puedo quedar]
[carry this] => [lleva esto]
[check that] => [comprobad eso]
[check this] => [comprueba esto]
[choose one] => [escoge uno]
[come again] => [vuelve otra vez]
[come alone] => [ven solo]
[come along] => [vente]
[come along] => [venganse]
[come early] => [veni temprano]
[come early] => [ven temprano]
[come early] => [vengan temprano]
[come early] => [venga temprano]
[come on in] => [pasale]
[come on in] => [pasele]
[come on in

In [70]:
def load_clean_sentences(filename):
    """Load a pickled object from ``filename``.

    Args:
        filename: Path of the pickle file; it is opened in binary read mode.

    Returns:
        The object stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook (or another trusted source) produced.
    with open(filename, 'rb') as handle:
        return load(handle)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Object to serialise with pickle.
        filename: Path of the output file; it is opened in binary write mode.
    """
    # Close the handle deterministically so the data is flushed to disk
    # before the confirmation is printed.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-spanish.pkl')
# shuffle before filtering so that any cap applied through n_sentences below
# keeps a random subset rather than the first rows of the file
shuffle(raw_dataset)
# keep only pairs in which both sentences have at most max_words
# space-separated tokens
max_words = 4
new_dataset = array([
    [pair[0], pair[1]]
    for pair in raw_dataset
    if len(pair[0].split(" ")) <= max_words and len(pair[1].split(" ")) <= max_words
])
# Use every filtered pair. The size is taken from new_dataset because
# `dataset` is not assigned until the next line, so reading it here raises
# NameError on a fresh kernel. Lower n_sentences to cap the dataset size.
n_sentences = len(new_dataset)
dataset = new_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test: the first 80% of rows train, the remainder test
split = math.floor(len(dataset) - (len(dataset)*0.2))
train, test = dataset[:split], dataset[split:]
# save
save_clean_data(dataset, 'english-spanish-both.pkl')
save_clean_data(train, 'english-spanish-train.pkl')
save_clean_data(test, 'english-spanish-test.pkl')

Saved: english-spanish-both.pkl
Saved: english-spanish-train.pkl
Saved: english-spanish-test.pkl
