# Afrikaans to English Translation


### Preparing the  Data

Data preparation is divided into two subsections:

1. Clean Data
2. Split Text



#### 1. Cleaning Data

In [28]:
def load_doc(file_name):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The complete file content as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # Use the parameter (the original body referenced an undefined name) and
    # let the context manager close the file even when read() fails.
    with open(file_name, mode='rt', encoding='utf-8') as file:
        return file.read()


In [29]:
# splitting the loaded text by line and then by phrase.
def to_pairs(doc):
    """Split a loaded document into rows of tab-separated phrases.

    Leading and trailing whitespace is stripped from ``doc`` first; the rest
    is split on newlines and every line is split on tab characters.

    Args:
        doc: The document text.

    Returns:
        A list with one entry per line, each a list of that line's fields.
    """
    pairs = []
    for row in doc.strip().split('\n'):
        pairs.append(row.split('\t'))
    return pairs

Performing cleaning to do the following:

- Removing all non-printable characters.
- Removing all punctuation characters.
- Normalizing all Unicode characters to ASCII (e.g. Latin characters).
- Normalizing the case to lowercase.
- Removing any remaining tokens that are not alphabetic.

Performing these operations on each phrase for each pair in the loaded dataset.

In [30]:
# clean a list of lines
def clean_pairs(lines):
    """Normalise every phrase of every pair and return them as a numpy array.

    Each phrase is decomposed with NFD and reduced to ASCII, split on
    whitespace, lower-cased, stripped of punctuation and non-printable
    characters, and finally filtered down to purely alphabetic tokens, which
    are re-joined with single spaces.

    Args:
        lines: Iterable of pairs, each an iterable of phrase strings.

    Returns:
        ``numpy.array`` built from the list of cleaned pairs.
    """
    # matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def scrub(text):
        # drop everything that has no ASCII representation after decomposition
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # keep alphabetic tokens only (drops digits and emptied tokens)
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[scrub(text) for text in pair] for pair in lines])

In [31]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
 
# load doc into memory
def load_doc(filename):
    """Load a UTF-8 text document into memory.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete file content as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the file even if read() raises, which the
    # manual open()/close() sequence did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()
 
# split a loaded document into sentences
def to_pairs(doc):
    """Turn a loaded document into a list of tab-separated field lists.

    Args:
        doc: The document text; surrounding whitespace is stripped before
            the text is split into lines.

    Returns:
        One list of fields per line of the stripped document.
    """
    return [entry.split('\t') for entry in doc.strip().split('\n')]
 
# clean a list of lines
def clean_pairs(lines):
    """Clean each phrase in a list of phrase pairs.

    For every phrase: reduce it to ASCII (after NFD decomposition), tokenize
    on whitespace, lower-case the tokens, delete punctuation and
    non-printable characters, discard tokens that are not purely alphabetic
    and join what is left with single spaces.

    Args:
        lines: Iterable of pairs, each an iterable of phrase strings.

    Returns:
        ``numpy.array`` built from the cleaned pairs.
    """
    # regex matching anything outside string.printable
    printable_filter = re.compile('[^%s]' % re.escape(string.printable))
    # table used by str.translate to delete punctuation
    punct_table = str.maketrans('', '', string.punctuation)

    result = []
    for pair in lines:
        row = []
        for phrase in pair:
            # characters without an ASCII form are silently dropped here
            phrase = normalize('NFD', phrase).encode('ascii', 'ignore').decode('UTF-8')
            words = (w.lower().translate(punct_table) for w in phrase.split())
            words = (printable_filter.sub('', w) for w in words)
            # isalpha() also rejects tokens emptied by the steps above
            row.append(' '.join(w for w in words if w.isalpha()))
        result.append(row)
    return array(result)
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: The object to serialise (any picklable value).
        filename: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    # Close the handle deterministically so the data is flushed to disk
    # before the confirmation is printed.
    with open(filename, 'wb') as out_file:
        dump(sentences, out_file)
    print('Saved: %s' % filename)
 
# load dataset
filename = 'afr.txt'
doc = load_doc(filename)
# split into english-afrikaans pairs
pairs = to_pairs(doc)
# clean sentences; the result gets its own name so that the clean_pairs()
# function is not overwritten and this cell can be run again
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'english-afrikaans.pkl')
# spot check the first 100 pairs (fewer when the dataset is smaller)
for pair in cleaned_pairs[:100]:
    print('[%s] => [%s]' % (pair[0], pair[1]))

Saved: english-afrikaans.pkl
[come in] => [gaan binne]
[she runs] => [sy hardloop]
[you lost] => [jy verloor]
[go inside] => [gaan binne]
[he is poor] => [hy is arm]
[i eat rice] => [ek eet rys]
[we all lie] => [ons almal lieg]
[im thirsty] => [ek is dors]
[thats life] => [dis die lewe]
[tom hurt me] => [tom het my beseer]
[tom may die] => [tom kan dalk dood gaan]
[what a pity] => [hoe jammer]
[youve lost] => [jy verloor]
[find the cat] => [soek die kat]
[i have a car] => [ek het n kar]
[please hurry] => [maak asseblief gou]
[we all stood] => [ons almal het gestaan]
[what a shame] => [hoe jammer]
[even tom lied] => [selfs tom het gelieg]
[im not blind] => [ek is nie blind nie]
[its too ugly] => [dis te lelik]
[let tom drive] => [laat tom bestuur]
[tom had to go] => [tom moes gaan]
[tom wont die] => [tom sal nie dood gaan nie]
[was tom there] => [was tom daar]
[we need money] => [ons het geld nodig]
[you are drunk] => [jy is dronk]
[you are drunk] => [julle is dronk]
[i hate driving] =>

## Splitting Text

In [32]:

from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open(filename, 'rb') as in_file:
        return load(in_file)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Write ``sentences`` to ``filename`` as a pickle and report the path.

    Args:
        sentences: The object to serialise (any picklable value).
        filename: Destination path; an existing file is overwritten.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    # The with-block guarantees the file is flushed and closed, instead of
    # leaving that to garbage collection of an anonymous handle.
    with open(filename, 'wb') as out_file:
        dump(sentences, out_file)
    print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-afrikaans.pkl')

# reduce dataset size (keeps at most n_sentences rows)
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle (in place)
shuffle(dataset)
# split into train/test: 90% / 10% of the rows actually present, so the test
# set is not left empty when fewer than 9000 pairs were loaded
# (10000 rows still give the original 9000 / 1000 split)
n_train = len(dataset) * 9 // 10
train, test = dataset[:n_train], dataset[n_train:]
# save
save_clean_data(dataset, 'english-afrikaan-both.pkl')
save_clean_data(train, 'english-afrikaan-train.pkl')
save_clean_data(test, 'english-afrikaan-test.pkl')

Saved: english-afrikaan-both.pkl
Saved: english-afrikaan-train.pkl
Saved: english-afrikaan-test.pkl


The new files: 

`english-afrikaan-both.pkl`: contains all of the train and test examples that we can use to define the parameters of the problem.

`english-afrikaan-train.pkl`: contains the examples of the training dataset.

`english-afrikaan-test.pkl`: contains the examples of the test dataset.



## Training  Neural Translation Model



In [33]:
# load a clean dataset
def load_clean_sentences(filename):
    """Return the object stored in the pickle file ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: pickle data can run arbitrary code when loaded; restrict this to
    # files produced by this project.
    with open(filename, 'rb') as in_file:
        return load(in_file)

# load the combined, training and test datasets, in that order
dataset, train, test = [
    load_clean_sentences(pickle_path)
    for pickle_path in (
        'english-afrikaan-both.pkl',
        'english-afrikaan-train.pkl',
        'english-afrikaan-test.pkl',
    )
]


We use the Keras `Tokenizer` class to map words to integers, as needed for modeling. Separate tokenizers are used for the English sequences and the Afrikaans sequences. The function below, named `create_tokenizer()`, fits a tokenizer on a list of phrases.

In [34]:

# fit a tokenizer

from keras.preprocessing.text import Tokenizer
def create_tokenizer(lines):
    """Create a ``Tokenizer`` with default settings and fit it on ``lines``.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted


ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

In [None]:
# finding the length of the longest sequence in a list of phrases.
def max_length(lines):
    """Return the largest number of whitespace-separated tokens in any phrase.

    Args:
        lines: Iterable of phrase strings; must not be empty.

    Returns:
        The token count of the longest phrase.

    Raises:
        ValueError: If ``lines`` is empty.
    """
    token_counts = [len(phrase.split()) for phrase in lines]
    return max(token_counts)

In [None]:

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 on top of the number of entries in word_index
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare afrikaans tokenizer
afr_tokenizer = create_tokenizer(dataset[:, 1])
# use afr_tokenizer here: the previous name `ger_tokenizer` was never defined
afr_vocab_size = len(afr_tokenizer.word_index) + 1
afr_length = max_length(dataset[:, 1])
print('Afrikaans Vocabulary Size: %d' % afr_vocab_size)
print(' Afrikaans Max Length: %d' % (afr_length))
