In [None]:
from translate.storage.tmx import tmxfile
import re
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer

## Dataset and Problem Selection

I chose to try to implement a many-to-many model of an RNN to translate from English to German. I found a dataset that was used in the Fifth Confrence on Machine Translation (WMT20 https://www.statmt.org/wmt20/translation-task.html) which was used to help translate news articles in a variety of languages. I used the TILDE Model - RAPID corpus for English to German found here: https://tilde-model.s3-eu-west-1.amazonaws.com/Tilde_MODEL_Corpus.html.

The data was in a format commonly used in machine translation called TMX, an XML-like format that contains pairs of sentences in both languages. I used the python library translate-toolkit to read in the file.

In [5]:
with open("RAPID_2019.UNIQUE.de-en.tmx", 'rb') as file:
    tmx_file = tmxfile(file, 'en', 'ar')

### Dataset Preprocessing

First I read in the language pairs from the .tmx file and perform an letters and space regex operation on each to clean the data and reduce the number of unique words. Then we perform a train-dev-test split on the data.

In [6]:
X, Y = [], []

for node in tmx_file.unit_iter():
    X.append(re.sub(r'[^A-Za-z\s]', '', node.source).lower())
    Y.append(re.sub(r'[^A-Za-z\s]', '', node.target).lower())

In [7]:
xd, yd = X[0:20000], Y[0:20000]
print('German:', xd[0], '\n')
print('English:', yd[0])

German: diversity innovative ideen fr kultur und kreativwirtschaft in europa ist ein vom europischen parlament initiiertes projekt das von der generaldirektion bildung und kultur der europischen kommission durchgefhrt wird 

English: diversity innovative ideas for the cultural and creative sectors in europe is a pilot project launched by the european parliament and organised by the commissions directorate general for education and culture


In order to have our data readable by the RNN, we need to turn it into vectors of numeric values. However because there are so many unique words in german and english, we can't One Hot encode the sentences, or each input sequence would be a n-length array of 100K+ length arrays, mostly made of zeros. Instead, I chose to use the tf.keras text Tokenizer to map each word in the corpus to a unique integer, and then pad the inputs with zeros to be the same size. Now the number of inputs into our RNN is the length of the longest sentence in the corpus.

In [10]:
def tokenize_and_pad(m):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(m)
    m_v = tokenizer.texts_to_sequences(m)
    
    maxlen = max(len(r) for r in m_v)
    m_vp = np.zeros((len(m_v), maxlen), dtype=int)
    for enu, row in enumerate(m_v):
        m_vp[enu, :len(row)] += row 
    
    return m_vp

x_vp = tokenize_and_pad(xd)
y_vp = tokenize_and_pad(yd)

    
    
#xd = tf.convert_to_tensor(xd)
#yd = tf.convert_to_tensor(yd)

I chose to use the Tensorflow library, since I am already familiar with it, and Tensorflow Keras has a good package for working with RNNs

## RNN Implementation

In [33]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16292971932113487791
]


In [12]:
# Hyperparameters

input_size = len(x_vp[0])
output_size = len(y_vp[0])

In [31]:
model = keras.Sequential()
model.add(layers.Embedding(input_dim=input_size, output_dim=input_size))
model.add(layers.SimpleRNN(200))
model.add(layers.Dense(output_size))
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 126)         15876     
_________________________________________________________________
simple_rnn_10 (SimpleRNN)    (None, 200)               65400     
_________________________________________________________________
dense_8 (Dense)              (None, 138)               27738     
Total params: 109,014
Trainable params: 109,014
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.compile(optimizer="adam", loss="mse", metrics=["accuracy"])
model.fit(x_vp, y_vp)

InvalidArgumentError:  indices[11,0] = 380 is not in [0, 126)
	 [[node sequential_10/embedding_4/embedding_lookup (defined at <ipython-input-32-d581b50c9b2e>:2) ]] [Op:__inference_train_function_6299]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_10/embedding_4/embedding_lookup:
 sequential_10/embedding_4/embedding_lookup/5778 (defined at /usr/lib/python3.8/contextlib.py:113)

Function call stack:
train_function


## Word2Vec Implementation



In [None]:
import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))

In [None]:
glove_vectors = gensim.downloader.load('glove-twitter-25')

Cosine similarity = operation on output vectors for the two words

Dissimiarity = possible inverse, look into other methods from papers

In [68]:
def cosine_sim(vec1, vec2):
    return np.dot(vec1, vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2))

def dissim(vec1, vec2):
    return np.sum(np.square(vec1 - vec2))

My Dissimilarity function is an implementation of the Euclidean Distance function. I looked into research papers on 

In [76]:

word1 = input('First word:').lower()
word2 = input('Second word:').lower()
wv1 = glove_vectors[word1]
wv2 = glove_vectors[word2]
print(cosine_sim(wv1, wv2))
print(dissim(wv1, wv2))


First word:texas
Second word:oklahoma
0.9004377
4.4934297
