In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch 
from torch import nn


In [2]:
df = pd.read_csv('datasets/cmn.txt', sep='\t', header=None, names = ['eng', 'cn', 'info'])
df.sample(10)

Unnamed: 0,eng,cn,info
9429,I want something to read.,我要些讀的東西。,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4002,The problem is Tom.,問題是湯姆。,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
28740,Tom could have dealt with the problem in a bet...,汤姆本可以用更好的方式处理问题。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
19597,The rain prevented me from going.,雨大得让我不能走。,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
29014,English is spoken by more people than any othe...,說英語的人比說任何其他語言的人多。,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4342,Don't drop this cup.,不要掉了这个杯子。,CC-BY 2.0 (France) Attribution: tatoeba.org #7...
2185,Are you busy now?,现在你忙吗？,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
21854,The offer is too good to turn down.,此提議好得令人難以拒絕。,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
19490,She ripped her dress on a branch.,她在树枝上把自己的裙子扯烂了。,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
29754,"To make matters worse, he isn't even conscious...",让事情更糟糕的是，他没有注意到他打扰到了邻居。,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [3]:
df.shape

(29909, 3)

In [4]:
#data preprocessing

#lowercase
df['cn'] = df['cn'].str.lower()
df['eng'] = df['eng'].str.lower()

#remove punctuations

import string
df['cn'] = df['cn'].str.translate(str.maketrans('', '', string.punctuation))
df['eng'] = df['eng'].str.translate(str.maketrans('', '', string.punctuation))

In [5]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [6]:
df.sample(5)

Unnamed: 0,eng,cn,info
27203,it was not long before we met again by chance,没多久，我们又碰巧遇到了。,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
20005,can i make a reservation for golf,我能预定一下打高尔夫球吗？,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
8400,lets all stay in touch,让我们保持联络。,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
18123,my problem is i dont trust you,我的问题是我不信任你。,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
14970,i have only five thousand yen,我只有5000日元。,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [7]:
# remove extra space
df['cn'] = df['cn'].str.strip()
df['eng'] = df['eng'].str.strip()

In [8]:
#add start and end marks as <start> and <end> respectively
df['cn'] = '<start> ' + df['cn'] + ' <end>'
df['eng'] = '<start> ' + df['eng'] + ' <end>'

In [9]:
#tokenization
#find vocab size of df['eng'] and df['cn']

from collections import Counter
eng_counter = Counter([word for line in df['eng'] for word in line.split()])
cn_counter = Counter([word for line in df['cn'] for word in line.split()])

eng_vocabs = list(eng_counter.keys())
chi_vocabs = list(cn_counter.keys())


print(eng_vocabs[:5], chi_vocabs[:5])




['<start>', 'hi', '<end>', 'run', 'stop'] ['<start>', '嗨。', '<end>', '你好。', '你用跑的。']


In [10]:
#find the biggest sequence lenght with that sequence

max_len_eng = max([len(line.split(' ')) for line in df['eng']])
max_len_chi = max([len(line.split(' ')) for line in df['cn']])

print(max_len_eng, max_len_chi)

34 5


In [11]:
ll =([len(line.split()) for line in df['eng']])
print(ll.index(34))
df['eng'][29907]

29908


'<start> i got fired from the company but since i have a little money saved up for the time being i wont have trouble with living expenses <end>'

In [12]:
input_words = sorted(eng_vocabs)
target_words = sorted(chi_vocabs)
print(input_words[-5:])

['zigzagged', 'zimbabwe', 'zip', 'zipper', 'zoo']


In [13]:
#Machine translation begins

num_encoder_tokens = len(input_words) + 1 #+1 for zero padding
num_decoder_tokens = len(target_words) + 1

print(num_encoder_tokens, num_decoder_tokens)

7289 26284


In [57]:
print(input_word_index['<START>'])

KeyError: '<START>'

In [14]:
#index: word
input_word_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_word_index = dict([(word, i+1) for i, word in enumerate(target_words)])
print(input_word_index.items())



In [16]:
rev_input_char_index = dict((i, word) for word, i in input_word_index.items())
rev_target_char_index = dict((i, word) for word, i in target_word_index.items())

In [15]:
#Train test split garam aba
from sklearn.model_selection import train_test_split
X = df['eng']
y = df['cn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((23927,), (5982,))

In [17]:
#generate batch data
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    for j in range(0, len(X), batch_size):

        encoder_input_data = np.zeros((batch_size, max_len_eng),dtype='float32')
        decoder_input_data = np.zeros((batch_size, max_len_chi),dtype='float32')
        decoder_target_data = np.zeros((batch_size, max_len_chi, num_decoder_tokens),dtype='float32')

        for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):

            for no, each in enumerate(input_text.split()):
                encoder_input_data[i, no] = input_word_index[each] #encoder input sequience

            for no, each in enumerate(target_text.split()):
                if no<len(target_text.split())-1:
                    decoder_input_data[i, no] = target_word_index[each] # decoder input sequence
                if no>0: #decoder target seq does not include the start token so offset by 1
                    decoder_target_data[i, no - 1, target_word_index[each]] = 1.
        yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [56]:
encoder_in_data = np.zeros((len(df['eng']), 9), dtype = 'float32')

decoder_in_data = np.zeros((len(df['cn']), 5), dtype = 'float32')

decoder_target_data = np.zeros((len(df['cn']), 5, num_decoder_tokens), dtype = 'float32')

MemoryError: Unable to allocate 14.6 GiB for an array with shape (29909, 5, 26284) and data type float32