In [104]:
import torch
import torch.nn as nn
import torch.optim as optim

import spacy
import re
import string
import torchtext
from nltk.tokenize import word_tokenize

import sklearn
from sklearn.model_selection import train_test_split

import numpy as np
import tqdm
import pandas as pd
import time

import os
import gc
import scipy

# Reference: https://www.kaggle.com/code/rahmafarag/ml-translate

## Load Data

In [11]:
# Load cleaned data directly if available
if os.path.isfile('./data/cleaned_data.csv'):
    df = pd.read_csv('./data/cleaned_data.csv')
else:

    start = time.time()
    df = pd.read_csv('./data/en-fr.csv')

    end = time.time()

    display(end - start)

df.head()

446.64206743240356

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


## Clean Data

In [48]:
### See reference

def to_lowercase(text):
    return text.lower()

# Remove website links
def remove_links(text):
    template = re.compile(r'https?://\S+|www\.\S+') 
    text = template.sub(r'', text)
    return text

# Remove HTML tags
def remove_html(text):
    template = re.compile(r'<[^>]*>') 
    text = template.sub(r'', text)
    return text


# Remove stopwords
def remove_stopwords(words, stop_words):
    return [word for word in words if word not in stop_words]

# Remove none ascii characters
def remove_non_ascii(text):
    template = re.compile(r'[^\x00-\x7E]+') 
    text = template.sub(r'', text)
    return text

# Replace none printable characters
def remove_non_printable(text):
    template = re.compile(r'[\x00-\x1F]+') 
    text = template.sub(r' ', text)
    return text

# Remove special characters
def remove_special_chars(text):
        text = re.sub("'s", '', text)
        template = re.compile('["#$%&\'()\*\+-/:;<=>@\[\]\\\\^_`{|}~]') 
        text = template.sub(r' ', text)
        return text

# Replace multiple punctuation 
def replace_multiplt_punc(text):
        text = re.sub('[.!?]{2,}', '.', text)
        text = re.sub(',+', ',', text) 
        return text

    # Remove numbers
def remove_numbers(text):
        text = re.sub('\d+', ' ', text)
        return text

def handle_spaces(text):
    # Remove extra spaces
    text = re.sub('\s+', ' ', text)
    
    # Remove spaces at the beginning and at the end of string
    text = text.strip() 
    
    return text

def remove_punctuation(text):
    """Remove punctuation from list of tokenized words"""
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def text2words(text):
      return word_tokenize(text)


def clean_text(text):
    text = remove_special_chars(str(text))
    text = remove_non_ascii(text)
    text = remove_non_printable(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = to_lowercase(text)
    text = handle_spaces(text)
    words = text2words(text)
    words = ['sos'] + words + ['eos']

    return ' '.join(words)


In [50]:
train_portion = 0.01
test_portion = 0.01

num_train = int(len(df) * train_portion)
num_test = int(len(df) * test_portion)


df_subset = pd.concat((df.iloc[:num_train, :], df.iloc[-num_test:, :]))

display(len(df_subset))

df_subset.tail()

450406

Unnamed: 0,en,fr
22520371,Only with a highly overcompensatory stock–recr...,C'est seulement en cas de courbe stock–recrute...
22520372,The model predicts that the assumption made ab...,Le modèle prévoit que l'hypothèse émise au suj...
22520373,Overall the results confirm the unsatisfactory...,"Dans l'ensemble, les résultats confirment le p..."
22520374,"Error 404 — file not found Sorry, but the file...",Erreur 404 — fichier introuvable Nous sommes d...
22520375,British Columbia Lodging and Campgrounds Assoc...,British Columbia Lodging and Campgrounds Assoc...


In [51]:
# clean data only if didn't load cleaned_data
if os.path.isfile('./data/cleaned_data.csv'):
    start = time.time()
    
    df_subset['en'] = df_subset['en'].apply(lambda x: clean_text(x))
    df_subset['fr'] = df_subset['fr'].apply(lambda x: clean_text(x))
    end = time.time()
    
    display(end - start)

    df_subset.to_csv('./data/cleaned_data.csv')

df_subset.head()

108.78032684326172

Unnamed: 0,en,fr
0,sos changing lives changing society how it wor...,sos il a transform notre vie il a transform la...
1,sos site map eos,sos plan du site eos
2,sos feedback eos,sos rtroaction eos
3,sos credits eos,sos crdits eos
4,sos franais eos,sos english eos


## Construct en and fr vocabulary

In [52]:
english_vocab = []
for sent in df_subset['en']:
    for token in sent.split():
        english_vocab.append(token)
set_english_vocab = set(english_vocab)
len(set_english_vocab)

144176

In [53]:
fr_vocab = []
for sent in df_subset['fr']:
    for token in sent.split():
        fr_vocab.append(token)
set_fr_vocab = set(fr_vocab)
len(set_fr_vocab)

168281

In [54]:
display(english_vocab[:10])
display(fr_vocab[:10])

['sos',
 'changing',
 'lives',
 'changing',
 'society',
 'how',
 'it',
 'works',
 'technology',
 'drives']

['sos', 'il', 'a', 'transform', 'notre', 'vie', 'il', 'a', 'transform', 'la']

In [55]:
sorted(list(set_english_vocab))

['a',
 'aa',
 'aaa',
 'aaarated',
 'aaas',
 'aabb',
 'aabbhchhch',
 'aabc',
 'aac',
 'aacc',
 'aaccdd',
 'aaccnet',
 'aachen',
 'aacr',
 'aacs',
 'aacti',
 'aactivity',
 'aacute',
 'aad',
 'aadac',
 'aadt',
 'aae',
 'aaf',
 'aafanb',
 'aafc',
 'aafcapril',
 'aafccanada',
 'aafcpromoting',
 'aafcreport',
 'aafcs',
 'aafcthe',
 'aafcwww',
 'aag',
 'aagreement',
 'aah',
 'aahar',
 'aaharindia',
 'aahrp',
 'aahuad',
 'aai',
 'aaib',
 'aairp',
 'aaj',
 'aal',
 'aalborg',
 'aalexx',
 'aalexxintl',
 'aalge',
 'aall',
 'aalogine',
 'aama',
 'aamjiwnaang',
 'aamp',
 'aamt',
 'aanderaa',
 'aandrews',
 'aapausa',
 'aapc',
 'aapcs',
 'aapex',
 'aapq',
 'aar',
 'aarab',
 'aardvark',
 'aarestrup',
 'aargau',
 'aargauer',
 'aarhus',
 'aarias',
 'aarm',
 'aarnout',
 'aarom',
 'aaron',
 'aarsther',
 'aarticles',
 'aas',
 'aasen',
 'aasland',
 'aastrup',
 'aat',
 'aata',
 'aatami',
 'aatelma',
 'aatlantic',
 'aatomic',
 'aau',
 'aaus',
 'aaw',
 'aaweb',
 'ab',
 'aba',
 'ababa',
 'abac',
 'abaca',
 'abac

In [56]:
input_words = sorted(list(set_english_vocab))
target_words = sorted(list(set_fr_vocab))


num_encoder_tokens = len(set_english_vocab)
num_decoder_tokens = len(set_fr_vocab)
num_decoder_tokens +=1
print(num_encoder_tokens, num_decoder_tokens)

144176 168282


In [57]:
# construct word to index mapping
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

# construct index to word mapping
reverse_input_index = dict([(i, word) for word, i in input_token_index.items()]) 
reverse_target_index = dict([(i, word) for word, i in target_token_index.items()]) 

In [59]:
en_len = [len(sent.split()) for sent in df_subset['en']]
fr_len = [len(sent.split()) for sent in df_subset['fr']]

en_len = np.array(en_len)
fr_len = np.array(fr_len)


In [60]:
display(np.percentile(en_len, 80))
display(np.percentile(fr_len, 80))

35.0

40.0

In [61]:
max_en_len = int(np.percentile(en_len, 80))
max_fr_len = int(np.percentile(fr_len, 80))

## Split data and construct model input & outputs

In [67]:
train_df = df_subset[:num_train]
test_df = df_subset[-num_test:]

X_train = train_df['en']
y_train = train_df['fr']

X_test = test_df['en']
y_test = test_df['fr']

X_train

0         sos changing lives changing society how it wor...
1                                          sos site map eos
2                                          sos feedback eos
3                                           sos credits eos
4                                           sos franais eos
                                ...                        
225198    sos download winter spring issue pdf kb in thi...
225199                         sos expert roundtable on eos
225200    sos making canada the destination of choice fo...
225201    sos national political infrastructure and fore...
225202                      sos the productivity volume eos
Name: en, Length: 225203, dtype: object

In [92]:
gc.collect()

3705

In [106]:
batch_size=len(X_train)
encoder_input_data = np.zeros((batch_size, max_en_len), dtype='float32')
decoder_input_data = np.zeros((batch_size, max_fr_len), dtype='float32')
#decoder_target_data = np.zeros((batch_size, max_fr_len, num_decoder_tokens), dtype='float16')
#decoder_target_data = np.zeros((batch_size, max_fr_len), dtype='float32')
decoder_target_data = scipy.sparse.csr_array(shape=(batch_size, max_fr_len, num_decoder_tokens), dtype='float32')

for i in range(batch_size):
    en = X_train[i]
    fr = y_train[i]
    
    for t, word in enumerate(en.split()):
        if t < max_en_len:
            encoder_input_data[i, t] = input_token_index[word]
            
    for t, word in enumerate(fr.split()):
        if ((t < (len(fr.split())-1)) and (t< max_fr_len) ): #we don't include the end token in decoder inputs
            decoder_input_data[i, t] = target_token_index[word]
        if ((t > 0 )and (t<max_fr_len)): # we don;t incluse start token in decoder outputs
            decoder_target_data[i, t-1, target_token_index[word]] = 1.
            #decoder_target_data[i, t-1] = target_token_index[word]
            
display(X_train[0])
display(input_token_index['changing'])
display(encoder_input_data[0])

display(y_train[0])
display(target_token_index['notre'])
display(decoder_input_data[0])

gc.collect()


TypeError: _cs_matrix.__init__() missing 1 required positional argument: 'arg1'

In [28]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, data, tok_en, tok_fr, subset = False, portion=0.01):
        """
        data: csv file that has english and french sentences
        subset: flag to signal if to only use a subset of the data to save processing time
        portion: portion of the data to use if subset is true
        toked_data: path to already tokenized data
        """
        
        pass
        
    def __len__(self):
        pass
        
                
    def __getitem__(self, idx):
        """
        return:
            en: tokenized list of english sentence
            fr: tokenized lise of french sentence
        """
        pass

class TextTestDataset(torch.utils.data.Dataset):
    def __init__(self, data, tok_en, tok_fr, subset = False, portion=0.10):
        """
        data: csv file that has english and french sentences
        subset: flag to signal if to only use a subset of the data to save processing time
        portion: portion of the data to use if subset is true
        """
        pass
        
        
    def __len__(self):
        pass
        
                
    def __getitem__(self, idx):
        """
        return:
            en: tokenized list of english sentence
            fr: tokenized lise of french sentence
        """
        pass
        

