In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import spacy
import torchtext
#from torchtext.legacy.data import Field, TabularDataset
from sklearn.model_selection import train_test_split

import sklearn
import numpy as np
import tqdm
import pandas as pd
import time

In [2]:
!python -m spacy download fr_core_news_sm
!python -m spacy download en_core_web_sm

Collecting fr-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.5.0/fr_core_news_sm-3.5.0-py3-none-any.whl (16.3 MB)
     ---------------------------------------- 0.0/16.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/16.3 MB ? eta -:--:--
     --------------------------------------- 0.1/16.3 MB 660.6 kB/s eta 0:00:25
      --------------------------------------- 0.3/16.3 MB 2.5 MB/s eta 0:00:07
     -- ------------------------------------- 1.0/16.3 MB 6.5 MB/s eta 0:00:03
     ---- ----------------------------------- 1.9/16.3 MB 9.4 MB/s eta 0:00:02
     ------- -------------------------------- 2.9/16.3 MB 11.7 MB/s eta 0:00:02
     --------- ------------------------------ 3.9/16.3 MB 13.0 MB/s eta 0:00:01
     ----------- ---------------------------- 4.6/16.3 MB 13.3 MB/s eta 0:00:01
     ------------- -------------------------- 5.3/16.3 MB 13.6 MB/s eta 0:00:01
     -------------- --------------------

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 682.7 kB/s eta 0:00:19
     --------------------------------------- 0.1/12.8 MB 787.7 kB/s eta 0:00:17
     - -------------------------------------- 0.6/12.8 MB 5.0 MB/s eta 0:00:03
     ----- ---------------------------------- 1.9/12.8 MB 11.0 MB/s eta 0:00:01
     ---------- ----------------------------- 3.5/12.8 MB 17.1 MB/s eta 0:00:01
     ------------------ --------------------- 5.9/12.8 MB 23.5 MB/s eta 0:00:01
     --------------------- ------------------ 7.0/12.8 MB 23.7 MB/s eta 0:00:01
     ------------------------- -------------- 8.1/12.8 MB 23.5 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 22.8 MB/s eta 0:00:01
     ----------------------------

## Dataset

In [6]:
start = time.time()
data = pd.read_csv('./data/en-fr.csv')

end = time.time()

display(end - start)
data.head()

451.4462134838104

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [3]:
# Load the spacy tokenizer model
nlp_fr = spacy.load('fr_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')

In [27]:
tokenizer_fr = lambda sentence: ['sos'] + [tok.text for tok in nlp_fr.tokenizer(str(sentence)) if tok.text != " "] + ['eos']
tokenizer_en = lambda sentence: ['sos'] + [tok.text for tok in nlp_en.tokenizer(str(sentence)) if tok.text != " "] + ['eos']

In [28]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, data, tok_en, tok_fr, subset = False, portion=0.01, toked_data = None):
        """
        data: csv file that has english and french sentences
        subset: flag to signal if to only use a subset of the data to save processing time
        portion: portion of the data to use if subset is true
        toked_data: path to already tokenized data
        """
        
        if toked_data is not None:
            # tokenized data already available
            
            self.en = toked_data['en'].tolist()
            self.fr = toked_data['fr'].tolist()
            
            return
        
        title = f"./data/train_tokenized_df"
        portion_str = ".csv"
        if subset:
            data = data.iloc[:int(len(data) * portion), ]
            portion_str = f"_{portion}.csv"
            
        data['en'] = data['en'].apply(tok_en)
        data['fr'] = data['fr'].apply(tok_fr)
        
        data.to_csv(title+portion_str)
        
        display(data.head())
        
        self.en = data['en'].tolist()
        self.fr = data['fr'].tolist()
        
    def __len__(self):
        return len(self.en)
        
                
    def __getitem__(self, idx):
        """
        return:
            en: tokenized list of english sentence
            fr: tokenized lise of french sentence
        """
        return self.en[idx], self.fr[idx]

class TextTestDataset(torch.utils.data.Dataset):
    def __init__(self, data, tok_en, tok_fr, subset = False, portion=0.10, toked_data=None):
        """
        data: csv file that has english and french sentences
        subset: flag to signal if to only use a subset of the data to save processing time
        portion: portion of the data to use if subset is true
        """
        if toked_data is not None:
            # tokenized data already available
            
            self.en = toked_data['en'].tolist()
            self.fr = toked_data['fr'].tolist()
            
            return
        
        title = f"./data/test_tokenized_df"
        portion_str = ".csv"
        if subset:
            data = data.iloc[-1*int(len(data) * portion):, ]
            portion_str = f"_{portion}.csv"
            
        data['en'] = data['en'].apply(tok_en)
        data['fr'] = data['fr'].apply(tok_fr)
        
        data.to_csv(title+portion_str)
        
        display(data.head())
        
        self.en = data['en'].tolist()
        self.fr = data['fr'].tolist()
        
    def __len__(self):
        return len(self.en)
        
                
    def __getitem__(self, idx):
        """
        return:
            en: tokenized list of english sentence
            fr: tokenized lise of french sentence
        """
        return self.en[idx], self.fr[idx]
        



In [35]:
# Read in and create torch dataset

train_tokened = pd.read_csv('./data/train_tokenized_df_0.001.csv')
test_tokened = pd.read_csv('./data/test_tokenized_df_0.001.csv')

start = time.time()

train = TextDataset(data, tokenizer_en, tokenizer_fr, subset=True, portion = 0.001, toked_data = train_tokened)
test = TextTestDataset(data, tokenizer_en, tokenizer_fr, subset=True, portion = 0.001, toked_data = None) 

end = time.time()

display(end - start)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['en'] = data['en'].apply(tok_en)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['fr'] = data['fr'].apply(tok_fr)


Unnamed: 0,en,fr
22497856,"[sos, Model, for, learners, in, primary, educa...","[sos, 66.2005, -, Irlande, -, Modèle, pour, ap..."
22497857,"[sos, Comhar, na, Múinteoirí, Gaeilge, This, o...","[sos, Comhar, na, Múinteoirí, Gaeilge, Cette, ..."
22497858,"[sos, Gaillimh, Le, Gaeilge, This, website, pr...","[sos, Gaillimh, Le, Gaeilge, Ce, site, Interne..."
22497859,"[sos, National, newspapers, (, education, sect...","[sos, Presse, nationale, (, section, Education..."
22497860,"[sos, The, outcome, of, this, exercise, is, th...","[sos, 2008, Soutenir, les, jeunes, en, Europe,..."


2.863935708999634

In [38]:
test[200]

(['sos',
  'Summary',
  'of',
  'Research',
  '/',
  'Monitoring',
  'Activities',
  'Peregrine',
  'Falcon',
  'population',
  'surveys',
  'were',
  'conducted',
  'every',
  'five',
  'years',
  'between',
  '1970',
  'and',
  '2000',
  'in',
  'most',
  'regions',
  'of',
  'Canada',
  '.',
  'eos'],
 ['sos',
  'RÃ',
  '©',
  'sumÃ',
  '©',
  'des',
  'activitÃ',
  '©',
  's',
  'de',
  'recherche',
  'et',
  'de',
  'surveillance',
  'Les',
  'relevÃ',
  '©',
  's',
  'des',
  'populations',
  'de',
  'Faucons',
  'pÃ¨lerins',
  'ont',
  'Ã',
  '©',
  'tÃ',
  '©',
  'menÃ',
  '©',
  's',
  'tous',
  'les',
  'cinq',
  'ans',
  'entre',
  '1970',
  'et',
  '2000',
  'dans',
  'la',
  'plupart',
  'des',
  'rÃ',
  '©',
  'gions',
  'du',
  'Canada',
  '.',
  'eos'])