## Libraries

In [2]:
# importing the libraries
import datasets
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.utils import shuffle
from transformers import XLMRobertaTokenizerFast
from nltk import sent_tokenize

## Dataset Loading from HuggingFace

In [5]:
# loading MARC dataset
dataset  = datasets.load_dataset('amazon_reviews_multi')

No config specified, defaulting to: amazon_reviews_multi/all_languages
Found cached dataset amazon_reviews_multi (/Users/msishuvo/.cache/huggingface/datasets/amazon_reviews_multi/all_languages/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 1200000
    })
    validation: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 30000
    })
})

In [7]:
# saving the data to CSV file
for split, dataset in dataset.items():
    dataset.to_csv(f"data/amazon-reviews-multi-{split}.csv", index=None)

Creating CSV from Arrow format:   0%|          | 0/1200 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

## Data Preprocessing

In [2]:
# reading the csv files
df_train = pd.read_csv("data/amazon-reviews-multi-train.csv")
df_test = pd.read_csv("data/amazon-reviews-multi-test.csv")
df_validation = pd.read_csv("data/amazon-reviews-multi-validation.csv")

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 8 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   review_id         1200000 non-null  object
 1   product_id        1200000 non-null  object
 2   reviewer_id       1200000 non-null  object
 3   stars             1200000 non-null  int64 
 4   review_body       1200000 non-null  object
 5   review_title      1199967 non-null  object
 6   language          1200000 non-null  object
 7   product_category  1200000 non-null  object
dtypes: int64(1), object(7)
memory usage: 73.2+ MB


In [4]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_id         30000 non-null  object
 1   product_id        30000 non-null  object
 2   reviewer_id       30000 non-null  object
 3   stars             30000 non-null  int64 
 4   review_body       30000 non-null  object
 5   review_title      30000 non-null  object
 6   language          30000 non-null  object
 7   product_category  30000 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.8+ MB


In [5]:
df_validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_id         30000 non-null  object
 1   product_id        30000 non-null  object
 2   reviewer_id       30000 non-null  object
 3   stars             30000 non-null  int64 
 4   review_body       30000 non-null  object
 5   review_title      29999 non-null  object
 6   language          30000 non-null  object
 7   product_category  30000 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.8+ MB


In [6]:
# filtering relevant columns
df_train_filtered = df_train[['review_body', 'language']]
df_test_filtered = df_test[['review_body', 'language']]
df_validation_filtered = df_validation[['review_body', 'language']]

In [7]:
print(df_train_filtered.language.unique())
print(df_test_filtered.language.unique())
print(df_validation_filtered.language.unique())

['de' 'en' 'es' 'fr' 'ja' 'zh']
['de' 'en' 'es' 'fr' 'ja' 'zh']
['de' 'en' 'es' 'fr' 'ja' 'zh']


In [8]:
# removing Chinese and Japanese languages
df_train_filtered = df_train_filtered[(df_train_filtered['language'] != 'ja') & (df_train_filtered['language'] != 'zh')]
df_test_filtered = df_test_filtered[(df_test_filtered['language'] != 'ja') & (df_test_filtered['language'] != 'zh')]
df_validation_filtered = df_validation_filtered[(df_validation_filtered['language'] != 'ja') & (df_validation_filtered['language'] != 'zh')]

In [9]:
print(df_train_filtered.language.unique())
print(df_test_filtered.language.unique())
print(df_validation_filtered.language.unique())

['de' 'en' 'es' 'fr']
['de' 'en' 'es' 'fr']
['de' 'en' 'es' 'fr']


In [10]:
df_train_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800000 entries, 0 to 799999
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   review_body  800000 non-null  object
 1   language     800000 non-null  object
dtypes: object(2)
memory usage: 18.3+ MB


In [11]:
df_test_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_body  20000 non-null  object
 1   language     20000 non-null  object
dtypes: object(2)
memory usage: 468.8+ KB


In [12]:
df_validation_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_body  20000 non-null  object
 1   language     20000 non-null  object
dtypes: object(2)
memory usage: 468.8+ KB


In [13]:
print(df_train_filtered.language.value_counts())

de    200000
en    200000
es    200000
fr    200000
Name: language, dtype: int64


In [14]:
print(df_test_filtered.language.value_counts())

de    5000
en    5000
es    5000
fr    5000
Name: language, dtype: int64


In [15]:
print(df_validation_filtered.language.value_counts())

de    5000
en    5000
es    5000
fr    5000
Name: language, dtype: int64


In [16]:
# encoding the labels
label_encoder = preprocessing.LabelEncoder()

df_train_filtered['language_label']= label_encoder.fit_transform(df_train_filtered['language'])
df_test_filtered['language_label']= label_encoder.fit_transform(df_test_filtered['language'])
df_validation_filtered['language_label']= label_encoder.fit_transform(df_validation_filtered['language'])

In [18]:
print(df_train_filtered.language_label.value_counts())

0    200000
1    200000
2    200000
3    200000
Name: language_label, dtype: int64


In [19]:
print(df_test_filtered.language_label.value_counts())

0    5000
1    5000
2    5000
3    5000
Name: language_label, dtype: int64


In [20]:
print(df_validation_filtered.language_label.value_counts())

0    5000
1    5000
2    5000
3    5000
Name: language_label, dtype: int64


In [21]:
df_train_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800000 entries, 0 to 799999
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   review_body     800000 non-null  object
 1   language        800000 non-null  object
 2   language_label  800000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 24.4+ MB


In [22]:
df_test_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   review_body     20000 non-null  object
 1   language        20000 non-null  object
 2   language_label  20000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 625.0+ KB


In [23]:
df_validation_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   review_body     20000 non-null  object
 1   language        20000 non-null  object
 2   language_label  20000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 625.0+ KB


In [107]:
# merging the rows randomly by taking 1 sample from each language
def merging_rows(df):
  percentage = 0
  factor = len(df.index)/100
  reviews = []
  labels = []
  # printing status of the operation
  for row in df.index:
    if row % factor == 0:
        print(percentage, "% complete." )
        percentage = percentage + 4
    i = 4
    if i > df.shape[0]:
      break
    else:
      df_temp = df.sample(frac = 1.0).groupby('language_label').head(1) 
      reviews.append(df_temp['review_body'].values)
      labels.append(df_temp['language_label'].values)
      # removing the rows so that there is no duplicate reviews
      df.drop(df_temp.index, inplace=True)
      df.reset_index(drop=True)
  return reviews, labels

In [117]:
df_train_temp = df_train_filtered.copy()
df_test_temp = df_test_filtered.copy()
df_validation_temp = df_validation_filtered.copy()

In [118]:
# creating training set
reviews_train, labels_train = merging_rows(df_train_temp)

0 % complete.
4 % complete.
8 % complete.
12 % complete.
16 % complete.
20 % complete.
24 % complete.
28 % complete.
32 % complete.
36 % complete.
40 % complete.
44 % complete.
48 % complete.
52 % complete.
56 % complete.
60 % complete.
64 % complete.
68 % complete.
72 % complete.
76 % complete.
80 % complete.
84 % complete.
88 % complete.
92 % complete.
96 % complete.
100 % complete.


In [119]:
# attaching labels length
labels_len_train = []
for i in labels_train:
  labels_len_train.append(len(i))

In [120]:
# creating the training dataframe
df_train_final = pd.DataFrame(list(zip(reviews_train, labels_train, labels_len_train)), 
                             columns =['reviews', 'labels', 'labels_len'])
df_train_final

Unnamed: 0,reviews,labels,labels_len
0,[produit defectueux au niveau de la couleur ro...,"[3, 0, 1, 2]",4
1,[Perfekt für trockene Augen Kontaktlinsen Träg...,"[0, 3, 2, 1]",4
2,[Calidad muy baja. La nuestra vino incluso con...,"[2, 0, 1, 3]",4
3,[Contains many of the old songs that are now h...,"[1, 3, 0, 2]",4
4,[It like it but it is a tad small for what I w...,"[1, 2, 0, 3]",4
...,...,...,...
199995,[Diese Tabs sind nicht zu empfehlen. Das Gesch...,"[0, 3, 1, 2]",4
199996,[Problemas con el envio.... Tuve que pagar bas...,"[2, 0, 1, 3]",4
199997,"[Ansicht ist das Alu Profil okay, wenn nur all...","[0, 3, 1, 2]",4
199998,"[An sich gut verarbeitet, die Gummiauflage obe...","[0, 1, 2, 3]",4


In [121]:
df_train_final['labels_len'].value_counts()

4    200000
Name: labels_len, dtype: int64

In [122]:
# svaing the dataframe to h5 format
df_train_final.to_hdf('data/marc_multilingual-merged_train_final.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['reviews', 'labels'], dtype='object')]

  df_train_final.to_hdf('data/marc_multilingual-merged_train_final.h5', key='df', mode='w')


In [123]:
# creating test set
reviews_test, labels_test = merging_rows(df_test_temp)

0 % complete.
4 % complete.
8 % complete.
12 % complete.
16 % complete.
20 % complete.
24 % complete.
28 % complete.
32 % complete.
36 % complete.
40 % complete.
44 % complete.
48 % complete.
52 % complete.
56 % complete.
60 % complete.
64 % complete.
68 % complete.
72 % complete.
76 % complete.
80 % complete.
84 % complete.
88 % complete.
92 % complete.
96 % complete.
100 % complete.


In [124]:
# attaching labels length
labels_len_test = []
for i in labels_test:
  labels_len_test.append(len(i))

In [125]:
# creating the test dataframe
df_test_final = pd.DataFrame(list(zip(reviews_test, labels_test, labels_len_test)), 
                             columns =['reviews', 'labels', 'labels_len'])
df_test_final

Unnamed: 0,reviews,labels,labels_len
0,[Utilisation ok Pratique et simple d'installat...,"[3, 1, 2, 0]",4
1,[It keeps things cold very well. My issue is w...,"[1, 0, 2, 3]",4
2,[Die Höschen wurden zügig geliefert. Ich hatte...,"[0, 1, 3, 2]",4
3,[Un año después la batería está para reemplaza...,"[2, 0, 3, 1]",4
4,[Die Enkelinnen waren begeistert und sind es n...,"[0, 2, 3, 1]",4
...,...,...,...
4995,"[Deno lleva toma de corriente estandard, lleva...","[2, 3, 0, 1]",4
4996,[I got this for Yeti mug as reminder of how mu...,"[1, 2, 0, 3]",4
4997,[Zuviel hatte ich nicht erwartet allerdings is...,"[0, 3, 1, 2]",4
4998,[Tarda como unas 15 horas en cargarse. Y si la...,"[2, 1, 0, 3]",4


In [126]:
df_test_final['labels_len'].value_counts()

4    5000
Name: labels_len, dtype: int64

In [127]:
# saving the dataframe to h5 format
df_test_final.to_hdf('data/marc_multilingual-merged_test_final.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['reviews', 'labels'], dtype='object')]

  df_test_final.to_hdf('data/marc_multilingual-merged_test_final.h5', key='df', mode='w')


In [128]:
# creating validation set
reviews_validation, labels_validation = merging_rows(df_validation_temp)

0 % complete.
4 % complete.
8 % complete.
12 % complete.
16 % complete.
20 % complete.
24 % complete.
28 % complete.
32 % complete.
36 % complete.
40 % complete.
44 % complete.
48 % complete.
52 % complete.
56 % complete.
60 % complete.
64 % complete.
68 % complete.
72 % complete.
76 % complete.
80 % complete.
84 % complete.
88 % complete.
92 % complete.
96 % complete.
100 % complete.


In [129]:
# attaching labels length
labels_len_validation = []
for i in labels_validation:
  labels_len_validation.append(len(i))

In [130]:
# creating the validation dataframe
df_validation_final = pd.DataFrame(list(zip(reviews_validation, labels_validation, labels_len_validation)), 
                             columns =['reviews', 'labels', 'labels_len'])
df_validation_final

Unnamed: 0,reviews,labels,labels_len
0,[Kam offenbar schon benutzt bei mir an. Verpac...,"[0, 2, 1, 3]",4
1,[El cambio del hilo por un material de plástic...,"[2, 1, 3, 0]",4
2,"[Funciona bien, pero es un firewall muy especi...","[2, 1, 0, 3]",4
3,[Matifie bien la peau mais pas assez hydratant...,"[3, 2, 0, 1]",4
4,"[Pinta mejor en las fotos que en la realidad, ...","[2, 1, 3, 0]",4
...,...,...,...
4995,[J’en pensais que ce manteau aurait quand même...,"[3, 2, 0, 1]",4
4996,[Great jacket you can use it for casual and ri...,"[1, 3, 0, 2]",4
4997,"[Le bas est parfait, par contre le haut pas te...","[3, 1, 2, 0]",4
4998,"[komplizierter scheiss, Son pequeñitas, tamaño...","[0, 2, 3, 1]",4


In [131]:
df_validation_final['labels_len'].value_counts()

4    5000
Name: labels_len, dtype: int64

In [132]:
# saving the dataframe to h5 format
df_validation_final.to_hdf('data/marc_multilingual-merged_validation_final.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['reviews', 'labels'], dtype='object')]

  df_validation_final.to_hdf('data/marc_multilingual-merged_validation_final.h5', key='df', mode='w')


## Loading Preprocessed Data

In [14]:
# loading preprocessed data
df_train_processed = pd.read_hdf('data/marc_multilingual-merged_train_final.h5', 'df')
df_test_processed = pd.read_hdf('data/marc_multilingual-merged_test_final.h5', 'df')
df_validation_processed = pd.read_hdf('data/marc_multilingual-merged_validation_final.h5', 'df')

In [15]:
print(df_train_processed.shape)
print(df_test_processed.shape)
print(df_validation_processed.shape)

(200000, 3)
(5000, 3)
(5000, 3)


In [16]:
df_train_processed

Unnamed: 0,reviews,labels,labels_len
0,[produit defectueux au niveau de la couleur ro...,"[3, 0, 1, 2]",4
1,[Perfekt für trockene Augen Kontaktlinsen Träg...,"[0, 3, 2, 1]",4
2,[Calidad muy baja. La nuestra vino incluso con...,"[2, 0, 1, 3]",4
3,[Contains many of the old songs that are now h...,"[1, 3, 0, 2]",4
4,[It like it but it is a tad small for what I w...,"[1, 2, 0, 3]",4
...,...,...,...
199995,[Diese Tabs sind nicht zu empfehlen. Das Gesch...,"[0, 3, 1, 2]",4
199996,[Problemas con el envio.... Tuve que pagar bas...,"[2, 0, 1, 3]",4
199997,"[Ansicht ist das Alu Profil okay, wenn nur all...","[0, 3, 1, 2]",4
199998,"[An sich gut verarbeitet, die Gummiauflage obe...","[0, 1, 2, 3]",4


In [17]:
# creating the final training set with 60,000 rows
df_train_processed = df_train_processed.head(60000)
df_train_processed

Unnamed: 0,reviews,labels,labels_len
0,[produit defectueux au niveau de la couleur ro...,"[3, 0, 1, 2]",4
1,[Perfekt für trockene Augen Kontaktlinsen Träg...,"[0, 3, 2, 1]",4
2,[Calidad muy baja. La nuestra vino incluso con...,"[2, 0, 1, 3]",4
3,[Contains many of the old songs that are now h...,"[1, 3, 0, 2]",4
4,[It like it but it is a tad small for what I w...,"[1, 2, 0, 3]",4
...,...,...,...
59995,"[Como no veo dónde tengo que reclamar, lo hago...","[2, 3, 0, 1]",4
59996,[Stabile und grosse Geldbox. Allerdings ist da...,"[0, 2, 1, 3]",4
59997,[habe den Artikel bis jetzt nicht erhalten! Mi...,"[0, 1, 2, 3]",4
59998,"[I’ve been using S100 since about 2004, and I’...","[1, 3, 0, 2]",4


In [18]:
df_train_processed['labels_len'].value_counts()

4    60000
Name: labels_len, dtype: int64

In [19]:
df_test_processed

Unnamed: 0,reviews,labels,labels_len
0,[Utilisation ok Pratique et simple d'installat...,"[3, 1, 2, 0]",4
1,[It keeps things cold very well. My issue is w...,"[1, 0, 2, 3]",4
2,[Die Höschen wurden zügig geliefert. Ich hatte...,"[0, 1, 3, 2]",4
3,[Un año después la batería está para reemplaza...,"[2, 0, 3, 1]",4
4,[Die Enkelinnen waren begeistert und sind es n...,"[0, 2, 3, 1]",4
...,...,...,...
4995,"[Deno lleva toma de corriente estandard, lleva...","[2, 3, 0, 1]",4
4996,[I got this for Yeti mug as reminder of how mu...,"[1, 2, 0, 3]",4
4997,[Zuviel hatte ich nicht erwartet allerdings is...,"[0, 3, 1, 2]",4
4998,[Tarda como unas 15 horas en cargarse. Y si la...,"[2, 1, 0, 3]",4


In [20]:
df_validation_processed

Unnamed: 0,reviews,labels,labels_len
0,[Kam offenbar schon benutzt bei mir an. Verpac...,"[0, 2, 1, 3]",4
1,[El cambio del hilo por un material de plástic...,"[2, 1, 3, 0]",4
2,"[Funciona bien, pero es un firewall muy especi...","[2, 1, 0, 3]",4
3,[Matifie bien la peau mais pas assez hydratant...,"[3, 2, 0, 1]",4
4,"[Pinta mejor en las fotos que en la realidad, ...","[2, 1, 3, 0]",4
...,...,...,...
4995,[J’en pensais que ce manteau aurait quand même...,"[3, 2, 0, 1]",4
4996,[Great jacket you can use it for casual and ri...,"[1, 3, 0, 2]",4
4997,"[Le bas est parfait, par contre le haut pas te...","[3, 1, 2, 0]",4
4998,"[komplizierter scheiss, Son pequeñitas, tamaño...","[0, 2, 3, 1]",4


### Sentence Tokenizing and Labeling

In [21]:
# tokenizing the text into sentences and finding the number of sentences
def senntence_tokenizing(df):
    review_sent_list = []
    sent_len_list = []
    for row in df.index:
        review_sent = []
        sentences_len = []
        for i in df['reviews'][row]:
            sentences = sent_tokenize(i)
            sent_len = len(sentences)
            review_sent.append(sentences)
            sentences_len.append(sent_len)
        review_sent_merged = sum(review_sent, [])
        review_sent_list.append(review_sent_merged)
        sent_len_list.append(sentences_len)
    return review_sent_list, sent_len_list

In [22]:
# performing sentence tokenization for each set
review_sent_train, sent_len_train = senntence_tokenizing(df_train_processed)
review_sent_test, sent_len_test = senntence_tokenizing(df_test_processed)
review_sent_validation, sent_len_validation = senntence_tokenizing(df_validation_processed)

In [23]:
# creating empty dataframe
df_train_tokenized = pd.DataFrame()
df_test_tokenized = pd.DataFrame()
df_validation_tokenized = pd.DataFrame()

In [24]:
# storing the reviews to the dataframe
df_train_tokenized['reviews'] = review_sent_train
df_test_tokenized['reviews'] = review_sent_test
df_validation_tokenized['reviews'] = review_sent_validation

In [25]:
# creating labels for each sentence in the training set
sent_label_train = []
sent_error_train = []
for i in df_train_tokenized.index:
    try:
        sent_label_train.append(np.repeat(df_train_processed['labels'][i], sent_len_train[i]))
    except:
        sent_error_train.append(i)
    finally:
        continue

In [26]:
len(sent_error_train)

0

In [27]:
# creating labels for each sentence in the test set
sent_label_test = []
sent_error_test = []
for i in df_test_tokenized.index:
    try:
        sent_label_test.append(np.repeat(df_test_processed['labels'][i], sent_len_test[i]))
    except:
        sent_error_test.append(i)
    finally:
        continue

In [28]:
len(sent_error_test)

0

In [29]:
# creating labels for each sentence in the validation set
sent_label_validation = []
sent_error_validation = []
for i in df_validation_tokenized.index:
    try:
        sent_label_validation.append(np.repeat(df_validation_processed['labels'][i], sent_len_validation[i]))
    except:
        sent_error_validation.append(i)
    finally:
        continue

In [30]:
len(sent_error_validation)

0

In [31]:
# storing the labels to the dataframe
df_train_tokenized['labels'] = sent_label_train
df_test_tokenized['labels'] = sent_label_test
df_validation_tokenized['labels'] = sent_label_validation

In [32]:
df_train_tokenized

Unnamed: 0,reviews,labels
0,[produit defectueux au niveau de la couleur ro...,"[3, 3, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,[Perfekt für trockene Augen Kontaktlinsen Träg...,"[0, 3, 3, 2, 2, 1]"
2,"[Calidad muy baja., La nuestra vino incluso co...","[2, 2, 2, 2, 0, 0, 0, 0, 1, 3, 3, 3, 3, 3]"
3,[Contains many of the old songs that are now h...,"[1, 1, 3, 0, 2]"
4,[It like it but it is a tad small for what I w...,"[1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 3, 3, 3]"
...,...,...
59995,"[Como no veo dónde tengo que reclamar, lo hago...","[2, 2, 2, 3, 3, 3, 0, 0, 0, 0, 1, 1]"
59996,"[Stabile und grosse Geldbox., Allerdings ist d...","[0, 0, 0, 2, 1, 1, 1, 1, 1, 3]"
59997,"[habe den Artikel bis jetzt nicht erhalten!, M...","[0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3]"
59998,"[I’ve been using S100 since about 2004, and I’...","[1, 1, 1, 1, 3, 3, 3, 0, 2, 2, 2, 2, 2, 2]"


In [33]:
df_test_tokenized

Unnamed: 0,reviews,labels
0,[Utilisation ok Pratique et simple d'installat...,"[3, 3, 1, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0]"
1,"[It keeps things cold very well., My issue is ...","[1, 1, 1, 1, 1, 0, 2, 2, 3, 3]"
2,"[Die Höschen wurden zügig geliefert., Ich hatt...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 2]"
3,[Un año después la batería está para reemplaza...,"[2, 2, 0, 3, 1]"
4,[Die Enkelinnen waren begeistert und sind es n...,"[0, 0, 0, 2, 3, 3, 1, 1, 1, 1, 1]"
...,...,...
4995,"[Deno lleva toma de corriente estandard, lleva...","[2, 2, 3, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]"
4996,[I got this for Yeti mug as reminder of how mu...,"[1, 1, 1, 1, 2, 0, 0, 3]"
4997,[Zuviel hatte ich nicht erwartet allerdings is...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1, 2]"
4998,"[Tarda como unas 15 horas en cargarse., Y si l...","[2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 3, 3, 3]"


In [34]:
df_validation_tokenized

Unnamed: 0,reviews,labels
0,"[Kam offenbar schon benutzt bei mir an., Verpa...","[0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 3, 3]"
1,[El cambio del hilo por un material de plástic...,"[2, 1, 1, 3, 3, 0, 0]"
2,"[Funciona bien, pero es un firewall muy especi...","[2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 0, 3, 3, 3]"
3,[Matifie bien la peau mais pas assez hydratant...,"[3, 3, 2, 0, 1, 1, 1, 1, 1]"
4,"[Pinta mejor en las fotos que en la realidad, ...","[2, 1, 1, 1, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
4995,[J’en pensais que ce manteau aurait quand même...,"[3, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]"
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 1, 3, 0, 2, 2]"
4997,"[Le bas est parfait, par contre le haut pas te...","[3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
4998,"[komplizierter scheiss, Son pequeñitas, tamaño...","[0, 2, 3, 3, 3, 3, 1, 1, 1, 1]"


In [35]:
# shuffling the sentences and their labels randomly of each row for the training set
train_review_shuffled_list = []
train_labels_shuffled_list = []

for row in df_train_tokenized.index:
    review = df_train_tokenized['reviews'][row]
    labels = df_train_tokenized['labels'][row]
    review_shuffled, labels_shuffled = shuffle(np.array(review), np.array(labels))
    train_review_shuffled_list.append(list(review_shuffled))
    train_labels_shuffled_list.append(list(labels_shuffled))

In [36]:
# creating a dataframe and store the shuffled training data
df_train_tokenized_shuffled = pd.DataFrame()
df_train_tokenized_shuffled["reviews"] = train_review_shuffled_list
df_train_tokenized_shuffled["labels"] = train_labels_shuffled_list
df_train_tokenized_shuffled

Unnamed: 0,reviews,labels
0,"[Carga algo lento., I do like the range of col...","[2, 1, 1, 1, 1, 0, 1, 3, 1, 2, 1, 1, 1, 3, 1, ..."
1,[Los imanes son potentes pero el pegamento que...,"[2, 0, 3, 3, 1, 2]"
2,"[déçue de mon achat, je m'attendais à mieux !!...","[3, 0, 0, 2, 2, 3, 2, 2, 3, 0, 3, 0, 3, 1]"
3,[Je suis très déçue car je n’ai pas reçu la bo...,"[3, 0, 1, 1, 2]"
4,[El diseño de La foto es más llamativo que las...,"[2, 2, 0, 1, 0, 0, 1, 3, 2, 0, 3, 3, 2]"
...,...,...
59995,"[Le tactile fonctionne bien., Die beschriebene...","[3, 0, 2, 1, 3, 2, 1, 2, 0, 0, 3, 0]"
59996,"[Very disappointed with this purchase., Had to...","[1, 1, 1, 0, 0, 0, 1, 2, 3, 1]"
59997,[When the battery arrived put in laptop and it...,"[1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 0, 2, 2, 0]"
59998,"[Al principio no conectaba con los textos., I’...","[2, 1, 1, 3, 3, 0, 2, 2, 2, 2, 1, 1, 2, 3]"


In [37]:
# shuffling the sentences and their labels randomly of each row for the test set
test_review_shuffled_list = []
test_labels_shuffled_list = []

for row in df_validation_tokenized.index:
    review = df_validation_tokenized['reviews'][row]
    labels = df_validation_tokenized['labels'][row]
    review_shuffled, labels_shuffled = shuffle(np.array(review), np.array(labels))
    test_review_shuffled_list.append(list(review_shuffled))
    test_labels_shuffled_list.append(list(labels_shuffled))

In [38]:
# creating a dataframe and store the shuffled test data
df_test_tokenized_shuffled = pd.DataFrame()
df_test_tokenized_shuffled["reviews"] = test_review_shuffled_list
df_test_tokenized_shuffled["labels"] = test_labels_shuffled_list
df_test_tokenized_shuffled

Unnamed: 0,reviews,labels
0,"[Hab sie zurück geschickt, würde ich nicht noc...","[0, 2, 0, 3, 1, 0, 3, 1, 1, 2, 1, 1]"
1,[El cambio del hilo por un material de plástic...,"[2, 3, 1, 1, 0, 3, 0]"
2,"[Sie ist von der Größe auch genau so, wie ich ...","[0, 0, 2, 2, 3, 2, 0, 1, 1, 3, 0, 0, 3, 2]"
3,[du coup j utilise cette crème que sur ma zone...,"[3, 1, 1, 2, 1, 0, 1, 3, 1]"
4,"[Toutefois, le bouchon métallique noir était l...","[3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 0, 3, 0, 0, 0, ..."
...,...,...
4995,"[As it got older, it doesn't heat as well., Id...","[1, 0, 0, 3, 2, 0, 0, 1, 1, 1, 2, 2, 1, 0]"
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 3, 0, 2, 1, 2]"
4997,[Glaspartikel oder ähnliches unter der Folie b...,"[0, 0, 0, 2, 0, 3, 0, 0, 0, 1, 0]"
4998,[We have had this license plate for almost 2 y...,"[1, 3, 1, 3, 1, 0, 1, 3, 3, 2]"


In [39]:
# shuffling the sentences and their labels randomly of each row for the validation set
validation_review_shuffled_list = []
validation_labels_shuffled_list = []

for row in df_validation_tokenized.index:
    review = df_validation_tokenized['reviews'][row]
    labels = df_validation_tokenized['labels'][row]
    review_shuffled, labels_shuffled = shuffle(np.array(review), np.array(labels))
    validation_review_shuffled_list.append(list(review_shuffled))
    validation_labels_shuffled_list.append(list(labels_shuffled))

In [40]:
# creating a dataframe and store the shuffled validation data
df_validation_tokenized_shuffled = pd.DataFrame()
df_validation_tokenized_shuffled["reviews"] = validation_review_shuffled_list
df_validation_tokenized_shuffled["labels"] = validation_labels_shuffled_list
df_validation_tokenized_shuffled

Unnamed: 0,reviews,labels
0,"[The keys feel so fresh, the colors are crisp....","[1, 0, 1, 0, 3, 1, 2, 1, 0, 1, 2, 3]"
1,[Idéal pour nettoyer car c'est doux mais n'a p...,"[3, 3, 2, 0, 0, 1, 1]"
2,[Man muss dann schon ganz schön lange fummeln ...,"[0, 3, 3, 1, 0, 1, 0, 0, 2, 3, 2, 2, 0, 2]"
3,[They were so lightweight I could not get them...,"[1, 0, 1, 1, 3, 1, 1, 2, 3]"
4,"[Leicht zu reinigen, einfach alle Teile in die...","[0, 1, 0, 0, 1, 3, 0, 2, 3, 0, 0, 0, 0, 0, 1, ..."
...,...,...
4995,[Muy buena para preparar cualquier tipo de com...,"[2, 0, 2, 1, 0, 1, 0, 1, 3, 0, 2, 1, 1, 0]"
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 2, 1, 2, 3, 0]"
4997,"[Erstes Glas im Eimer gelandet., Ich habe dies...","[0, 0, 0, 2, 3, 0, 0, 1, 0, 0, 0]"
4998,"[I would purchase this again., We have had thi...","[1, 1, 2, 3, 1, 3, 1, 0, 3, 3]"


### Mapping Languages to the Sentences

In [41]:
langs = ['de', 'en', 'es', 'fr']

In [51]:
# finding the language name according to the labels for each sentence in the training set
lang_list_train = []
for label in train_labels_shuffled_list:
  lang_list_train.append([langs[i] for i in label])

In [52]:
len(lang_list_train)

60000

In [53]:
# finding the language name according to the labels for each sentence in the test set
lang_list_test = []
for label in test_labels_shuffled_list:
  lang_list_test.append([langs[i] for i in label])

In [54]:
len(lang_list_test)

5000

In [55]:
# finding the language name according to the labels for each sentence in the validation set
lang_list_validation = []
for label in validation_labels_shuffled_list:
  lang_list_validation.append([langs[i] for i in label])

In [56]:
len(lang_list_validation)

5000

In [57]:
# attaching language names to the dataframes
df_train_tokenized_shuffled['languages'] = lang_list_train
df_test_tokenized_shuffled['languages'] = lang_list_test
df_validation_tokenized_shuffled['languages'] = lang_list_validation

In [58]:
df_train_tokenized_shuffled

Unnamed: 0,reviews,labels,languages
0,"[Carga algo lento., I do like the range of col...","[2, 1, 1, 1, 1, 0, 1, 3, 1, 2, 1, 1, 1, 3, 1, ...","[es, en, en, en, en, de, en, fr, en, es, en, e..."
1,[Los imanes son potentes pero el pegamento que...,"[2, 0, 3, 3, 1, 2]","[es, de, fr, fr, en, es]"
2,"[déçue de mon achat, je m'attendais à mieux !!...","[3, 0, 0, 2, 2, 3, 2, 2, 3, 0, 3, 0, 3, 1]","[fr, de, de, es, es, fr, es, es, fr, de, fr, d..."
3,[Je suis très déçue car je n’ai pas reçu la bo...,"[3, 0, 1, 1, 2]","[fr, de, en, en, es]"
4,[El diseño de La foto es más llamativo que las...,"[2, 2, 0, 1, 0, 0, 1, 3, 2, 0, 3, 3, 2]","[es, es, de, en, de, de, en, fr, es, de, fr, f..."
...,...,...,...
59995,"[Le tactile fonctionne bien., Die beschriebene...","[3, 0, 2, 1, 3, 2, 1, 2, 0, 0, 3, 0]","[fr, de, es, en, fr, es, en, es, de, de, fr, de]"
59996,"[Very disappointed with this purchase., Had to...","[1, 1, 1, 0, 0, 0, 1, 2, 3, 1]","[en, en, en, de, de, de, en, es, fr, en]"
59997,[When the battery arrived put in laptop and it...,"[1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 0, 2, 2, 0]","[en, es, en, es, en, fr, es, en, es, es, de, e..."
59998,"[Al principio no conectaba con los textos., I’...","[2, 1, 1, 3, 3, 0, 2, 2, 2, 2, 1, 1, 2, 3]","[es, en, en, fr, fr, de, es, es, es, es, en, e..."


In [59]:
df_test_tokenized_shuffled

Unnamed: 0,reviews,labels,languages
0,"[Hab sie zurück geschickt, würde ich nicht noc...","[0, 2, 0, 3, 1, 0, 3, 1, 1, 2, 1, 1]","[de, es, de, fr, en, de, fr, en, en, es, en, en]"
1,[El cambio del hilo por un material de plástic...,"[2, 3, 1, 1, 0, 3, 0]","[es, fr, en, en, de, fr, de]"
2,"[Sie ist von der Größe auch genau so, wie ich ...","[0, 0, 2, 2, 3, 2, 0, 1, 1, 3, 0, 0, 3, 2]","[de, de, es, es, fr, es, de, en, en, fr, de, d..."
3,[du coup j utilise cette crème que sur ma zone...,"[3, 1, 1, 2, 1, 0, 1, 3, 1]","[fr, en, en, es, en, de, en, fr, en]"
4,"[Toutefois, le bouchon métallique noir était l...","[3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 0, 3, 0, 0, 0, ...","[fr, fr, fr, de, de, de, de, de, de, es, de, f..."
...,...,...,...
4995,"[As it got older, it doesn't heat as well., Id...","[1, 0, 0, 3, 2, 0, 0, 1, 1, 1, 2, 2, 1, 0]","[en, de, de, fr, es, de, de, en, en, en, es, e..."
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 3, 0, 2, 1, 2]","[en, en, fr, de, es, en, es]"
4997,[Glaspartikel oder ähnliches unter der Folie b...,"[0, 0, 0, 2, 0, 3, 0, 0, 0, 1, 0]","[de, de, de, es, de, fr, de, de, de, en, de]"
4998,[We have had this license plate for almost 2 y...,"[1, 3, 1, 3, 1, 0, 1, 3, 3, 2]","[en, fr, en, fr, en, de, en, fr, fr, es]"


In [60]:
df_validation_tokenized_shuffled

Unnamed: 0,reviews,labels,languages
0,"[The keys feel so fresh, the colors are crisp....","[1, 0, 1, 0, 3, 1, 2, 1, 0, 1, 2, 3]","[en, de, en, de, fr, en, es, en, de, en, es, fr]"
1,[Idéal pour nettoyer car c'est doux mais n'a p...,"[3, 3, 2, 0, 0, 1, 1]","[fr, fr, es, de, de, en, en]"
2,[Man muss dann schon ganz schön lange fummeln ...,"[0, 3, 3, 1, 0, 1, 0, 0, 2, 3, 2, 2, 0, 2]","[de, fr, fr, en, de, en, de, de, es, fr, es, e..."
3,[They were so lightweight I could not get them...,"[1, 0, 1, 1, 3, 1, 1, 2, 3]","[en, de, en, en, fr, en, en, es, fr]"
4,"[Leicht zu reinigen, einfach alle Teile in die...","[0, 1, 0, 0, 1, 3, 0, 2, 3, 0, 0, 0, 0, 0, 1, ...","[de, en, de, de, en, fr, de, es, fr, de, de, d..."
...,...,...,...
4995,[Muy buena para preparar cualquier tipo de com...,"[2, 0, 2, 1, 0, 1, 0, 1, 3, 0, 2, 1, 1, 0]","[es, de, es, en, de, en, de, en, fr, de, es, e..."
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 2, 1, 2, 3, 0]","[en, en, es, en, es, fr, de]"
4997,"[Erstes Glas im Eimer gelandet., Ich habe dies...","[0, 0, 0, 2, 3, 0, 0, 1, 0, 0, 0]","[de, de, de, es, fr, de, de, en, de, de, de]"
4998,"[I would purchase this again., We have had thi...","[1, 1, 2, 3, 1, 3, 1, 0, 3, 3]","[en, en, es, fr, en, fr, en, de, fr, fr]"


### Word Tokenizing and Labeling

In [61]:
# word tokenizing and finding the number of words
def tokenswithlength(df):
  review_tokens_list = []
  token_len_list = []
  for row in df.index:
    review_token = []
    tokens_len = []
    for i in df['reviews'][row]:
      tokens = i.split()
      token_len = len(tokens)
      review_token.append(tokens)
      tokens_len.append(token_len)
    review_token_merged = sum(review_token, [])
    review_tokens_list.append(review_token_merged)
    token_len_list.append(tokens_len)
  return review_tokens_list, token_len_list

In [62]:
# performing the word tokenization
review_token_train, token_len_train = tokenswithlength(df_train_tokenized_shuffled)
review_token_test, token_len_test = tokenswithlength(df_test_tokenized_shuffled)
review_token_validation, token_len_validation = tokenswithlength(df_validation_tokenized_shuffled)

In [63]:
# storing the tokens for each set
df_train_tokenized_shuffled['review_tokens'] = review_token_train
df_test_tokenized_shuffled['review_tokens'] = review_token_test
df_validation_tokenized_shuffled['review_tokens'] = review_token_validation

In [64]:
df_train_tokenized_shuffled

Unnamed: 0,reviews,labels,languages,review_tokens
0,"[Carga algo lento., I do like the range of col...","[2, 1, 1, 1, 1, 0, 1, 3, 1, 2, 1, 1, 1, 3, 1, ...","[es, en, en, en, en, de, en, fr, en, es, en, e...","[Carga, algo, lento., I, do, like, the, range,..."
1,[Los imanes son potentes pero el pegamento que...,"[2, 0, 3, 3, 1, 2]","[es, de, fr, fr, en, es]","[Los, imanes, son, potentes, pero, el, pegamen..."
2,"[déçue de mon achat, je m'attendais à mieux !!...","[3, 0, 0, 2, 2, 3, 2, 2, 3, 0, 3, 0, 3, 1]","[fr, de, de, es, es, fr, es, es, fr, de, fr, d...","[déçue, de, mon, achat,, je, m'attendais, à, m..."
3,[Je suis très déçue car je n’ai pas reçu la bo...,"[3, 0, 1, 1, 2]","[fr, de, en, en, es]","[Je, suis, très, déçue, car, je, n’ai, pas, re..."
4,[El diseño de La foto es más llamativo que las...,"[2, 2, 0, 1, 0, 0, 1, 3, 2, 0, 3, 3, 2]","[es, es, de, en, de, de, en, fr, es, de, fr, f...","[El, diseño, de, La, foto, es, más, llamativo,..."
...,...,...,...,...
59995,"[Le tactile fonctionne bien., Die beschriebene...","[3, 0, 2, 1, 3, 2, 1, 2, 0, 0, 3, 0]","[fr, de, es, en, fr, es, en, es, de, de, fr, de]","[Le, tactile, fonctionne, bien., Die, beschrie..."
59996,"[Very disappointed with this purchase., Had to...","[1, 1, 1, 0, 0, 0, 1, 2, 3, 1]","[en, en, en, de, de, de, en, es, fr, en]","[Very, disappointed, with, this, purchase., Ha..."
59997,[When the battery arrived put in laptop and it...,"[1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 0, 2, 2, 0]","[en, es, en, es, en, fr, es, en, es, es, de, e...","[When, the, battery, arrived, put, in, laptop,..."
59998,"[Al principio no conectaba con los textos., I’...","[2, 1, 1, 3, 3, 0, 2, 2, 2, 2, 1, 1, 2, 3]","[es, en, en, fr, fr, de, es, es, es, es, en, e...","[Al, principio, no, conectaba, con, los, texto..."


In [65]:
df_test_tokenized_shuffled

Unnamed: 0,reviews,labels,languages,review_tokens
0,"[Hab sie zurück geschickt, würde ich nicht noc...","[0, 2, 0, 3, 1, 0, 3, 1, 1, 2, 1, 1]","[de, es, de, fr, en, de, fr, en, en, es, en, en]","[Hab, sie, zurück, geschickt,, würde, ich, nic..."
1,[El cambio del hilo por un material de plástic...,"[2, 3, 1, 1, 0, 3, 0]","[es, fr, en, en, de, fr, de]","[El, cambio, del, hilo, por, un, material, de,..."
2,"[Sie ist von der Größe auch genau so, wie ich ...","[0, 0, 2, 2, 3, 2, 0, 1, 1, 3, 0, 0, 3, 2]","[de, de, es, es, fr, es, de, en, en, fr, de, d...","[Sie, ist, von, der, Größe, auch, genau, so,, ..."
3,[du coup j utilise cette crème que sur ma zone...,"[3, 1, 1, 2, 1, 0, 1, 3, 1]","[fr, en, en, es, en, de, en, fr, en]","[du, coup, j, utilise, cette, crème, que, sur,..."
4,"[Toutefois, le bouchon métallique noir était l...","[3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 0, 3, 0, 0, 0, ...","[fr, fr, fr, de, de, de, de, de, de, es, de, f...","[Toutefois,, le, bouchon, métallique, noir, ét..."
...,...,...,...,...
4995,"[As it got older, it doesn't heat as well., Id...","[1, 0, 0, 3, 2, 0, 0, 1, 1, 1, 2, 2, 1, 0]","[en, de, de, fr, es, de, de, en, en, en, es, e...","[As, it, got, older,, it, doesn't, heat, as, w..."
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 3, 0, 2, 1, 2]","[en, en, fr, de, es, en, es]","[Great, jacket, you, can, use, it, for, casual..."
4997,[Glaspartikel oder ähnliches unter der Folie b...,"[0, 0, 0, 2, 0, 3, 0, 0, 0, 1, 0]","[de, de, de, es, de, fr, de, de, de, en, de]","[Glaspartikel, oder, ähnliches, unter, der, Fo..."
4998,[We have had this license plate for almost 2 y...,"[1, 3, 1, 3, 1, 0, 1, 3, 3, 2]","[en, fr, en, fr, en, de, en, fr, fr, es]","[We, have, had, this, license, plate, for, alm..."


In [66]:
df_validation_tokenized_shuffled

Unnamed: 0,reviews,labels,languages,review_tokens
0,"[The keys feel so fresh, the colors are crisp....","[1, 0, 1, 0, 3, 1, 2, 1, 0, 1, 2, 3]","[en, de, en, de, fr, en, es, en, de, en, es, fr]","[The, keys, feel, so, fresh,, the, colors, are..."
1,[Idéal pour nettoyer car c'est doux mais n'a p...,"[3, 3, 2, 0, 0, 1, 1]","[fr, fr, es, de, de, en, en]","[Idéal, pour, nettoyer, car, c'est, doux, mais..."
2,[Man muss dann schon ganz schön lange fummeln ...,"[0, 3, 3, 1, 0, 1, 0, 0, 2, 3, 2, 2, 0, 2]","[de, fr, fr, en, de, en, de, de, es, fr, es, e...","[Man, muss, dann, schon, ganz, schön, lange, f..."
3,[They were so lightweight I could not get them...,"[1, 0, 1, 1, 3, 1, 1, 2, 3]","[en, de, en, en, fr, en, en, es, fr]","[They, were, so, lightweight, I, could, not, g..."
4,"[Leicht zu reinigen, einfach alle Teile in die...","[0, 1, 0, 0, 1, 3, 0, 2, 3, 0, 0, 0, 0, 0, 1, ...","[de, en, de, de, en, fr, de, es, fr, de, de, d...","[Leicht, zu, reinigen,, einfach, alle, Teile, ..."
...,...,...,...,...
4995,[Muy buena para preparar cualquier tipo de com...,"[2, 0, 2, 1, 0, 1, 0, 1, 3, 0, 2, 1, 1, 0]","[es, de, es, en, de, en, de, en, fr, de, es, e...","[Muy, buena, para, preparar, cualquier, tipo, ..."
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 2, 1, 2, 3, 0]","[en, en, es, en, es, fr, de]","[Great, jacket, you, can, use, it, for, casual..."
4997,"[Erstes Glas im Eimer gelandet., Ich habe dies...","[0, 0, 0, 2, 3, 0, 0, 1, 0, 0, 0]","[de, de, de, es, fr, de, de, en, de, de, de]","[Erstes, Glas, im, Eimer, gelandet., Ich, habe..."
4998,"[I would purchase this again., We have had thi...","[1, 1, 2, 3, 1, 3, 1, 0, 3, 3]","[en, en, es, fr, en, fr, en, de, fr, fr]","[I, would, purchase, this, again., We, have, h..."


In [67]:
# labeling each tokens of each row for the training set
token_label_train = []
error_train = []
for i in df_train_tokenized_shuffled.index:
    try:
        token_label_train.append(np.repeat(df_train_tokenized_shuffled['labels'][i], token_len_train[i]))
    except:
        error_train.append(i)
    finally:
        continue

In [68]:
len(error_train)

0

In [69]:
len(token_label_train)

60000

In [70]:
# labeling each tokens of each row for the test set
token_label_test = []
error_test = []
for i in df_test_tokenized_shuffled.index:
    try:
        token_label_test.append(np.repeat(df_test_tokenized_shuffled['labels'][i], token_len_test[i]))
    except:
        error_test.append(i)
    finally:
        continue

In [71]:
len(error_test)

0

In [72]:
len(token_label_test)

5000

In [73]:
# labeling each tokens of each row for the validation set
token_label_validation = []
error_validation = []
for i in df_validation_tokenized_shuffled.index:
    try:
        token_label_validation.append(np.repeat(df_validation_tokenized_shuffled['labels'][i], token_len_validation[i]))
    except:
        error_validation.append(i)
    finally:
        continue

In [74]:
len(error_validation)

0

In [75]:
len(token_label_validation)

5000

### Mapping Languages to the Tokens

In [96]:
# finding the language name according to the labels for each token in the training set
token_lang_list_train = []
for label in token_label_train:
  token_lang_list_train.append([langs[i] for i in label])

In [97]:
len(token_lang_list_train)

60000

In [98]:
# finding the language name according to the labels for each token in the test set
token_lang_list_test = []
for label in token_label_test:
  token_lang_list_test.append([langs[i] for i in label])

In [99]:
len(token_lang_list_test)

5000

In [100]:
# finding the language name according to the labels for each token in the validation set
token_lang_list_validation = []
for label in token_label_validation:
  token_lang_list_validation.append([langs[i] for i in label])

In [101]:
len(token_lang_list_validation)

5000

In [102]:
# attaching the token labels to the dataframe
df_train_tokenized_shuffled['token_labels'] = token_label_train
df_test_tokenized_shuffled['token_labels'] = token_label_test
df_validation_tokenized_shuffled['token_labels'] = token_label_validation

In [103]:
# attaching the token language to the dataframe
df_train_tokenized_shuffled['token_languages'] = token_lang_list_train
df_test_tokenized_shuffled['token_languages'] = token_lang_list_test
df_validation_tokenized_shuffled['token_languages'] = token_lang_list_validation

In [104]:
df_train_tokenized_shuffled

Unnamed: 0,reviews,labels,languages,review_tokens,token_labels,token_languages
0,"[Carga algo lento., I do like the range of col...","[2, 1, 1, 1, 1, 0, 1, 3, 1, 2, 1, 1, 1, 3, 1, ...","[es, en, en, en, en, de, en, fr, en, es, en, e...","[Carga, algo, lento., I, do, like, the, range,...","[2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[es, es, es, en, en, en, en, en, en, en, en, e..."
1,[Los imanes son potentes pero el pegamento que...,"[2, 0, 3, 3, 1, 2]","[es, de, fr, fr, en, es]","[Los, imanes, son, potentes, pero, el, pegamen...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[es, es, es, es, es, es, es, es, es, es, es, e..."
2,"[déçue de mon achat, je m'attendais à mieux !!...","[3, 0, 0, 2, 2, 3, 2, 2, 3, 0, 3, 0, 3, 1]","[fr, de, de, es, es, fr, es, es, fr, de, fr, d...","[déçue, de, mon, achat,, je, m'attendais, à, m...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, de, de, d..."
3,[Je suis très déçue car je n’ai pas reçu la bo...,"[3, 0, 1, 1, 2]","[fr, de, en, en, es]","[Je, suis, très, déçue, car, je, n’ai, pas, re...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f..."
4,[El diseño de La foto es más llamativo que las...,"[2, 2, 0, 1, 0, 0, 1, 3, 2, 0, 3, 3, 2]","[es, es, de, en, de, de, en, fr, es, de, fr, f...","[El, diseño, de, La, foto, es, más, llamativo,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[es, es, es, es, es, es, es, es, es, es, es, e..."
...,...,...,...,...,...,...
59995,"[Le tactile fonctionne bien., Die beschriebene...","[3, 0, 2, 1, 3, 2, 1, 2, 0, 0, 3, 0]","[fr, de, es, en, fr, es, en, es, de, de, fr, de]","[Le, tactile, fonctionne, bien., Die, beschrie...","[3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[fr, fr, fr, fr, de, de, de, de, de, de, de, d..."
59996,"[Very disappointed with this purchase., Had to...","[1, 1, 1, 0, 0, 0, 1, 2, 3, 1]","[en, en, en, de, de, de, en, es, fr, en]","[Very, disappointed, with, this, purchase., Ha...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e..."
59997,[When the battery arrived put in laptop and it...,"[1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 0, 2, 2, 0]","[en, es, en, es, en, fr, es, en, es, es, de, e...","[When, the, battery, arrived, put, in, laptop,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...","[en, en, en, en, en, en, en, en, en, en, en, e..."
59998,"[Al principio no conectaba con los textos., I’...","[2, 1, 1, 3, 3, 0, 2, 2, 2, 2, 1, 1, 2, 3]","[es, en, en, fr, fr, de, es, es, es, es, en, e...","[Al, principio, no, conectaba, con, los, texto...","[2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...","[es, es, es, es, es, es, es, en, en, en, en, e..."


In [105]:
df_test_tokenized_shuffled

Unnamed: 0,reviews,labels,languages,review_tokens,token_labels,token_languages
0,"[Hab sie zurück geschickt, würde ich nicht noc...","[0, 2, 0, 3, 1, 0, 3, 1, 1, 2, 1, 1]","[de, es, de, fr, en, de, fr, en, en, es, en, en]","[Hab, sie, zurück, geschickt,, würde, ich, nic...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, ...","[de, de, de, de, de, de, de, de, de, es, es, e..."
1,[El cambio del hilo por un material de plástic...,"[2, 3, 1, 1, 0, 3, 0]","[es, fr, en, en, de, fr, de]","[El, cambio, del, hilo, por, un, material, de,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[es, es, es, es, es, es, es, es, es, es, es, e..."
2,"[Sie ist von der Größe auch genau so, wie ich ...","[0, 0, 2, 2, 3, 2, 0, 1, 1, 3, 0, 0, 3, 2]","[de, de, es, es, fr, es, de, en, en, fr, de, d...","[Sie, ist, von, der, Größe, auch, genau, so,, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d..."
3,[du coup j utilise cette crème que sur ma zone...,"[3, 1, 1, 2, 1, 0, 1, 3, 1]","[fr, en, en, es, en, de, en, fr, en]","[du, coup, j, utilise, cette, crème, que, sur,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f..."
4,"[Toutefois, le bouchon métallique noir était l...","[3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 0, 3, 0, 0, 0, ...","[fr, fr, fr, de, de, de, de, de, de, es, de, f...","[Toutefois,, le, bouchon, métallique, noir, ét...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f..."
...,...,...,...,...,...,...
4995,"[As it got older, it doesn't heat as well., Id...","[1, 0, 0, 3, 2, 0, 0, 1, 1, 1, 2, 2, 1, 0]","[en, de, de, fr, es, de, de, en, en, en, es, e...","[As, it, got, older,, it, doesn't, heat, as, w...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[en, en, en, en, en, en, en, en, en, de, de, d..."
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 3, 0, 2, 1, 2]","[en, en, fr, de, es, en, es]","[Great, jacket, you, can, use, it, for, casual...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e..."
4997,[Glaspartikel oder ähnliches unter der Folie b...,"[0, 0, 0, 2, 0, 3, 0, 0, 0, 1, 0]","[de, de, de, es, de, fr, de, de, de, en, de]","[Glaspartikel, oder, ähnliches, unter, der, Fo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d..."
4998,[We have had this license plate for almost 2 y...,"[1, 3, 1, 3, 1, 0, 1, 3, 3, 2]","[en, fr, en, fr, en, de, en, fr, fr, es]","[We, have, had, this, license, plate, for, alm...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e..."


In [106]:
df_validation_tokenized_shuffled

Unnamed: 0,reviews,labels,languages,review_tokens,token_labels,token_languages
0,"[The keys feel so fresh, the colors are crisp....","[1, 0, 1, 0, 3, 1, 2, 1, 0, 1, 2, 3]","[en, de, en, de, fr, en, es, en, de, en, es, fr]","[The, keys, feel, so, fresh,, the, colors, are...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[en, en, en, en, en, en, en, en, en, de, de, d..."
1,[Idéal pour nettoyer car c'est doux mais n'a p...,"[3, 3, 2, 0, 0, 1, 1]","[fr, fr, es, de, de, en, en]","[Idéal, pour, nettoyer, car, c'est, doux, mais...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f..."
2,[Man muss dann schon ganz schön lange fummeln ...,"[0, 3, 3, 1, 0, 1, 0, 0, 2, 3, 2, 2, 0, 2]","[de, fr, fr, en, de, en, de, de, es, fr, es, e...","[Man, muss, dann, schon, ganz, schön, lange, f...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d..."
3,[They were so lightweight I could not get them...,"[1, 0, 1, 1, 3, 1, 1, 2, 3]","[en, de, en, en, fr, en, en, es, fr]","[They, were, so, lightweight, I, could, not, g...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[en, en, en, en, en, en, en, en, en, en, en, e..."
4,"[Leicht zu reinigen, einfach alle Teile in die...","[0, 1, 0, 0, 1, 3, 0, 2, 3, 0, 0, 0, 0, 0, 1, ...","[de, en, de, de, en, fr, de, es, fr, de, de, d...","[Leicht, zu, reinigen,, einfach, alle, Teile, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, en, e..."
...,...,...,...,...,...,...
4995,[Muy buena para preparar cualquier tipo de com...,"[2, 0, 2, 1, 0, 1, 0, 1, 3, 0, 2, 1, 1, 0]","[es, de, es, en, de, en, de, en, fr, de, es, e...","[Muy, buena, para, preparar, cualquier, tipo, ...","[2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, ...","[es, es, es, es, es, es, es, es, de, de, de, d..."
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 2, 1, 2, 3, 0]","[en, en, es, en, es, fr, de]","[Great, jacket, you, can, use, it, for, casual...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e..."
4997,"[Erstes Glas im Eimer gelandet., Ich habe dies...","[0, 0, 0, 2, 3, 0, 0, 1, 0, 0, 0]","[de, de, de, es, fr, de, de, en, de, de, de]","[Erstes, Glas, im, Eimer, gelandet., Ich, habe...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d..."
4998,"[I would purchase this again., We have had thi...","[1, 1, 2, 3, 1, 3, 1, 0, 3, 3]","[en, en, es, fr, en, fr, en, de, fr, fr]","[I, would, purchase, this, again., We, have, h...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e..."


### Unique Language Count

In [107]:
# counting unique languages for each review in the training set
unique_langs_train = []
for lang in lang_list_train:
  unique_langs_train.append(len(set(lang)))

In [108]:
# counting unique languages for each review in the test set
unique_langs_test = []
for lang in lang_list_test:
  unique_langs_test.append(len(set(lang)))

In [109]:
# counting unique languages for each review in the validaiton set
unique_langs_validation = []
for lang in lang_list_validation:
  unique_langs_validation.append(len(set(lang)))

In [110]:
# attaching the count of unique languages for each review to the dataframe
df_train_tokenized_shuffled['unique_language_count'] = unique_langs_train
df_test_tokenized_shuffled['unique_language_count'] = unique_langs_test
df_validation_tokenized_shuffled['unique_language_count'] = unique_langs_validation

### Final Dataset

In [111]:
df_train_tokenized_shuffled

Unnamed: 0,reviews,labels,languages,review_tokens,token_labels,token_languages,unique_language_count
0,"[Carga algo lento., I do like the range of col...","[2, 1, 1, 1, 1, 0, 1, 3, 1, 2, 1, 1, 1, 3, 1, ...","[es, en, en, en, en, de, en, fr, en, es, en, e...","[Carga, algo, lento., I, do, like, the, range,...","[2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[es, es, es, en, en, en, en, en, en, en, en, e...",4
1,[Los imanes son potentes pero el pegamento que...,"[2, 0, 3, 3, 1, 2]","[es, de, fr, fr, en, es]","[Los, imanes, son, potentes, pero, el, pegamen...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[es, es, es, es, es, es, es, es, es, es, es, e...",4
2,"[déçue de mon achat, je m'attendais à mieux !!...","[3, 0, 0, 2, 2, 3, 2, 2, 3, 0, 3, 0, 3, 1]","[fr, de, de, es, es, fr, es, es, fr, de, fr, d...","[déçue, de, mon, achat,, je, m'attendais, à, m...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, de, de, d...",4
3,[Je suis très déçue car je n’ai pas reçu la bo...,"[3, 0, 1, 1, 2]","[fr, de, en, en, es]","[Je, suis, très, déçue, car, je, n’ai, pas, re...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f...",4
4,[El diseño de La foto es más llamativo que las...,"[2, 2, 0, 1, 0, 0, 1, 3, 2, 0, 3, 3, 2]","[es, es, de, en, de, de, en, fr, es, de, fr, f...","[El, diseño, de, La, foto, es, más, llamativo,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[es, es, es, es, es, es, es, es, es, es, es, e...",4
...,...,...,...,...,...,...,...
59995,"[Le tactile fonctionne bien., Die beschriebene...","[3, 0, 2, 1, 3, 2, 1, 2, 0, 0, 3, 0]","[fr, de, es, en, fr, es, en, es, de, de, fr, de]","[Le, tactile, fonctionne, bien., Die, beschrie...","[3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[fr, fr, fr, fr, de, de, de, de, de, de, de, d...",4
59996,"[Very disappointed with this purchase., Had to...","[1, 1, 1, 0, 0, 0, 1, 2, 3, 1]","[en, en, en, de, de, de, en, es, fr, en]","[Very, disappointed, with, this, purchase., Ha...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4
59997,[When the battery arrived put in laptop and it...,"[1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 0, 2, 2, 0]","[en, es, en, es, en, fr, es, en, es, es, de, e...","[When, the, battery, arrived, put, in, laptop,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4
59998,"[Al principio no conectaba con los textos., I’...","[2, 1, 1, 3, 3, 0, 2, 2, 2, 2, 1, 1, 2, 3]","[es, en, en, fr, fr, de, es, es, es, es, en, e...","[Al, principio, no, conectaba, con, los, texto...","[2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...","[es, es, es, es, es, es, es, en, en, en, en, e...",4


In [112]:
df_test_tokenized_shuffled

Unnamed: 0,reviews,labels,languages,review_tokens,token_labels,token_languages,unique_language_count
0,"[Hab sie zurück geschickt, würde ich nicht noc...","[0, 2, 0, 3, 1, 0, 3, 1, 1, 2, 1, 1]","[de, es, de, fr, en, de, fr, en, en, es, en, en]","[Hab, sie, zurück, geschickt,, würde, ich, nic...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, ...","[de, de, de, de, de, de, de, de, de, es, es, e...",4
1,[El cambio del hilo por un material de plástic...,"[2, 3, 1, 1, 0, 3, 0]","[es, fr, en, en, de, fr, de]","[El, cambio, del, hilo, por, un, material, de,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[es, es, es, es, es, es, es, es, es, es, es, e...",4
2,"[Sie ist von der Größe auch genau so, wie ich ...","[0, 0, 2, 2, 3, 2, 0, 1, 1, 3, 0, 0, 3, 2]","[de, de, es, es, fr, es, de, en, en, fr, de, d...","[Sie, ist, von, der, Größe, auch, genau, so,, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d...",4
3,[du coup j utilise cette crème que sur ma zone...,"[3, 1, 1, 2, 1, 0, 1, 3, 1]","[fr, en, en, es, en, de, en, fr, en]","[du, coup, j, utilise, cette, crème, que, sur,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f...",4
4,"[Toutefois, le bouchon métallique noir était l...","[3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 0, 3, 0, 0, 0, ...","[fr, fr, fr, de, de, de, de, de, de, es, de, f...","[Toutefois,, le, bouchon, métallique, noir, ét...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f...",4
...,...,...,...,...,...,...,...
4995,"[As it got older, it doesn't heat as well., Id...","[1, 0, 0, 3, 2, 0, 0, 1, 1, 1, 2, 2, 1, 0]","[en, de, de, fr, es, de, de, en, en, en, es, e...","[As, it, got, older,, it, doesn't, heat, as, w...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[en, en, en, en, en, en, en, en, en, de, de, d...",4
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 3, 0, 2, 1, 2]","[en, en, fr, de, es, en, es]","[Great, jacket, you, can, use, it, for, casual...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4
4997,[Glaspartikel oder ähnliches unter der Folie b...,"[0, 0, 0, 2, 0, 3, 0, 0, 0, 1, 0]","[de, de, de, es, de, fr, de, de, de, en, de]","[Glaspartikel, oder, ähnliches, unter, der, Fo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d...",4
4998,[We have had this license plate for almost 2 y...,"[1, 3, 1, 3, 1, 0, 1, 3, 3, 2]","[en, fr, en, fr, en, de, en, fr, fr, es]","[We, have, had, this, license, plate, for, alm...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4


In [113]:
df_validation_tokenized_shuffled

Unnamed: 0,reviews,labels,languages,review_tokens,token_labels,token_languages,unique_language_count
0,"[The keys feel so fresh, the colors are crisp....","[1, 0, 1, 0, 3, 1, 2, 1, 0, 1, 2, 3]","[en, de, en, de, fr, en, es, en, de, en, es, fr]","[The, keys, feel, so, fresh,, the, colors, are...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[en, en, en, en, en, en, en, en, en, de, de, d...",4
1,[Idéal pour nettoyer car c'est doux mais n'a p...,"[3, 3, 2, 0, 0, 1, 1]","[fr, fr, es, de, de, en, en]","[Idéal, pour, nettoyer, car, c'est, doux, mais...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f...",4
2,[Man muss dann schon ganz schön lange fummeln ...,"[0, 3, 3, 1, 0, 1, 0, 0, 2, 3, 2, 2, 0, 2]","[de, fr, fr, en, de, en, de, de, es, fr, es, e...","[Man, muss, dann, schon, ganz, schön, lange, f...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d...",4
3,[They were so lightweight I could not get them...,"[1, 0, 1, 1, 3, 1, 1, 2, 3]","[en, de, en, en, fr, en, en, es, fr]","[They, were, so, lightweight, I, could, not, g...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4
4,"[Leicht zu reinigen, einfach alle Teile in die...","[0, 1, 0, 0, 1, 3, 0, 2, 3, 0, 0, 0, 0, 0, 1, ...","[de, en, de, de, en, fr, de, es, fr, de, de, d...","[Leicht, zu, reinigen,, einfach, alle, Teile, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, en, e...",4
...,...,...,...,...,...,...,...
4995,[Muy buena para preparar cualquier tipo de com...,"[2, 0, 2, 1, 0, 1, 0, 1, 3, 0, 2, 1, 1, 0]","[es, de, es, en, de, en, de, en, fr, de, es, e...","[Muy, buena, para, preparar, cualquier, tipo, ...","[2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, ...","[es, es, es, es, es, es, es, es, de, de, de, d...",4
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 2, 1, 2, 3, 0]","[en, en, es, en, es, fr, de]","[Great, jacket, you, can, use, it, for, casual...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4
4997,"[Erstes Glas im Eimer gelandet., Ich habe dies...","[0, 0, 0, 2, 3, 0, 0, 1, 0, 0, 0]","[de, de, de, es, fr, de, de, en, de, de, de]","[Erstes, Glas, im, Eimer, gelandet., Ich, habe...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d...",4
4998,"[I would purchase this again., We have had thi...","[1, 1, 2, 3, 1, 3, 1, 0, 3, 3]","[en, en, es, fr, en, fr, en, de, fr, fr]","[I, would, purchase, this, again., We, have, h...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4


In [114]:
# saving the dataframes to h5 format
df_train_tokenized_shuffled.to_hdf('data/marc_multilingual-merged_train_tokenized_labeled.h5', key='df', mode='w')
df_test_tokenized_shuffled.to_hdf('data/marc_multilingual-merged_test_tokenized_labeled.h5', key='df', mode='w')
df_validation_tokenized_shuffled.to_hdf('data/marc_multilingual-merged_validation_tokenized_labeled.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['reviews', 'labels', 'languages', 'review_tokens', 'token_labels',
       'token_languages'],
      dtype='object')]

  df_train_tokenized_shuffled.to_hdf('data/marc_multilingual-merged_train_tokenized_labeled.h5', key='df', mode='w')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['reviews', 'labels', 'languages', 'review_tokens', 'token_labels',
       'token_languages'],
      dtype='object')]

  df_test_tokenized_shuffled.to_hdf('data/marc_multilingual-merged_test_tokenized_labeled.h5', key='df', mode='w')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['reviews', 'labels', 'languages', 'review_tokens'

## Tokenizing Data with Transformer

In [115]:
# loading the dataset
df_train_tl = pd.read_hdf('data/marc_multilingual-merged_train_tokenized_labeled.h5', 'df')
df_test_tl = pd.read_hdf('data/marc_multilingual-merged_test_tokenized_labeled.h5', 'df')
df_validation_tl = pd.read_hdf('data/marc_multilingual-merged_validation_tokenized_labeled.h5', 'df')

In [116]:
df_train_tl

Unnamed: 0,reviews,labels,languages,review_tokens,token_labels,token_languages,unique_language_count
0,"[Carga algo lento., I do like the range of col...","[2, 1, 1, 1, 1, 0, 1, 3, 1, 2, 1, 1, 1, 3, 1, ...","[es, en, en, en, en, de, en, fr, en, es, en, e...","[Carga, algo, lento., I, do, like, the, range,...","[2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[es, es, es, en, en, en, en, en, en, en, en, e...",4
1,[Los imanes son potentes pero el pegamento que...,"[2, 0, 3, 3, 1, 2]","[es, de, fr, fr, en, es]","[Los, imanes, son, potentes, pero, el, pegamen...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[es, es, es, es, es, es, es, es, es, es, es, e...",4
2,"[déçue de mon achat, je m'attendais à mieux !!...","[3, 0, 0, 2, 2, 3, 2, 2, 3, 0, 3, 0, 3, 1]","[fr, de, de, es, es, fr, es, es, fr, de, fr, d...","[déçue, de, mon, achat,, je, m'attendais, à, m...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, de, de, d...",4
3,[Je suis très déçue car je n’ai pas reçu la bo...,"[3, 0, 1, 1, 2]","[fr, de, en, en, es]","[Je, suis, très, déçue, car, je, n’ai, pas, re...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f...",4
4,[El diseño de La foto es más llamativo que las...,"[2, 2, 0, 1, 0, 0, 1, 3, 2, 0, 3, 3, 2]","[es, es, de, en, de, de, en, fr, es, de, fr, f...","[El, diseño, de, La, foto, es, más, llamativo,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[es, es, es, es, es, es, es, es, es, es, es, e...",4
...,...,...,...,...,...,...,...
59995,"[Le tactile fonctionne bien., Die beschriebene...","[3, 0, 2, 1, 3, 2, 1, 2, 0, 0, 3, 0]","[fr, de, es, en, fr, es, en, es, de, de, fr, de]","[Le, tactile, fonctionne, bien., Die, beschrie...","[3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[fr, fr, fr, fr, de, de, de, de, de, de, de, d...",4
59996,"[Very disappointed with this purchase., Had to...","[1, 1, 1, 0, 0, 0, 1, 2, 3, 1]","[en, en, en, de, de, de, en, es, fr, en]","[Very, disappointed, with, this, purchase., Ha...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4
59997,[When the battery arrived put in laptop and it...,"[1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 0, 2, 2, 0]","[en, es, en, es, en, fr, es, en, es, es, de, e...","[When, the, battery, arrived, put, in, laptop,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4
59998,"[Al principio no conectaba con los textos., I’...","[2, 1, 1, 3, 3, 0, 2, 2, 2, 2, 1, 1, 2, 3]","[es, en, en, fr, fr, de, es, es, es, es, en, e...","[Al, principio, no, conectaba, con, los, texto...","[2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...","[es, es, es, es, es, es, es, en, en, en, en, e...",4


In [117]:
df_test_tl

Unnamed: 0,reviews,labels,languages,review_tokens,token_labels,token_languages,unique_language_count
0,"[Hab sie zurück geschickt, würde ich nicht noc...","[0, 2, 0, 3, 1, 0, 3, 1, 1, 2, 1, 1]","[de, es, de, fr, en, de, fr, en, en, es, en, en]","[Hab, sie, zurück, geschickt,, würde, ich, nic...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, ...","[de, de, de, de, de, de, de, de, de, es, es, e...",4
1,[El cambio del hilo por un material de plástic...,"[2, 3, 1, 1, 0, 3, 0]","[es, fr, en, en, de, fr, de]","[El, cambio, del, hilo, por, un, material, de,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[es, es, es, es, es, es, es, es, es, es, es, e...",4
2,"[Sie ist von der Größe auch genau so, wie ich ...","[0, 0, 2, 2, 3, 2, 0, 1, 1, 3, 0, 0, 3, 2]","[de, de, es, es, fr, es, de, en, en, fr, de, d...","[Sie, ist, von, der, Größe, auch, genau, so,, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d...",4
3,[du coup j utilise cette crème que sur ma zone...,"[3, 1, 1, 2, 1, 0, 1, 3, 1]","[fr, en, en, es, en, de, en, fr, en]","[du, coup, j, utilise, cette, crème, que, sur,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f...",4
4,"[Toutefois, le bouchon métallique noir était l...","[3, 3, 3, 0, 0, 0, 0, 0, 0, 2, 0, 3, 0, 0, 0, ...","[fr, fr, fr, de, de, de, de, de, de, es, de, f...","[Toutefois,, le, bouchon, métallique, noir, ét...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f...",4
...,...,...,...,...,...,...,...
4995,"[As it got older, it doesn't heat as well., Id...","[1, 0, 0, 3, 2, 0, 0, 1, 1, 1, 2, 2, 1, 0]","[en, de, de, fr, es, de, de, en, en, en, es, e...","[As, it, got, older,, it, doesn't, heat, as, w...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[en, en, en, en, en, en, en, en, en, de, de, d...",4
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 3, 0, 2, 1, 2]","[en, en, fr, de, es, en, es]","[Great, jacket, you, can, use, it, for, casual...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4
4997,[Glaspartikel oder ähnliches unter der Folie b...,"[0, 0, 0, 2, 0, 3, 0, 0, 0, 1, 0]","[de, de, de, es, de, fr, de, de, de, en, de]","[Glaspartikel, oder, ähnliches, unter, der, Fo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d...",4
4998,[We have had this license plate for almost 2 y...,"[1, 3, 1, 3, 1, 0, 1, 3, 3, 2]","[en, fr, en, fr, en, de, en, fr, fr, es]","[We, have, had, this, license, plate, for, alm...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4


In [118]:
df_validation_tl

Unnamed: 0,reviews,labels,languages,review_tokens,token_labels,token_languages,unique_language_count
0,"[The keys feel so fresh, the colors are crisp....","[1, 0, 1, 0, 3, 1, 2, 1, 0, 1, 2, 3]","[en, de, en, de, fr, en, es, en, de, en, es, fr]","[The, keys, feel, so, fresh,, the, colors, are...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[en, en, en, en, en, en, en, en, en, de, de, d...",4
1,[Idéal pour nettoyer car c'est doux mais n'a p...,"[3, 3, 2, 0, 0, 1, 1]","[fr, fr, es, de, de, en, en]","[Idéal, pour, nettoyer, car, c'est, doux, mais...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f...",4
2,[Man muss dann schon ganz schön lange fummeln ...,"[0, 3, 3, 1, 0, 1, 0, 0, 2, 3, 2, 2, 0, 2]","[de, fr, fr, en, de, en, de, de, es, fr, es, e...","[Man, muss, dann, schon, ganz, schön, lange, f...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d...",4
3,[They were so lightweight I could not get them...,"[1, 0, 1, 1, 3, 1, 1, 2, 3]","[en, de, en, en, fr, en, en, es, fr]","[They, were, so, lightweight, I, could, not, g...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4
4,"[Leicht zu reinigen, einfach alle Teile in die...","[0, 1, 0, 0, 1, 3, 0, 2, 3, 0, 0, 0, 0, 0, 1, ...","[de, en, de, de, en, fr, de, es, fr, de, de, d...","[Leicht, zu, reinigen,, einfach, alle, Teile, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, en, e...",4
...,...,...,...,...,...,...,...
4995,[Muy buena para preparar cualquier tipo de com...,"[2, 0, 2, 1, 0, 1, 0, 1, 3, 0, 2, 1, 1, 0]","[es, de, es, en, de, en, de, en, fr, de, es, e...","[Muy, buena, para, preparar, cualquier, tipo, ...","[2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, ...","[es, es, es, es, es, es, es, es, de, de, de, d...",4
4996,[Great jacket you can use it for casual and ri...,"[1, 1, 2, 1, 2, 3, 0]","[en, en, es, en, es, fr, de]","[Great, jacket, you, can, use, it, for, casual...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4
4997,"[Erstes Glas im Eimer gelandet., Ich habe dies...","[0, 0, 0, 2, 3, 0, 0, 1, 0, 0, 0]","[de, de, de, es, fr, de, de, en, de, de, de]","[Erstes, Glas, im, Eimer, gelandet., Ich, habe...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[de, de, de, de, de, de, de, de, de, de, de, d...",4
4998,"[I would purchase this again., We have had thi...","[1, 1, 2, 3, 1, 3, 1, 0, 3, 3]","[en, en, es, fr, en, fr, en, de, fr, fr]","[I, would, purchase, this, again., We, have, h...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[en, en, en, en, en, en, en, en, en, en, en, e...",4


In [119]:
df_train_tl['unique_language_count'].value_counts()

4    60000
Name: unique_language_count, dtype: int64

In [120]:
# converting the dataframe to pyarrow format
dataset_train = Dataset.from_pandas(df_train_tl, preserve_index= False)
dataset_test = Dataset.from_pandas(df_test_tl, preserve_index= False)
dataset_validation = Dataset.from_pandas(df_validation_tl, preserve_index= False)

In [121]:
dataset_train

Dataset({
    features: ['reviews', 'labels', 'languages', 'review_tokens', 'token_labels', 'token_languages', 'unique_language_count'],
    num_rows: 60000
})

In [122]:
dataset_test

Dataset({
    features: ['reviews', 'labels', 'languages', 'review_tokens', 'token_labels', 'token_languages', 'unique_language_count'],
    num_rows: 5000
})

In [123]:
dataset_validation

Dataset({
    features: ['reviews', 'labels', 'languages', 'review_tokens', 'token_labels', 'token_languages', 'unique_language_count'],
    num_rows: 5000
})

In [160]:
# pushing the dataset to HuggingFace
dataset_train.push_to_hub('msislam/marc-multilingual', split="train")
dataset_test.push_to_hub('msislam/marc-multilingual', split="test")
dataset_validation.push_to_hub('msislam/marc-multilingual', split="validation")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/60 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Updating downloaded metadata with the new split.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Updating downloaded metadata with the new split.


In [125]:
# setting up the model
model_name = "xlm-roberta-base"

In [126]:
# defining the tokenizer
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)

In [137]:
# function to apply the encodings using transformer model
def tokenizing(df):
    encodings = []
    for row in df.index:
        tokens = df['review_tokens'][row]
        tokenized_inputs = tokenizer(tokens, max_length = 512, truncation=True, is_split_into_words=True, add_special_tokens= False)
        word_ids = tokenized_inputs.word_ids()
        tags = df['token_labels'][row]
        labels = [tags[word_id] for word_id in word_ids]
        tokenized_inputs["labels"] = labels
        encodings.append(tokenized_inputs)
    return encodings

In [138]:
# creating the encodings for each set
encodings_train = tokenizing(df_train_tl)
encodings_test = tokenizing(df_test_tl)
encodings_validation = tokenizing(df_validation_tl)

In [139]:
encodings_train

[{'input_ids': [3980, 208, 5745, 48121, 5, 87, 54, 1884, 70, 37457, 111, 104988, 102971, 4, 87, 1902, 47, 456, 5808, 13, 70, 1152, 4778, 5, 10660, 15935, 18, 1380, 23, 10, 108802, 450, 509, 82091, 5, 581, 15122, 136, 187363, 90, 621, 8, 16797, 136, 4488, 6782, 100, 4537, 4927, 120384, 90, 5, 581, 5551, 13003, 7, 3542, 959, 759, 45364, 111, 26156, 5, 2484, 2394, 59848, 165, 7678, 89474, 491, 745, 13643, 5, 87, 444, 31895, 132294, 1660, 57134, 133, 2856, 23, 104988, 5, 42414, 72104, 34, 10999, 531, 17019, 8, 21, 91476, 96143, 613, 95, 54903, 916, 8, 12441, 7, 5, 3293, 83, 4552, 8668, 38543, 5, 15678, 430, 8096, 34771, 8, 40276, 62587, 5, 87, 5161, 20697, 678, 7401, 46133, 2652, 7, 136, 43333, 7, 5, 581, 1152, 4778, 46132, 621, 6183, 33977, 39, 4861, 136, 398, 765, 47, 186, 517, 7844, 49146, 6496, 2856, 707, 5551, 13003, 7, 1221, 738, 10, 160, 26783, 1810, 5, 87, 123997, 903, 47, 9790, 1810, 70, 5551, 13003, 7, 5, 2392, 210435, 3775, 202744, 13, 366, 21, 105812, 87, 765, 37842, 17368, 285

In [140]:
print(len(encodings_train))
print(len(encodings_test))
print(len(encodings_validation))

60000
5000
5000


In [141]:
# creating dataframe using the lists
df_encodings_train = pd.DataFrame(data=encodings_train, columns=['input_ids', 'attention_mask', 'labels'])
df_encodings_test = pd.DataFrame(data=encodings_test, columns=['input_ids', 'attention_mask', 'labels'])
df_encodings_validation = pd.DataFrame(data=encodings_validation, columns=['input_ids', 'attention_mask', 'labels'])

In [142]:
df_encodings_train

Unnamed: 0,input_ids,attention_mask,labels
0,"[3980, 208, 5745, 48121, 5, 87, 54, 1884, 70, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[3731, 66197, 90, 775, 105889, 7, 1788, 88, 28...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,"[3402, 17791, 13, 8, 2667, 6, 91444, 4, 55, 34...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
3,"[845, 5189, 4099, 3402, 17791, 13, 2258, 55, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,"[540, 40395, 8, 239, 2011, 198, 1005, 67140, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
...,...,...,...
59995,"[636, 36322, 44555, 86892, 1806, 5, 622, 16289...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
59996,"[99342, 242980, 678, 903, 59038, 5, 28129, 47,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
59997,"[14847, 70, 172714, 174920, 3884, 23, 26367, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ..."
59998,"[884, 37004, 110, 109465, 402, 158, 388, 59489...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, ..."


In [143]:
df_encodings_test

Unnamed: 0,input_ids,attention_mask,labels
0,"[33124, 1329, 15987, 6, 200453, 4, 15466, 654,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, ..."
1,"[540, 20335, 146, 13611, 196, 51, 4912, 8, 114...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,"[727, 443, 542, 122, 66461, 921, 17344, 221, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[115, 14974, 1647, 6, 95601, 3393, 131892, 41,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, ..."
4,"[206501, 4, 95, 20802, 81941, 6973, 1803, 6736...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
...,...,...,...
4995,"[1301, 442, 4163, 114210, 4, 442, 22027, 25, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
4996,"[32774, 79, 27853, 398, 831, 4527, 442, 100, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4997,"[33287, 254, 87356, 1367, 167319, 7, 3993, 122...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4998,"[1401, 765, 1902, 903, 86872, 37385, 100, 3955...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [144]:
df_encodings_validation

Unnamed: 0,input_ids,attention_mask,labels
0,"[581, 22799, 7, 12319, 221, 63335, 4, 70, 1049...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
1,"[87, 11056, 289, 578, 65462, 8889, 2258, 501, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,"[1572, 6869, 3700, 3762, 7678, 33420, 9614, 56...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[10660, 3542, 221, 22729, 165598, 87, 5809, 95...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[636, 44427, 404, 19574, 8402, 4, 6414, 747, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
4995,"[147259, 26031, 121, 52390, 14363, 4309, 8, 37...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, ..."
4996,"[32774, 79, 27853, 398, 831, 4527, 442, 100, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4997,"[150485, 7, 33287, 566, 4565, 1991, 700, 53809...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4998,"[87, 2806, 59038, 903, 13438, 5, 1401, 765, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [145]:
# saving the dataframe to h5 format
df_encodings_train.to_hdf('data/marc_crosslingual-encoding_train.h5', key='df', mode='w')
df_encodings_test.to_hdf('data/marc_crosslingual-encoding_test.h5', key='df', mode='w')
df_encodings_validation.to_hdf('data/marc_crosslingual-encoding_validation.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['input_ids', 'attention_mask', 'labels'], dtype='object')]

  df_encodings_train.to_hdf('data/marc_crosslingual-encoding_train.h5', key='df', mode='w')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['input_ids', 'attention_mask', 'labels'], dtype='object')]

  df_encodings_test.to_hdf('data/marc_crosslingual-encoding_test.h5', key='df', mode='w')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['input_ids', 'attention_mask', 'labels'], dtype='object')]

  df_encodings_validation.to_hdf('data/marc_crosslingual-encoding_validation.h5', key='df', mode='w')


In [146]:
# loading the h5 files
df_encodings_train = pd.read_hdf('data/marc_crosslingual-encoding_train.h5', 'df')
df_encodings_test = pd.read_hdf('data/marc_crosslingual-encoding_test.h5', 'df')
df_encodings_validation = pd.read_hdf('data/marc_crosslingual-encoding_validation.h5', 'df')

In [147]:
df_encodings_train

Unnamed: 0,input_ids,attention_mask,labels
0,"[3980, 208, 5745, 48121, 5, 87, 54, 1884, 70, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[3731, 66197, 90, 775, 105889, 7, 1788, 88, 28...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,"[3402, 17791, 13, 8, 2667, 6, 91444, 4, 55, 34...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
3,"[845, 5189, 4099, 3402, 17791, 13, 2258, 55, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,"[540, 40395, 8, 239, 2011, 198, 1005, 67140, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
...,...,...,...
59995,"[636, 36322, 44555, 86892, 1806, 5, 622, 16289...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
59996,"[99342, 242980, 678, 903, 59038, 5, 28129, 47,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
59997,"[14847, 70, 172714, 174920, 3884, 23, 26367, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ..."
59998,"[884, 37004, 110, 109465, 402, 158, 388, 59489...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, ..."


In [148]:
df_encodings_test

Unnamed: 0,input_ids,attention_mask,labels
0,"[33124, 1329, 15987, 6, 200453, 4, 15466, 654,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, ..."
1,"[540, 20335, 146, 13611, 196, 51, 4912, 8, 114...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,"[727, 443, 542, 122, 66461, 921, 17344, 221, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[115, 14974, 1647, 6, 95601, 3393, 131892, 41,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, ..."
4,"[206501, 4, 95, 20802, 81941, 6973, 1803, 6736...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
...,...,...,...
4995,"[1301, 442, 4163, 114210, 4, 442, 22027, 25, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
4996,"[32774, 79, 27853, 398, 831, 4527, 442, 100, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4997,"[33287, 254, 87356, 1367, 167319, 7, 3993, 122...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4998,"[1401, 765, 1902, 903, 86872, 37385, 100, 3955...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [149]:
df_encodings_validation

Unnamed: 0,input_ids,attention_mask,labels
0,"[581, 22799, 7, 12319, 221, 63335, 4, 70, 1049...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
1,"[87, 11056, 289, 578, 65462, 8889, 2258, 501, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,"[1572, 6869, 3700, 3762, 7678, 33420, 9614, 56...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[10660, 3542, 221, 22729, 165598, 87, 5809, 95...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[636, 44427, 404, 19574, 8402, 4, 6414, 747, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
4995,"[147259, 26031, 121, 52390, 14363, 4309, 8, 37...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, ..."
4996,"[32774, 79, 27853, 398, 831, 4527, 442, 100, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4997,"[150485, 7, 33287, 566, 4565, 1991, 700, 53809...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4998,"[87, 2806, 59038, 903, 13438, 5, 1401, 765, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [150]:
# converting the dataframes to pyarrow format
dataset_encodings_train = Dataset.from_pandas(df_encodings_train, preserve_index=False)
dataset_encodings_test = Dataset.from_pandas(df_encodings_test, preserve_index=False)
dataset_encodings_validation = Dataset.from_pandas(df_encodings_validation, preserve_index=False)

In [151]:
dataset_encodings_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 60000
})

In [152]:
dataset_encodings_test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5000
})

In [153]:
dataset_encodings_validation

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5000
})

In [154]:
# counting total number of tokens in the training set
token_count = 0
for data in dataset_encodings_train:
    token_count = token_count + len(data["input_ids"])
    
print("Total Tokens in Training Set:", token_count)

Total Tokens in Training Set: 10195342


In [155]:
# counting total number of tokens in the test set
token_count = 0
for data in dataset_encodings_test:
    token_count = token_count + len(data["input_ids"])
    
print("Total Tokens in Test Set:", token_count)

Total Tokens in Test Set: 842760


In [156]:
# counting total number of tokens in the validation set
token_count = 0
for data in dataset_encodings_validation:
    token_count = token_count + len(data["input_ids"])
    
print("Total Tokens in Validation Set:", token_count)

Total Tokens in Validation Set: 842760


In [157]:
# saving the pyarrow data
dataset_encodings_train.save_to_disk("data/MARC_dataset_encodings_train")
dataset_encodings_test.save_to_disk("data/MARC_dataset_encodings_test")
dataset_encodings_validation.save_to_disk("data/MARC_dataset_encodings_validation")

Saving the dataset (0/1 shards):   0%|          | 0/60000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [158]:
# pushing the data to HuggingFace
dataset_encodings_train.push_to_hub('msislam/marc-multilingual-encodings-v4', split="train",)
dataset_encodings_test.push_to_hub('msislam/marc-multilingual-encodings-v4', split="test")
dataset_encodings_validation.push_to_hub('msislam/marc-multilingual-encodings-v4', split="validation")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/60 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/467 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/529 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.
