In [1]:
import os
import io
import json
import spacy
import pathlib 
import warnings
import fr_core_news_md

import pandas as pd 
import tensorflow as tf 
import matplotlib.pyplot as plt

from tensorflow.data import Dataset
from tensorflow.keras import Sequential
from spacy.lang.fr.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, SimpleRNN, GRU, LSTM
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Load the fr_core_news_md spaCy pipeline; it is used below to lemmatize the reviews.
nlp = fr_core_news_md.load()

In [2]:
# Remote CSV holding the reviews (later cells read its "review_lang",
# "review" and "stars" columns).
REVIEWS_CSV_URL = "https://go.aws/314bBDq"
df = pd.read_csv(REVIEWS_CSV_URL)

In [3]:
# Keep only the rows whose review language is labelled "french".
is_french = df["review_lang"].eq("french")
df = df.loc[is_french]

In [4]:
# Keep only the text and its rating. The explicit copy detaches the result
# from the filtered parent frame, so the column assignments in the following
# cells write to an independent DataFrame instead of triggering pandas'
# SettingWithCopyWarning.
df = df[["review", "stars"]].copy()

In [5]:
def clean_review(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps:
      1. keep only alphanumeric characters, spaces and apostrophes;
      2. lowercase, trim, and collapse every run of spaces into one space;
      3. lemmatize with the spaCy pipeline `nlp` and drop any token whose
         text or lemma is in the French STOP_WORDS set.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces.
    """
    kept = "".join(ch for ch in text if ch.isalnum() or ch == " " or ch == "'")
    # The original used str.replace(" +", " "), which looks for the literal
    # two-character substring " +" (str.replace is not regex-based) and so
    # never collapsed repeated spaces. split()/join() does the intended job
    # and also strips leading/trailing whitespace.
    normalized = " ".join(kept.lower().split())
    lemmas = [
        token.lemma_
        for token in nlp(normalized)
        if (token.lemma_ not in STOP_WORDS) and (token.text not in STOP_WORDS)
    ]
    return " ".join(lemmas)


df["review_clean"] = df["review"].apply(clean_review)

In [6]:
# Drop rows whose cleaned review is missing.
mask = df["review_clean"].notna()
df = df.loc[mask]

In [7]:
# Instantiate the tokenizer with a vocabulary cap (num_words=1000) and fit it
# on the cleaned reviews.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000)
tokenizer.fit_on_texts(df["review_clean"])

# Encode every review as a list of integer word indices and record its length.
encoded_reviews = tokenizer.texts_to_sequences(df["review_clean"])
df["review_encoded"] = encoded_reviews
df["len_review"] = df["review_encoded"].map(len)

# Reviews that end up with no encoded token carry no signal: discard them.
df = df.loc[df["len_review"].ne(0)]

In [8]:
# Pad the encoded reviews with trailing zeros so they all share one length.
encoded_reviews = df["review_encoded"]
reviews_pad = pad_sequences(encoded_reviews, padding="post")

In [9]:
# Shift the star ratings down by one so the class labels start at 0, then pair
# each padded review with its label.
labels = df["stars"].values - 1
full_ds = Dataset.from_tensor_slices((reviews_pad, labels))

In [10]:
# First 70% of the rows go to training, the remainder to evaluation.
TAKE_SIZE = int(0.7 * len(df))
BATCH_SIZE = 64

# Only the training portion is shuffled (buffer covers the whole portion).
train_data = full_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE).batch(BATCH_SIZE)

test_data = full_ds.skip(TAKE_SIZE).batch(BATCH_SIZE)

In [11]:
# Peek at a single training batch (padded reviews and their labels).
# NOTE: `review` stays defined after the loop and is read again later, as
# `review.shape[1]`, when the models are built.
for review, star in train_data.take(1):
  print(review, star)

tf.Tensor(
[[442  73  50 ...   0   0   0]
 [ 29  28 806 ...   0   0   0]
 [ 92  42  94 ...   0   0   0]
 ...
 [ 20  11 178 ...   0   0   0]
 [ 39  20 283 ...   0   0   0]
 [156 166 365 ...   0   0   0]], shape=(64, 442), dtype=int32) tf.Tensor(
[4 3 4 4 2 2 4 4 4 4 4 4 3 4 2 4 4 4 1 4 4 2 4 2 2 4 3 4 2 4 4 4 4 0 4 4 2
 4 4 4 4 4 1 4 4 2 4 2 3 2 4 2 3 1 4 0 2 3 4 0 3 4 2 4], shape=(64,), dtype=int64)


In [12]:

# SimpleRNN classifier: learned embedding -> two stacked SimpleRNN layers ->
# small dense head ending in a 5-way softmax (labels are stars - 1, i.e. 0..4).
vocab_size = len(tokenizer.word_index)

# Read the sequence length from the padded matrix itself instead of from the
# `review` loop variable left behind by the batch-preview cell, so this cell
# does not depend on that cell having been executed.
sequence_length = reviews_pad.shape[1]

# NOTE(review): the tokenizer was created with num_words=1000, so presumably
# only indices below 1000 ever reach the embedding while the table is sized
# for the full vocabulary - confirm, and consider tokenizer.num_words here.
# NOTE(review): sequences are zero-padded ("post") and the embedding does not
# mask zeros, so the recurrent layers also step through the padding.
model = Sequential([
    Embedding(vocab_size + 1, 64, input_shape=[sequence_length], name="embedding"),
    SimpleRNN(units=64, return_sequences=True),   # full sequence for the next RNN
    SimpleRNN(units=32, return_sequences=False),  # last state only
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    Dense(5, activation="softmax"),
])

In [13]:
# Print the layer-by-layer output shapes and parameter counts of the SimpleRNN model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 442, 64)           854016    
                                                                 
 simple_rnn (SimpleRNN)      (None, 442, 64)           8256      
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                3104      
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 5)                 45        
                                                                 
Total params: 866,085
Trainable params: 866,085
Non-trai

In [14]:
# Labels are integer class ids, hence the sparse loss and accuracy variants.
adam = tf.keras.optimizers.Adam()
sparse_loss = tf.keras.losses.SparseCategoricalCrossentropy()
sparse_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=adam, loss=sparse_loss, metrics=[sparse_accuracy])

In [15]:
# Number of reviews per class label (stars shifted to start at 0).
df["stars"].sub(1).value_counts()

4    4847
3    1536
2    1008
0     557
1     487
Name: stars, dtype: int64

In [16]:
# Per-class weights: inverse class frequency, scaled by len(df) / 5.
class_counts = df["stars"].sub(1).value_counts()
inverse_frequency = 1 / class_counts
scaled = inverse_frequency * len(df) / 5

# Keras expects a {class_id: weight} mapping.
weights = dict(zip(scaled.index, scaled.values))
weights

{4: 0.34805034041675265,
 3: 1.0983072916666665,
 2: 1.6736111111111112,
 0: 3.028725314183124,
 1: 3.4640657084188917}

In [17]:
# Train the SimpleRNN model for 20 epochs, weighting the loss by class and
# evaluating on the held-out batches after every epoch.
model.fit(
    x=train_data,
    epochs=20,
    class_weight=weights,
    validation_data=test_data,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2b683cc7a60>

In [18]:
# Save the fitted SimpleRNN model to "model_simpleRNN.h5" (relative path).
model.save("model_simpleRNN.h5")

In [19]:
# Persist the per-epoch metrics of the SimpleRNN run. The context manager
# guarantees the file is flushed and closed; the previous bare open() left the
# handle dangling.
with open("simpleRNN_history.json", "w") as history_file:
    json.dump(model.history.history, history_file)

In [20]:
# GRU classifier: same layout as the SimpleRNN model, with GRU recurrent layers.
vocab_size = len(tokenizer.word_index)

# Read the sequence length from the padded matrix itself instead of from the
# `review` loop variable left behind by the batch-preview cell, so this cell
# does not depend on that cell having been executed.
sequence_length = reviews_pad.shape[1]

# NOTE(review): the tokenizer was created with num_words=1000, so the
# embedding table is presumably larger than the index range in use - confirm.
model_gru = tf.keras.Sequential([
    Embedding(vocab_size + 1, 64, input_shape=[sequence_length], name="embedding"),
    GRU(units=64, return_sequences=True),   # full sequence for the next GRU
    GRU(units=32, return_sequences=False),  # last state only
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    Dense(5, activation="softmax"),
])

In [21]:
# Print the layer-by-layer output shapes and parameter counts of the GRU model.
model_gru.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 442, 64)           854016    
                                                                 
 gru (GRU)                   (None, 442, 64)           24960     
                                                                 
 gru_1 (GRU)                 (None, 32)                9408      
                                                                 
 dense_3 (Dense)             (None, 16)                528       
                                                                 
 dense_4 (Dense)             (None, 8)                 136       
                                                                 
 dense_5 (Dense)             (None, 5)                 45        
                                                                 
Total params: 889,093
Trainable params: 889,093
Non-tr

In [22]:
# Fresh optimizer for the GRU model; sparse loss/metric because the labels are
# integer class ids.
optimizer = tf.keras.optimizers.Adam()
sparse_loss = tf.keras.losses.SparseCategoricalCrossentropy()
sparse_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

model_gru.compile(
    optimizer=optimizer,
    loss=sparse_loss,
    metrics=[sparse_accuracy],
)

In [23]:
# Train the GRU model for 20 epochs with the same class weights and
# validation batches as the SimpleRNN run.
model_gru.fit(
    x=train_data,
    validation_data=test_data,
    class_weight=weights,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2b683c9d690>

In [24]:
# Save the fitted GRU model to "model_gru.h5" (relative path).
model_gru.save("model_gru.h5")

In [25]:
# Persist the per-epoch metrics of the GRU run. The context manager guarantees
# the file is flushed and closed; the previous bare open() left the handle
# dangling.
with open("GRU_history.json", "w") as history_file:
    json.dump(model_gru.history.history, history_file)

In [26]:
# LSTM classifier: same layout as the SimpleRNN and GRU models, with LSTM
# recurrent layers and a named output layer.
vocab_size = len(tokenizer.word_index)

# Read the sequence length from the padded matrix itself instead of from the
# `review` loop variable left behind by the batch-preview cell, so this cell
# does not depend on that cell having been executed.
sequence_length = reviews_pad.shape[1]

# NOTE(review): the tokenizer was created with num_words=1000, so the
# embedding table is presumably larger than the index range in use - confirm.
model_lstm = tf.keras.Sequential([
    Embedding(vocab_size + 1, 64, input_shape=[sequence_length], name="embedding"),
    LSTM(units=64, return_sequences=True),   # full sequence for the next LSTM
    LSTM(units=32, return_sequences=False),  # last state only
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    Dense(5, activation="softmax", name="last"),
])

In [27]:
# Print the layer-by-layer output shapes and parameter counts of the LSTM model.
model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 442, 64)           854016    
                                                                 
 lstm (LSTM)                 (None, 442, 64)           33024     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense_6 (Dense)             (None, 16)                528       
                                                                 
 dense_7 (Dense)             (None, 8)                 136       
                                                                 
 last (Dense)                (None, 5)                 45        
                                                                 
Total params: 900,165
Trainable params: 900,165
Non-tr

In [28]:
# Fresh optimizer for the LSTM model; sparse loss/metric because the labels
# are integer class ids.
optimizer = tf.keras.optimizers.Adam()
sparse_loss = tf.keras.losses.SparseCategoricalCrossentropy()
sparse_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

model_lstm.compile(
    optimizer=optimizer,
    loss=sparse_loss,
    metrics=[sparse_accuracy],
)

In [29]:
# Train the LSTM model for 20 epochs with the same class weights and
# validation batches as the other two runs.
model_lstm.fit(
    x=train_data,
    validation_data=test_data,
    class_weight=weights,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2b686b2c970>

In [30]:
# Save the fitted LSTM model to "model_lstm.h5" (relative path).
model_lstm.save("model_lstm.h5")

In [31]:
# Persist the per-epoch metrics of the LSTM run. The context manager
# guarantees the file is flushed and closed; the previous bare open() left the
# handle dangling.
with open("LSTM_history.json", "w") as history_file:
    json.dump(model_lstm.history.history, history_file)

In [32]:
# Read back the SimpleRNN training curves written earlier in this notebook;
# the context manager closes the file handle once parsing is done.
with open("simpleRNN_history.json", "r") as history_file:
    simpleRNN_history = json.load(history_file)

In [33]:
# Reload the SimpleRNN model from the "model_simpleRNN.h5" file saved earlier in this notebook.
model_simpleRNN = tf.keras.models.load_model("model_simpleRNN.h5")

In [34]:
import plotly.graph_objects as go

# Plot training and validation loss of the SimpleRNN run, one line per curve.
fig = go.Figure()
for curve_name in ("loss", "val_loss"):
    curve = go.Scatter(y=simpleRNN_history[curve_name], mode='lines', name=curve_name)
    fig.add_trace(curve)
fig.show()