In [None]:
## Toy example
samples = ['I am the destroyer of worlds',
           'The bringer of chaos',
           'the phantom knight',
           'i am your father',
           'tHe GhOst In thE shell']


def _tokenise(phrase):
  """Split a phrase on any run of whitespace and lowercase every token.

  Used both to build the vocabulary and to encode phrases, so the two
  steps can never disagree on what a token is.
  """
  return [word.lower() for word in phrase.split()]


# Word -> integer index. Numbering starts at 1, which leaves index 0 free
# for words that are not in the vocabulary.
vector_indice = {}
for sentence in samples:
  for word in _tokenise(sentence):
    if word not in vector_indice:
      vector_indice[word] = len(vector_indice)+1

print(vector_indice)


def vectoriser(phrase, unknown_index=0):
  """Encode a phrase as the list of vocabulary indices of its words.

  Args:
    phrase: text to encode; it is split on whitespace and lowercased.
    unknown_index: value emitted for a word absent from `vector_indice`.
      Defaults to 0, an index the vocabulary never assigns.

  Returns:
    A list with one integer per word of `phrase` (empty for an empty or
    whitespace-only phrase).
  """
  return [vector_indice.get(word, unknown_index) for word in _tokenise(phrase)]

for phrase in samples:
  print(vectoriser(phrase))

{'i': 1, 'am': 2, 'the': 3, 'destroyer': 4, 'of': 5, 'worlds': 6, 'bringer': 7, 'chaos': 8, 'phantom': 9, 'knight': 10, 'your': 11, 'father': 12, 'ghost': 13, 'in': 14, 'shell': 15}
[1, 2, 3, 4, 5, 6]
[3, 7, 5, 8]
[3, 9, 10]
[1, 2, 11, 12]
[3, 13, 14, 3, 15]


In [None]:
# Get the dataset: download the aclImdb_v1 archive into the current directory
# (curl -O keeps the remote file name), then unpack it. The following cells
# read from the extracted aclImdb/ directory.
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  66.3M      0  0:00:01  0:00:01 --:--:-- 66.3M


In [None]:
# Print one review from the train/pos folder to see what the raw text looks like.
!cat aclImdb/train/pos/10000_8.txt

Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they'll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it's like to be homeless? That is Goddard Bolt's lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days without th

In [None]:
# Delete the train/unsup folder before the datasets are built from aclImdb/train.
# NOTE(review): presumably done so the directory loader only sees the labelled
# class folders - confirm.
!rm -r aclImdb/train/unsup

## Nous commençons par charger l'ensemble de notre dataset

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing import text_dataset_from_directory
# Options shared by the two subsets carved out of aclImdb/train: both calls
# receive the same batch size, validation fraction and seed.
split_options = dict(batch_size=28, validation_split=0.2, seed=42)

train_dataset = text_dataset_from_directory(
    'aclImdb/train', subset='training', **split_options
)
val_dataset = text_dataset_from_directory(
    'aclImdb/train', subset='validation', **split_options
)
test_dataset = text_dataset_from_directory('aclImdb/test', batch_size=28)

# Show the number of batches in the training dataset
tf.data.experimental.cardinality(train_dataset)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


<tf.Tensor: shape=(), dtype=int64, numpy=715>

## On affiche le corpus qui est constitué de critiques de film

In [None]:
# Human-readable name of each integer label.
classes = {0: "negatif", 1: "positif"}

# Show the first five raw reviews of one training batch with their class name.
for text_batch,label_batch in train_dataset.take(1):
  reviews = text_batch.numpy()
  labels = label_batch.numpy()
  for i in range(5):
    print(reviews[i])
    classe = classes[labels[i]]
    print("the review is : {}".format(classe))

b"Rob Estes, Josie Bisset and a crap load of kids that look nothing like either of them.<br /><br />Basically, Rob and Josie have a shotgun wedding on a drunken night during a Vegas vacation. They each come home to find that their respective children already know of the nuptials due to tabloid-like not-so-fodder. They, Rob and Josie, move both of them and their eight kids into one or the other's house.<br /><br />Rob builds furniture, I think, which is close enough to Frank Lambert's (Patrick Duffy) construction job on the much similar Step by Step to warrant eternal mockage.<br /><br />Josie is some sort of cookie-making queen, though it doesn't look like she makes any of the cookies. Not close enough to Carol Foster's (Suzanne Somers)hairdressing job to warrant likeness mockage, but hilariously preposterous enough to warrant atrocity mockage.<br /><br />Unlike Step by Step, they were a couple before the vacation and actually knew one another's last names, or so one assumes if their s

## Nous allons retirer les balises HTML indésirables de notre corpus.


In [None]:
import re
import string

def standardiser_text(texte):
  """Normalise raw review text before tokenisation.

  The string tensor is lowercased and stripped, every `<...>` HTML tag is
  replaced by a single space, and all characters of `string.punctuation`
  are removed.

  Args:
    texte: string tensor holding the text to clean.

  Returns:
    The cleaned string tensor.
  """
  # Character class matching any single punctuation character.
  punctuation_class = "[%s]" % re.escape(string.punctuation)
  # Lowercase, then strip surrounding whitespace.
  cleaned = tf.strings.strip(tf.strings.lower(texte))
  # Replace HTML tags such as <br /> by a space.
  cleaned = tf.strings.regex_replace(cleaned,pattern='<[^>]+>',rewrite=' ')
  return tf.strings.regex_replace(cleaned, punctuation_class, "")

In [None]:
# Text-vectorisation layer: each string is cleaned with standardiser_text, the
# vocabulary is capped at 10000 tokens, and the output is a sequence of integer
# token ids whose length is set to 200 by output_sequence_length.
couche_vecteur = tf.keras.layers.TextVectorization(
    standardize=standardiser_text,
    max_tokens=10000,
    output_mode= 'int',
    output_sequence_length = 200,
)

In [None]:
# Keep only the review text (the label is dropped) and adapt the vectorisation
# layer on the training split.
corpus = train_dataset.map(lambda text,label:text)
couche_vecteur.adapt(corpus)

In [None]:
## The vectorisation is applied to every dataset through this function
def vectoriser_texte(text,label):
  """Turn a batch of raw reviews into token ids, leaving the labels untouched.

  Args:
    text: string tensor of reviews; a trailing axis is added before it is
      passed to `couche_vecteur`.
    label: labels that go with `text`.

  Returns:
    A `(token_ids, label)` pair.
  """
  column = tf.expand_dims(text,-1)
  token_ids = couche_vecteur(column)
  return token_ids,label

vector_train_dataset = train_dataset.map(vectoriser_texte)
vector_val_dataset = val_dataset.map(vectoriser_texte)

# Show the first ten encoded reviews of one training batch with their class name.
for text,label in vector_train_dataset.take(1):
  encoded_reviews = text.numpy()
  encoded_labels = label.numpy()
  for i in range(10):
    sequence = encoded_reviews[i]
    print(type(sequence))
    print(sequence)
    print('Ce texte est : {}'.format(classes[encoded_labels[i]]))

<class 'numpy.ndarray'>
[ 419 4429    7  286    8   46 9200    1 2204 7543 1116   14  712  312
  335 2697 4907    1 3676    1    1 4418 9144    1 1971  818    1 4739
 1983    1   23 2876    4  313   20    2  509  667    8    2 7173  101
    4    1    8    4 1483   16    4  864    1 1326 1631    1   34 1170
    6  793   44    4 4412 1705   14   22   59  965   34   23 6926  906
  163   44   12    2  265 3135    6    2    1 1326   36 1058    9    6
 1674  512   79   15  966   60  691   90  809   68  690  436 2059    1
    1  586   93   11 2371  359   13 8918 1118  522   32 2390 4539   36
   77  201    4    1  300 5280 1078   14   73   28   43    6  131   12
  419 4429    7  154  179  342  481  416  315   20    4 5549  595  191
   19   12  180    6 1790   61    6 2348    8   20    2    1  981    5
 7518 3474    2 4199    1   15 1674 1590   14   29  394  630 1668    2
 9539 5638 2571   15  419 4429  787  172 4539 1075    9  101 2125   42
  780 5226  667  265    8  419  388 7448   15   54  2

## Le Modèle : RNN

In [None]:
from tensorflow.keras import layers
import tensorflow as tf

In [None]:
def model_RNN(max_tokens=10000, embed_dimension=256):
  """Build a binary sentiment classifier around a single SimpleRNN layer.

  Args:
    max_tokens: number of rows of the embedding table; the default matches
      the `max_tokens=10000` given to the text-vectorisation layer.
    embed_dimension: length of the vector learned for each token.

  Returns:
    An uncompiled `tf.keras.Model` that maps a batch of int64 token-id
    sequences of any length to one sigmoid output per sequence.
  """
  # Named `inputs` so the builtin `input` is not shadowed.
  inputs = tf.keras.Input(shape=(None,),dtype='int64')
  ### Feature extraction
  # mask_zero=True lets the recurrent layer skip padded positions instead of
  # reading them as real tokens (assumes index 0 is the padding id produced by
  # the vectorisation layer).
  x = layers.Embedding(max_tokens,embed_dimension,mask_zero=True)(inputs)
  x = layers.SimpleRNN(64,activation='tanh')(x)
  ### Classifier
  predictions = layers.Dense(1,activation='sigmoid')(x)
  return tf.keras.Model(inputs,predictions)

In [None]:
# Instantiate the SimpleRNN classifier and compile it with binary cross-entropy,
# Adam at a 1e-4 learning rate, and accuracy as the reported metric.
model = model_RNN()
model.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(1e-4),metrics=['accuracy'])

In [None]:
# Print the layers of the model with their output shapes and parameter counts.
model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_5 (Embedding)     (None, None, 256)         2560000   
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 64)                20544     
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2,580,609
Trainable params: 2,580,609
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 10 epochs on the vectorised training set, using the vectorised
# validation set as validation data; the object returned by fit is kept in `history`.
history = model.fit(vector_train_dataset,validation_data=vector_val_dataset,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Vectorise the test set with the same function as the training data, then
# evaluate the current `model` on it (loss followed by the compiled accuracy metric).
vector_test_dataset = test_dataset.map(vectoriser_texte)
model.evaluate(vector_test_dataset)



[0.9135347008705139, 0.7273600101470947]

## Bidirectional RNN

In [None]:
def model_biRNN(max_tokens=10000, embed_dimension=256):
  """Build a binary sentiment classifier around a bidirectional SimpleRNN.

  Args:
    max_tokens: number of rows of the embedding table; the default matches
      the `max_tokens=10000` given to the text-vectorisation layer.
    embed_dimension: length of the vector learned for each token.

  Returns:
    An uncompiled `tf.keras.Model` that maps a batch of int64 token-id
    sequences of any length to one sigmoid output per sequence.
  """
  # Named `inputs` so the builtin `input` is not shadowed.
  inputs = tf.keras.Input(shape=(None,),dtype='int64')
  ### Feature extraction
  # mask_zero=True lets the recurrent layers skip padded positions instead of
  # reading them as real tokens (assumes index 0 is the padding id produced by
  # the vectorisation layer).
  x = layers.Embedding(max_tokens,embed_dimension,mask_zero=True)(inputs)
  x = layers.Bidirectional(layers.SimpleRNN(64,activation='tanh'))(x)
  ### Classifier
  predictions = layers.Dense(1,activation='sigmoid')(x)
  return tf.keras.Model(inputs,predictions)

# Instantiate the bidirectional SimpleRNN classifier, compile it with binary
# cross-entropy, Adam at a 1e-4 learning rate and the accuracy metric, then
# print its layer summary.
model = model_biRNN()
model.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(1e-4),metrics=['accuracy'])
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_6 (Embedding)     (None, None, 256)         2560000   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              41088     
 l)                                                              
                                                                 
 dense_6 (Dense)             (None, 1)                 129       
                                                                 
Total params: 2,601,217
Trainable params: 2,601,217
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 10 epochs on the vectorised training set, using the vectorised
# validation set as validation data; the object returned by fit is kept in `history`.
history = model.fit(vector_train_dataset,validation_data=vector_val_dataset,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## LSTM

#### LSTM networks are particularly effective in processing and predicting sequences of data, such as speech, text, and time series data. They are designed to address the vanishing gradient problem, which occurs when the gradients in the backpropagation algorithm diminish exponentially over time, making it difficult for traditional RNNs to capture long-term dependencies.

#### The key idea behind LSTM is the introduction of memory cells, which are responsible for storing and retrieving information over extended time intervals. The memory cells are composed of three main components: an input gate, a forget gate, and an output gate. 

#### Input Gate: It determines how much new information should be stored in the memory cells. It takes input from the current time step and the previous hidden state and applies a sigmoid activation function to produce values between 0 and 1. A value close to 0 means that the information should be ignored, while a value close to 1 means that the information should be stored.

#### Forget Gate: It decides what information should be discarded from the memory cells. It takes input from the current time step and the previous hidden state and applies a sigmoid activation function. This gate outputs values between 0 and 1, with 0 indicating that the information should be forgotten and 1 indicating that the information should be retained.

#### Output Gate: It determines how much information from the memory cells should be exposed as the output of the current time step. It takes input from the current time step and the previous hidden state and applies a sigmoid activation function, producing values between 0 and 1 that indicate the amount of information to expose. The current memory cell values are passed through the hyperbolic tangent (tanh) activation function and then multiplied element-wise by these gate values to give the output.

##### Equation of LSTM

###### $i_t = \sigma(W_i h_{t-1} + U_i x_t + b_i)$ — input gate
###### $f_t = \sigma(W_f h_{t-1} + U_f x_t + b_f)$ — forget gate
###### $g_t = \tanh(W_g h_{t-1} + U_g x_t + b_g)$ — candidate memory cell
###### $c_t = f_t \odot c_{t-1} + i_t \odot g_t$ — memory cell
###### $o_t = \sigma(W_o h_{t-1} + U_o x_t + b_o)$ — output gate
###### $h_t = o_t \odot \tanh(c_t)$ — hidden state
###### $y_t = h_t$ — cell output

In [None]:
def model_LSTM(max_tokens=10000, embed_dimension=256):
  """Build a binary sentiment classifier around a single LSTM layer.

  Args:
    max_tokens: number of rows of the embedding table; the default matches
      the `max_tokens=10000` given to the text-vectorisation layer.
    embed_dimension: length of the vector learned for each token.

  Returns:
    An uncompiled `tf.keras.Model` that maps a batch of int64 token-id
    sequences of any length to one sigmoid output per sequence.
  """
  # Named `inputs` so the builtin `input` is not shadowed.
  inputs = tf.keras.Input(shape=(None,),dtype='int64')
  ### Feature extraction
  # mask_zero=True lets the recurrent layer skip padded positions instead of
  # reading them as real tokens (assumes index 0 is the padding id produced by
  # the vectorisation layer).
  x = layers.Embedding(max_tokens,embed_dimension,mask_zero=True)(inputs)
  # Dropout with rate 0.2 on the embedded tokens.
  x = layers.Dropout(0.2)(x)
  x = layers.LSTM(64)(x)
  ### Classifier
  predictions = layers.Dense(1,activation='sigmoid')(x)
  return tf.keras.Model(inputs,predictions)

In [None]:
# Instantiate the LSTM classifier, compile it with binary cross-entropy, Adam
# at a 1e-4 learning rate and the accuracy metric, then print its layer summary.
model = model_LSTM()
model.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(1e-4),metrics=['accuracy'])
model.summary()

NameError: ignored

In [None]:
# Train for 10 epochs on the vectorised training set, using the vectorised
# validation set as validation data; the object returned by fit is kept in `history`.
history = model.fit(vector_train_dataset,validation_data=vector_val_dataset,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Bidirectional LSTM

In [None]:
def model_biLSTM(max_tokens=10000, embed_dimension=256):
  """Build a binary sentiment classifier around a bidirectional LSTM.

  Args:
    max_tokens: number of rows of the embedding table; the default matches
      the `max_tokens=10000` given to the text-vectorisation layer.
    embed_dimension: length of the vector learned for each token.

  Returns:
    An uncompiled `tf.keras.Model` that maps a batch of int64 token-id
    sequences of any length to one sigmoid output per sequence.
  """
  # Named `inputs` so the builtin `input` is not shadowed.
  inputs = tf.keras.Input(shape=(None,),dtype='int64')
  ### Feature extraction
  # mask_zero=True lets the recurrent layers skip padded positions instead of
  # reading them as real tokens (assumes index 0 is the padding id produced by
  # the vectorisation layer).
  x = layers.Embedding(max_tokens,embed_dimension,mask_zero=True)(inputs)
  # Dropout with rate 0.2 on the embedded tokens.
  x = layers.Dropout(0.2)(x)
  x = layers.Bidirectional(layers.LSTM(64))(x)
  ### Classifier
  predictions = layers.Dense(1,activation='sigmoid')(x)
  return tf.keras.Model(inputs,predictions)

# Instantiate the bidirectional LSTM classifier, compile it with binary
# cross-entropy, Adam at a 1e-4 learning rate and the accuracy metric, then
# print its layer summary.
model = model_biLSTM()
model.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(1e-4),metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 256)         2560000   
                                                                 
 dropout (Dropout)           (None, None, 256)         0         
                                                                 
 bidirectional (Bidirectiona  (None, 128)              164352    
 l)                                                              
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 2,724,481
Trainable params: 2,724,481
Non-trainable params: 0
___________________________________________________

In [None]:
# Train for 10 epochs on the vectorised training set, using the vectorised
# validation set as validation data; the object returned by fit is kept in `history`.
history = model.fit(vector_train_dataset,validation_data=vector_val_dataset,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## GRU

##### GRU was introduced as a way to address some of the complexities and computational overhead associated with LSTM while still capturing long-term dependencies in sequential data.

##### Update Gate (z): It determines how much of the previous hidden state should be passed along to the current time step. It takes input from the current time step and the previous hidden state and applies a sigmoid activation function. A value close to 1 means that the previous hidden state should be fully retained, while a value close to 0 means that the previous hidden state should be mostly ignored.

##### Reset Gate (r): It controls how much of the previous hidden state is used to compute the current hidden state candidate. It takes input from the current time step and the previous hidden state and applies a sigmoid activation function. A value close to 1 means that the previous hidden state should be fully used, while a value close to 0 means that the previous hidden state should be mostly ignored.

In [None]:
def model_biGRU(max_tokens=10000, embed_dimension=256):
  """Build a binary sentiment classifier around a bidirectional GRU.

  Args:
    max_tokens: number of rows of the embedding table; the default matches
      the `max_tokens=10000` given to the text-vectorisation layer.
    embed_dimension: length of the vector learned for each token.

  Returns:
    An uncompiled `tf.keras.Model` that maps a batch of int64 token-id
    sequences of any length to one sigmoid output per sequence.
  """
  # Named `inputs` so the builtin `input` is not shadowed.
  inputs = tf.keras.Input(shape=(None,),dtype='int64')
  ### Feature extraction
  # mask_zero=True lets the recurrent layers skip padded positions instead of
  # reading them as real tokens (assumes index 0 is the padding id produced by
  # the vectorisation layer).
  x = layers.Embedding(max_tokens,embed_dimension,mask_zero=True)(inputs)
  x = layers.Bidirectional(layers.GRU(64))(x)
  ### Classifier
  predictions = layers.Dense(1,activation='sigmoid')(x)
  return tf.keras.Model(inputs,predictions)

# Instantiate the bidirectional GRU classifier, compile it with binary
# cross-entropy, Adam at a 1e-4 learning rate and the accuracy metric, then
# print its layer summary.
model = model_biGRU()
model.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(1e-4),metrics=['accuracy'])
model.summary()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_8 (Embedding)     (None, None, 256)         2560000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              123648    
 nal)                                                            
                                                                 
 dense_8 (Dense)             (None, 1)                 129       
                                                                 
Total params: 2,683,777
Trainable params: 2,683,777
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 10 epochs on the vectorised training set, using the vectorised
# validation set as validation data; the object returned by fit is kept in `history`.
history = model.fit(vector_train_dataset,validation_data=vector_val_dataset,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
