In [44]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.layers import Dense, Input
from keras.models import Sequential
from keras.datasets import imdb

num_words=10000

(X_train, y_train) , (X_test, y_test) = imdb.load_data(num_words=num_words)
print('Numero di esempi del dataset: %d' % X_train.shape[0])
print('Numero di esempi del test: %d' % X_test.shape[0])

Numero di esempi del dataset: 25000
Numero di esempi del test: 25000


Nel caso di reti neurali ricorrenti non conviene utilizzare il one hot encoding, ma il word embedding:

In [45]:
#vediamo quanto è lunga la più lunga recensione all'interno del corpus di testo
longest_review = max(X_train, key=len)  #viene effettuata la ricerca per lunghezza massima
len(longest_review)                     #2494 parole
longest_review                          #contiene gli indici delle parole nel dizionario

[1,
 1014,
 300,
 4349,
 768,
 2702,
 1014,
 2,
 1538,
 5,
 3483,
 5934,
 1918,
 1812,
 2,
 5,
 1378,
 9125,
 2,
 1538,
 5,
 3483,
 5934,
 645,
 183,
 125,
 19,
 6,
 4349,
 768,
 2702,
 1014,
 429,
 1812,
 2,
 5,
 1378,
 9125,
 1793,
 8,
 4,
 2269,
 7,
 4,
 1014,
 199,
 9149,
 28,
 8,
 140,
 143,
 8915,
 11,
 661,
 8,
 79,
 4,
 1176,
 9125,
 5,
 2,
 2,
 56,
 402,
 23,
 34,
 656,
 505,
 2,
 86,
 3483,
 5,
 95,
 2,
 1538,
 6,
 1124,
 2,
 34,
 9125,
 8,
 2,
 562,
 4,
 1933,
 46,
 7,
 4,
 5934,
 597,
 3483,
 805,
 8,
 339,
 27,
 597,
 21,
 4,
 2,
 6075,
 90,
 137,
 9125,
 5,
 2,
 2,
 56,
 23,
 90,
 11,
 4,
 3130,
 19,
 9125,
 2,
 245,
 23,
 2,
 2,
 270,
 56,
 6,
 2702,
 1005,
 3483,
 2,
 83,
 4,
 1746,
 5,
 2,
 120,
 4,
 350,
 5836,
 1646,
 2,
 23,
 4,
 1005,
 103,
 9026,
 5,
 656,
 459,
 7,
 3483,
 2,
 9491,
 6,
 2702,
 83,
 4,
 1746,
 5,
 1678,
 4,
 2,
 270,
 12,
 56,
 4,
 2,
 95,
 270,
 56,
 18,
 6,
 1405,
 2,
 39,
 4,
 655,
 5836,
 63,
 62,
 28,
 276,
 2,
 143,
 4,
 2702,
 21,
 3483,
 

In [46]:
word_index = imdb.get_word_index()    #mi prendo il dizionario
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])    #lo ribalto

decoded_review = [reverse_word_index.get(i-3, '?') for i in longest_review]     #prendo l'indice - 3 perchè i primi 3 caratteri sono riservati
decoded_review = ' '.join(decoded_review)    #lo trasformo in una frase
decoded_review

"? match 1 tag team table match ? ray and spike dudley vs eddie ? and chris benoit ? ray and spike dudley started things off with a tag team table match against eddie ? and chris benoit according to the rules of the match both opponents have to go through tables in order to get the win benoit and ? ? up early on by taking turns ? first spike and then ? ray a german ? by benoit to ? took the wind out of the dudley brother spike tried to help his brother but the ? restrained him while benoit and ? ? up on him in the corner with benoit ? away on ? ? set up a table outside spike ? into the ring and ? over the top rope onto ? on the outside after recovering and taking care of spike ? slipped a table into the ring and helped the ? set it up the ? then set up for a double ? from the middle rope which would have put ? through the table but spike knocked the table over right before his brother came crashing down ? and benoit ? another table in the corner and tried to irish whip spike through it

In [47]:
shortest_review = min(X_train, key=len)     #la recensione più corta contiene 11 parole
decoded_review = [reverse_word_index.get(i-3, '?') for i in shortest_review]     #prendo l'indice - 3 perchè i primi 3 caratteri sono riservati
decoded_review = ' '.join(decoded_review) 
decoded_review

"? i wouldn't rent this one even on dollar rental night"

portiamo tutte le recensioni alla stessa lunghezza: le tronco ed eventualmente aggiungo padding a quelle più corte

In [49]:
from keras.preprocessing.sequence import pad_sequences

max_lenght = 50

X_train = pad_sequences(X_train, maxlen=max_lenght, padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_lenght, padding='post', truncating='post')

(25000, 50)

Creiamo la rete neurale ricorrente:

In [50]:
from keras.layers import Embedding, SimpleRNN      #per il word embedding lo aggiungiamo come se fosse uno strato della rete

model = Sequential()
model.add(Embedding(num_words, 50))     #necessita del numero di parole e del numero di embedding da creare
model.add(SimpleRNN(32))                #con 32 nodi, utilizza di default la tanh come funz di attivaz
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [51]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])  #rmsprop adatto per le cnn

In [52]:
model.fit(X_train, y_train, batch_size=512, validation_split=0.2, epochs=10)

Epoch 1/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.5215 - loss: 0.6928 - val_accuracy: 0.5952 - val_loss: 0.6659
Epoch 2/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.6927 - loss: 0.5983 - val_accuracy: 0.5914 - val_loss: 0.8599
Epoch 3/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.7449 - loss: 0.5416 - val_accuracy: 0.6342 - val_loss: 0.6486
Epoch 4/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.7967 - loss: 0.4570 - val_accuracy: 0.7242 - val_loss: 0.5548
Epoch 5/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.8198 - loss: 0.4157 - val_accuracy: 0.7478 - val_loss: 0.5264
Epoch 6/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.8645 - loss: 0.3390 - val_accuracy: 0.6694 - val_loss: 0.6541
Epoch 7/10
[1m40/40[0m [32m━━━━

<keras.src.callbacks.history.History at 0x2935b09e0>

averfitting sul set di validazione, proviamo sul test:

In [53]:
model.evaluate(X_test, y_test)              #risultati scarsi perchè abbiamo utilizzato 50 parole (rimosso tanta informazione)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7364 - loss: 0.6853


[0.6917542815208435, 0.7340800166130066]