<a href="https://colab.research.google.com/github/pkrodev/ML_Tutorial/blob/main/6/Movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import the required libraries
import os

import numpy as np
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [2]:
!wget https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
!unzip -q reviews.zip

--2024-03-27 10:25:29--  https://storage.googleapis.com/esmartdata-courses-files/ann-course/reviews.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.196.207, 173.194.218.207, 108.177.11.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.196.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42878657 (41M) [application/x-zip-compressed]
Saving to: ‘reviews.zip’


2024-03-27 10:25:31 (22.6 MB/s) - ‘reviews.zip’ saved [42878657/42878657]



In [3]:
# Load the raw reviews from disk
data_dir = './reviews'


def load_reviews(split_dir):
    """Read every ``.txt`` review under ``split_dir/neg`` and ``split_dir/pos``.

    Args:
        split_dir: directory that contains the ``neg`` and ``pos`` sub-folders.

    Returns:
        A ``(texts, labels)`` pair of equally long lists; the label is 0 for
        files found under ``neg`` and 1 for files found under ``pos``.
    """
    texts, labels = [], []
    for label, label_type in enumerate(['neg', 'pos']):
        dir_name = os.path.join(split_dir, label_type)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample order identical from run to run.
        for fname in sorted(os.listdir(dir_name)):
            if fname.endswith('.txt'):
                # Explicit encoding so decoding does not depend on the
                # platform's default locale.
                with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                    texts.append(f.read())
                labels.append(label)
    return texts, labels


train_dir = os.path.join(data_dir, 'train')  # './reviews/train'
train_texts, train_labels = load_reviews(train_dir)  # raw training texts, labels 1 = pos / 0 = neg

test_dir = os.path.join(data_dir, 'test')  # './reviews/test'
test_texts, test_labels = load_reviews(test_dir)  # raw test texts, labels 1 = pos / 0 = neg

print(len(train_texts))
print(len(test_texts))

25000
25000


In [4]:
# Model settings
maxlen = 100  # sequence length passed to pad_sequences for every review
num_words = 10000  # vocabulary cap given to the Tokenizer (most frequent words)
embedding_dim = 100  # embedding vector size used by the dense model's Embedding layer


In [5]:

# Build the vocabulary from the training reviews only; when encoding, the
# tokenizer keeps just the `num_words` most frequent words.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)

# Word -> integer index mapping learned during fitting, and how many distinct
# words it holds.
word_index = tokenizer.word_index
print(f'{len(word_index)} unikatowych słów.')

# Encode every training review as a list of word indices, then bring all of
# them to the common length `maxlen`.
sequences = tokenizer.texts_to_sequences(train_texts)
train_data = pad_sequences(sequences, maxlen=maxlen)

# Labels as a numpy array, matching the array form of `train_data`.
train_labels = np.array(train_labels)


88582 unikatowych słów.


In [6]:
# Sanity check: one row per review, `maxlen` columns after padding.
train_data.shape

(25000, 100)

In [7]:
# Sanity check: one label per review.
train_labels.shape

(25000,)

In [8]:
# Prepare the training and validation splits

# Number of samples assigned to each split
training_samples = 15000  # samples used for training
validation_samples = 10000  # samples used for validation
assert training_samples + validation_samples <= train_data.shape[0], \
    'training_samples + validation_samples exceeds the number of available reviews'

# Draw one random permutation of the row indices. The generator is seeded so
# the same split is produced on every run, and `train_data` / `train_labels`
# are left untouched so re-running this cell does not reshuffle an already
# shuffled array.
rng = np.random.default_rng(42)
indices = rng.permutation(train_data.shape[0])

train_indices = indices[:training_samples]
val_indices = indices[training_samples: training_samples + validation_samples]

# The first `training_samples` shuffled rows (and their labels) form the training set
X_train = train_data[train_indices]
y_train = train_labels[train_indices]

# The next `validation_samples` shuffled rows form the validation set
X_val = train_data[val_indices]
y_val = train_labels[val_indices]


In [9]:
# Baseline classifier: embed each token, flatten the whole sequence and
# classify it with a small dense head ending in a single sigmoid unit.
model = Sequential()
# Declare the input shape explicitly instead of through Embedding's
# `input_length` argument, which newer Keras releases deprecate. The model is
# still built up front, so summary() can report shapes and parameter counts.
model.add(Input(shape=(maxlen,), dtype='int32'))
model.add(Embedding(num_words, embedding_dim))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense (Dense)               (None, 16)                160016    
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1160033 (4.43 MB)
Trainable params: 1160033 (4.43 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Binary cross-entropy pairs with the model's single sigmoid output; accuracy
# is tracked so it is recorded in the training history.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [11]:
# Train for 5 epochs with batches of 32, evaluating on the validation split;
# the returned History object is kept for plotting.
history = model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
def plot_hist(history, metrics=('accuracy', 'loss')):
    """Plot training and validation curves from a Keras training history.

    One Plotly figure is shown per entry in ``metrics``. Each figure contains
    the training curve and, when the history has a matching ``val_<metric>``
    column, the validation curve. The y-axis is logarithmic.

    Args:
        history: object exposing ``history`` (mapping of metric name to a
            per-epoch list of values) and ``epoch`` (list of epoch numbers),
            as returned by ``model.fit``.
        metrics: names of the metrics to plot, one figure each. Defaults to
            accuracy followed by loss.

    Raises:
        KeyError: if a requested metric is not present in ``history.history``.
    """
    # Imported inside the function, so pandas and plotly are only loaded when
    # a plot is actually requested.
    import pandas as pd
    import plotly.graph_objects as go

    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    for metric in metrics:
        val_metric = f'val_{metric}'
        # Validation columns are missing when fit() ran without validation data.
        has_val = val_metric in hist.columns

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=hist['epoch'], y=hist[metric], name=metric, mode='markers+lines'))
        if has_val:
            fig.add_trace(go.Scatter(x=hist['epoch'], y=hist[val_metric], name=val_metric, mode='markers+lines'))

        title = f'{metric} vs. val {metric}' if has_val else metric
        fig.update_layout(width=1000, height=500, title=title, xaxis_title='Epoki', yaxis_title=metric, yaxis_type='log')
        fig.show()

# Learning curves of the dense model trained above.
plot_hist(history)

In [13]:
# Encode the test reviews with the tokenizer fitted on the training texts and
# pad them to the same `maxlen` used for the training data.
# Note: this rebinds `sequences`, which previously held the training sequences.
sequences = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(test_labels)

# Reports the compiled loss followed by accuracy on the test set.
model.evaluate(X_test, y_test, verbose=1)



[0.6934953927993774, 0.8285599946975708]

In [14]:
from tensorflow.keras.layers import SimpleRNN, LSTM

In [15]:
# Recurrent variant: 32-dimensional embeddings fed to a 16-unit SimpleRNN,
# followed by a single sigmoid output. Rebinding `model` replaces the
# previously trained network.
model = Sequential()
# Use the shared `num_words` setting rather than a literal so the embedding
# always matches the tokenizer's vocabulary cap.
model.add(Embedding(num_words, 32))
model.add(SimpleRNN(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          320000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 16)                784       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320801 (1.22 MB)
Trainable params: 320801 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
# Same training setup as before: binary cross-entropy for the single sigmoid
# output, with accuracy recorded in the history.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [17]:
# Train the SimpleRNN model for 10 epochs on the same train/validation split;
# `history` is overwritten with this run's curves.
history=model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# Learning curves of the SimpleRNN model.
plot_hist(history)

In [19]:
# LSTM variant: same layout as the SimpleRNN model with the recurrent layer
# swapped for a 16-unit LSTM. Rebinding `model` replaces the previous network.
model = Sequential()
# Use the shared `num_words` setting rather than a literal so the embedding
# always matches the tokenizer's vocabulary cap.
model.add(Embedding(num_words, 32))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          320000    
                                                                 
 lstm (LSTM)                 (None, 16)                3136      
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 323153 (1.23 MB)
Trainable params: 323153 (1.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# Same training setup as before: binary cross-entropy for the single sigmoid
# output, with accuracy recorded in the history.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [21]:
# Train the LSTM model for 10 epochs on the same train/validation split;
# `history` is overwritten with this run's curves.
history=model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# Learning curves of the LSTM model.
plot_hist(history)

In [23]:
# X_test and y_test were already built with the fitted tokenizer when the
# dense model was evaluated. The tokenizer has not been refitted since, so
# they are reused here instead of re-encoding every test review.
# Reports the compiled loss followed by accuracy of the LSTM model.
model.evaluate(X_test, y_test, verbose=1)



[0.49371135234832764, 0.8343999981880188]