In [1]:
import numpy as np
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, SimpleRNN, LSTM
from tensorflow import keras
keras.__version__


'2.6.0'

# Prepare data

In [2]:
data_dir = './reviews'
train_dir = os.path.join(data_dir, 'train')

# Read every '.txt' review found under <train_dir>/neg and <train_dir>/pos.
# train_texts[i] is the raw review text and train_labels[i] its class:
# 0 for 'neg', 1 for 'pos'.
train_texts = []
train_labels = []
for label in ['neg', 'pos']:
    label_dir = os.path.join(train_dir, label)
    # os.listdir gives no ordering guarantee; sorting keeps the sample order
    # (and therefore the preview printed below) stable between runs.
    for file_ in sorted(os.listdir(label_dir)):
        if not file_.endswith('.txt'):
            continue
        # Decode with an explicit encoding rather than the platform default so
        # the same files load identically on every machine.
        # NOTE(review): assumes the review files are UTF-8 -- confirm for this
        # copy of the dataset. A file that cannot be decoded raises
        # UnicodeDecodeError instead of being silently skipped or deleted.
        # The context manager closes the handle even if read() fails.
        with open(os.path.join(label_dir, file_), encoding='utf-8') as f:
            train_texts.append(f.read())
        train_labels.append(1 if label == 'pos' else 0)

print(train_texts[2:4])

["This film lacked something I couldn't put my finger on at first: charisma on the part of the leading actress. This inevitably translated to lack of chemistry when she shared the screen with her leading man. Even the romantic scenes came across as being merely the actors at play. It could very well have been the director who miscalculated what he needed from the actors. I just don't know.<br /><br />But could it have been the screenplay? Just exactly who was the chef in love with? He seemed more enamored of his culinary skills and restaurant, and ultimately of himself and his youthful exploits, than of anybody or anything else. He never convinced me he was in love with the princess.<br /><br />I was disappointed in this movie. But, don't forget it was nominated for an Oscar, so judge for yourself.", 'Sorry everyone,,, I know this is supposed to be an "art" film,, but wow, they should have handed out guns at the screening so people could blow their brains out and not watch. Although th

In [3]:
# Number of reviews that were loaded.
len(train_texts)

24987

In [4]:
# Show the first and last ten labels of the list built by the loading loop.
train_labels[:10] + train_labels[-10:]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# Tokenize

In [5]:
# Length every sequence is brought to by pad_sequences.
max_len = 100
# Vocabulary size handed to the Tokenizer and to the Embedding layers.
num_words = 10_000
# NOTE(review): not referenced by the model cells in this notebook, which
# hard-code an embedding size of 32 -- either use it there or drop it.
embedding_dims = 100

In [6]:
# Fit the tokenizer's vocabulary on the training texts.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_texts)
# Display the first ten (index, word) pairs of the fitted vocabulary.
list(tokenizer.index_word.items())[:10]

[(1, 'the'),
 (2, 'and'),
 (3, 'a'),
 (4, 'of'),
 (5, 'to'),
 (6, 'is'),
 (7, 'br'),
 (8, 'in'),
 (9, 'it'),
 (10, 'i')]

In [7]:
# Turn each review into a list of integer word indices and preview the first two.
sequences = tokenizer.texts_to_sequences(train_texts)
print(sequences[:2])

[[62, 4, 3, 129, 34, 45, 7569, 1412, 15, 3, 4248, 514, 43, 16, 3, 633, 133, 12, 6, 3, 1300, 460, 4, 1754, 209, 3, 7688, 308, 6, 676, 80, 32, 2135, 1112, 3006, 31, 1, 929, 4, 42, 5115, 469, 9, 2678, 1754, 1, 223, 55, 16, 54, 828, 1324, 850, 228, 9, 40, 96, 122, 1483, 57, 145, 36, 1, 996, 141, 26, 676, 122, 1, 411, 59, 93, 2276, 304, 770, 5, 3, 837, 20, 3, 1755, 646, 42, 125, 71, 22, 236, 101, 16, 46, 49, 624, 31, 708, 84, 708, 379, 3491, 2, 8412, 67, 26, 107, 3363], [4514, 514, 14, 3, 3415, 159, 8589, 1702, 6, 4881, 53, 16, 4515, 5667, 138, 5, 1022, 4985, 3050, 4516, 589, 1339, 34, 6, 1543, 95, 3, 759, 4, 5, 24, 3534, 8, 4, 9, 109, 3051, 5, 1, 1069, 14, 3, 4553, 79, 20, 2084, 6, 4516, 578, 2793, 7356, 38, 489, 1, 8589, 302, 122, 14, 4282, 18, 1692, 942, 1, 1702, 6, 6532, 31, 1, 998, 1809, 668, 24, 104, 2599, 486, 34, 3283, 1, 6692, 1048, 43, 16, 2751, 2545, 33, 1340, 5, 2102, 1, 4515, 1536, 20, 3, 1702, 3246, 20, 32, 4344, 1104, 18, 134, 228, 24, 4756, 217, 1926, 32, 3226, 8, 1, 4673, 1

In [8]:
# word_index is the tokenizer's word -> index mapping; report how many entries it has.
# In the recorded run this is 88451, i.e. more than num_words.
word_index = tokenizer.word_index
# The message below is Polish for "<n> unique values".
print(f"{len(word_index)} unikatowych wartości ")


88451 unikatowych wartości 


# Pad / truncate sequences to a fixed length

In [9]:
# Bring every sequence to exactly max_len entries so they stack into one 2-D array.
# NOTE(review): pad_sequences is called with its defaults, which in Keras pad
# and truncate at the front of a sequence -- confirm that keeping the end of
# long reviews is intended.
train_data = pad_sequences(sequences, maxlen=max_len)
train_data.shape

(24987, 100)

In [10]:
# Inspect a single row of the fixed-length array.
train_data[1]

array([4310,   45,  299,  234,    9,   13,    3, 1316,    5,  320,    8,
         11,   28,   55,  731, 2278,  589, 1339,  269,  152,   79,   28,
         55,  731, 2278,  844, 2104,  269, 1814,  134, 2695, 1362,  844,
          6,  345,  114,    5,   78,   47,   23,  957,    4,   82, 1076,
       1585,    5,  165,   43,   15,   96,    7,    7, 4514,    6,    1,
         88, 1683,    4,    1,  287, 4514,  105,   35,  227,   10,  420,
          1, 1005,  492,    9,   57,   44,   33,   68,    3,  224,  706,
          1,  362, 1896,  455,  149,  336,  148,    3,   19,   41,    3,
       1702,   40, 1607,   26,   11,  355,   39, 1474,   31,    1, 4514,
       5450])

In [11]:
# Convert the label list to a NumPy array so it can be reordered with an
# index array in the same way as train_data.
train_labels = np.asarray(train_labels)


In [12]:
# Shuffle samples.
# The reviews were loaded class by class (all 'neg' first, then all 'pos'), so
# they have to be mixed before the data is cut positionally into training and
# validation parts. A seeded generator makes the permutation -- and with it the
# split -- identical after every kernel restart. This seeds only the data
# shuffle; it does not make model training itself deterministic.
rng = np.random.default_rng(42)
indices = rng.permutation(train_data.shape[0])
# Apply the same permutation to both arrays so texts and labels stay aligned.
train_data = train_data[indices]
train_labels = train_labels[indices]

In [13]:
# Split the (already shuffled) samples: the first three fifths are used for
# training, the remainder for validation.
train_data_len = len(train_data)
training_samples = (3 * train_data_len) // 5
validation_samples = train_data_len - training_samples

# Leading slice -> training set, trailing slice -> validation set.
X_train, X_val = train_data[:training_samples], train_data[training_samples:]
y_train, y_val = train_labels[:training_samples], train_labels[training_samples:]

# Building SimpleRNN model

In [26]:
# SimpleRNN classifier: word indices -> 32-dimensional embeddings ->
# 16-unit SimpleRNN -> one sigmoid output unit.
model = Sequential([
    Embedding(num_words, 32),
    SimpleRNN(16),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 320,801
Trainable params: 320,801
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Configure training: RMSprop optimizer, binary cross-entropy loss (the model
# ends in a single sigmoid unit) and accuracy as the reported metric.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [28]:
# Train for 10 epochs with batches of 32 samples, passing the validation split
# so validation metrics are recorded alongside the training ones in `history`.
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
def plot_hist(history):
    """Show training-vs-validation curves for accuracy and for loss.

    Parameters
    ----------
    history
        Object with a ``history`` attribute (mapping of metric name to
        per-epoch values that includes ``accuracy``, ``val_accuracy``,
        ``loss`` and ``val_loss``) and an ``epoch`` attribute holding the
        epoch indices, such as the value returned by ``model.fit``.

    Returns
    -------
    None
        Two plotly figures are displayed, accuracy first and loss second.
    """
    import pandas as pd
    import plotly.graph_objects as go

    curves = pd.DataFrame(history.history)
    curves['epoch'] = history.epoch

    # One entry per figure: (training column, validation column, figure title).
    panels = (
        ('accuracy', 'val_accuracy', 'accuracy vs val accuracy'),
        ('loss', 'val_loss', 'loss vs val loss'),
    )
    for train_col, val_col, title in panels:
        fig = go.Figure()
        # Training curve first, then validation, so the legend order is stable.
        for column in (train_col, val_col):
            fig.add_trace(go.Scatter(x=curves['epoch'], y=curves[column], name=column, mode='markers+lines'))
        fig.update_layout(width=1000, height=600, title=title, xaxis_title='Epochs', yaxis_title=train_col)
        fig.show()


In [29]:
# Plot the accuracy and loss curves recorded while fitting the SimpleRNN model.
plot_hist(history)

# Building LSTM model

In [22]:
# LSTM classifier: word indices -> 32-dimensional embeddings ->
# 16-unit LSTM -> one sigmoid output unit.
# Note: this rebinds `model`, replacing the previously built network.
model = Sequential([
    Embedding(num_words, 32),
    LSTM(16),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
lstm (LSTM)                  (None, 16)                3136      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 323,153
Trainable params: 323,153
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Configure training: RMSprop optimizer, binary cross-entropy loss (the model
# ends in a single sigmoid unit) and accuracy as the reported metric.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [24]:
# Train for 10 epochs with batches of 32 samples, passing the validation split
# so validation metrics are recorded alongside the training ones.
# Note: this rebinds `history`, replacing the result of any earlier fit.
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Plot the accuracy and loss curves recorded while fitting the LSTM model.
plot_hist(history)