# Cài đặt thư viện

In [22]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow

# Data Downloading

In [2]:
import gdown
gdown.download(f'https://drive.google.com/uc?id=1FyX7UavGoqU28I_2owXy8KM4HCTXOql2', output = 'IMDB_Dataset', quiet= False, fuzzy= False)

Downloading...
From: https://drive.google.com/uc?id=1FyX7UavGoqU28I_2owXy8KM4HCTXOql2
To: c:\Users\Hello!!!\Documents\thuc-hanh-deep-learning\Week_2\IMDB_Dataset
100%|██████████| 66.2M/66.2M [00:01<00:00, 37.2MB/s]


'IMDB_Dataset'

# Data Pre-Processing

In [5]:
data = pd.read_csv('IMDB_Dataset')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
print(data['sentiment'].value_counts())

sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [7]:
data.replace({'sentiment': {'positive': 1, 'negative': 0}}, inplace= True)

  data.replace({'sentiment': {'positive': 1, 'negative': 0}}, inplace= True)


In [10]:
data_samples = data[:10000]
train_data, test_data = train_test_split(data_samples, test_size= 0.5, random_state= 42)

In [11]:
# Kiểm tra dữ liệu có cân bằng chưa => Ta thấy sự chênh lệch của nhãn 0 và 1 không nhiều nên không cần áp dụng các kỹ thuật cân bằng dữ liệu
print(train_data['sentiment'].value_counts())
print(test_data['sentiment'].value_counts())

sentiment
1    2557
0    2443
Name: count, dtype: int64
sentiment
0    2529
1    2471
Name: count, dtype: int64


In [16]:
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train_data['review'])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['review']), maxlen= 500)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['review']), maxlen= 500)

In [17]:
Y_train = train_data['sentiment']
Y_test = test_data['sentiment']

# Building to Model

In [18]:
def build_model(hidden_layers, neurons_per_layer, activation, dropout_rate,
                optimizer, learning_rate, embedding_dim=100):
    model = Sequential()

    # Embedding layer
    model.add(Embedding(input_dim= 5000, output_dim= 128, input_length=500))

    # Hidden layers
    for i in range(hidden_layers):
        if i == 0:
            model.add(Bidirectional(LSTM(neurons_per_layer, return_sequences=(hidden_layers > 1))))
        elif i == hidden_layers - 1:
            model.add(Bidirectional(LSTM(neurons_per_layer)))
        else:
            model.add(Bidirectional(LSTM(neurons_per_layer, return_sequences=True)))

        # Add dropout after each LSTM layer
        model.add(Dropout(dropout_rate))

    # Output layer
    model.add(Dense(1, activation='sigmoid'))

    # Compile model
    if optimizer.lower() == 'adam':
        opt = tensorflow.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer.lower() == 'rmsprop':
        opt = tensorflow.keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer.lower() == 'sgd':
        opt = tensorflow.keras.optimizers.SGD(learning_rate=learning_rate)

    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Setting Hypermeters

In [None]:
configs = [
      {
        'name': 'No.1',
        'batch_size': 64,
        'learning_rate': 0.001,
        'hidden_layers': 1,
        'neurons_per_layer': 64,
        'activation': 'relu',
        'dropout_rate': 0.1,
        'optimizer': 'rmsprop',
        'epochs': 5
    },
    {
        'name': 'No.2',
        'batch_size': 128,
        'learning_rate': 0.01,
        'hidden_layers': 2,
        'neurons_per_layer': 64,
        'activation': 'relu',
        'dropout_rate': 0.2,
        'optimizer': 'adam',
        'epochs': 5
    },
    {
        'name': 'No.3',
        'batch_size': 32,
        'learning_rate': 0.001,
        'hidden_layers': 1,
        'neurons_per_layer': 128,
        'activation': 'tanh',
        'dropout_rate': 0.2,
        'optimizer': 'adam',
        'epochs': 5
    },
        {
        'name': 'No.4',
        'batch_size': 128,
        'learning_rate': 0.001,
        'hidden_layers': 3,
        'neurons_per_layer': 128,
        'activation': 'relu',
        'dropout_rate': 0.2,
        'optimizer': 'adam',
        'epochs': 5
    },
    {
        'name': 'No.5',
        'batch_size': 64,
        'learning_rate': 0.001,
        'hidden_layers': 2,
        'neurons_per_layer': 64,
        'activation': 'relu',
        'dropout_rate': 0.3,
        'optimizer': 'rmsprop',
        'epochs': 5
    },
]

# Training and Evaluating Model

In [25]:
results = []
best_accuracy = 0.0
best_model = None
best_config = None
best_run = None

for config in configs:
    config_results = []

    print(f"\nTraining with configuration: {config['name']}")
    print("Parameters:", {k: v for k, v in config.items() if k != 'name'})

    # Run 3 times for each configuration
    for run in range(3):
        print(f"  Run {run+1}/3")

        # Build model
        model = build_model(
            hidden_layers=config['hidden_layers'],
            neurons_per_layer=config['neurons_per_layer'],
            activation=config['activation'],
            dropout_rate=config['dropout_rate'],
            optimizer=config['optimizer'],
            learning_rate=config['learning_rate']
        )

        # Define early stopping
        early_stopping = EarlyStopping(
            monitor='val_accuracy',
            patience=2,
            restore_best_weights=True
        )

        # Train model
        history = model.fit(
            X_train, Y_train,
            batch_size=config['batch_size'],
            epochs=config['epochs'],
            validation_split=0.2,
            callbacks=[early_stopping],
            verbose=1
        )

        # Evaluate model
        _, accuracy = model.evaluate(X_test, Y_test, verbose=0)
        config_results.append(accuracy)

        print(f"    Test accuracy: {accuracy:.4f}")

        # Check if this model has the best accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
            best_config = config['name']
            best_run = run + 1

    # Calculate mean and standard deviation
    mean_accuracy = np.mean(config_results)
    std_accuracy = np.std(config_results)

    print(f"  Mean accuracy: {mean_accuracy:.4f}")
    print(f"  Standard deviation: {std_accuracy:.4f}")

    # Store results
    results.append({
        'config': config['name'],
        'accuracies': config_results,
        'mean': mean_accuracy,
        'std': std_accuracy
    })


Training with configuration: No.1
Parameters: {'batch_size': 64, 'learning_rate': 0.001, 'hidden_layers': 1, 'neurons_per_layer': 64, 'activation': 'relu', 'dropout_rate': 0.1, 'optimizer': 'rmsprop', 'epochs': 5}
  Run 1/3
Epoch 1/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 412ms/step - accuracy: 0.5075 - loss: 0.6925 - val_accuracy: 0.4980 - val_loss: 0.7490
Epoch 2/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 418ms/step - accuracy: 0.6750 - loss: 0.6270 - val_accuracy: 0.7700 - val_loss: 0.5020
Epoch 3/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 407ms/step - accuracy: 0.7996 - loss: 0.4643 - val_accuracy: 0.8070 - val_loss: 0.4388
Epoch 4/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 408ms/step - accuracy: 0.8591 - loss: 0.3458 - val_accuracy: 0.8040 - val_loss: 0.4438
Epoch 5/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 414ms/step - accuracy: 0.8740 - loss: 0.3163 - val_a

# Saving Model

In [26]:
# Save the best model
best_model.save('best_model.keras')
print(f"\nBest model saved with accuracy: {best_accuracy:.4f}")
print(f"Configuration: {best_config}, Run: {best_run}")


Best model saved with accuracy: 0.8394
Configuration: No.5, Run: 3
