<a href="https://colab.research.google.com/github/minhvn1433/Deep-learning-project/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis

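This is an experimental notebook. In it, you will load comments and their ratings from a CSV file and compare several neural network architectures for predicting the rating from the comment text. 😎😎😎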

### Imports

You will first import common libraries that will be used throughout this notebook.

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input,
    Embedding,
    Conv1D,
    Bidirectional,
    LSTM,
    GRU,
    Flatten,
    GlobalAveragePooling1D,
    Dense,
    Dropout,
)

### Load and Prepare the Dataset

First, you will load the CSV file and extract the comments and their ratings into lists. The ratings are shifted down by one so that they can be used as class indices.

In [None]:
# Read the dataset from disk and show it
df = pd.read_csv('data.csv')
display(df)

# Text goes into `sentences`; each rating is shifted down by one into `labels`
sentences = df['comment'].tolist()
labels = (df['rate'] - 1).tolist()

You will then split the lists into train, validation and test sets.

In [None]:
# Fixed seed so that the train/validation/test membership is identical on every
# run of the notebook (without it, each run shuffles differently and the
# reported metrics are not comparable between runs).
SPLIT_SEED = 42

# Split the train data: 80% for training, 20% held out
(
    training_sentences,
    temp_sentences,
    training_labels,
    temp_labels
) = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=SPLIT_SEED,
)

# Split the held-out part evenly into validation and test data
(
    validation_sentences,
    testing_sentences,
    validation_labels,
    testing_labels,
) = train_test_split(
    temp_sentences,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=SPLIT_SEED,
)

# Convert the labels lists into numpy arrays
training_labels = np.array(training_labels)
validation_labels = np.array(validation_labels)
testing_labels = np.array(testing_labels)

Next, you will generate the vocabulary and padded sequences.

In [None]:
# Parameters
vocab_size = 300
max_length = 304
padding_type = 'post'
trunc_type = 'post'
oov_tok = '<OOV>'

# Fit the vocabulary on the training sentences only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Turn every split into integer sequences using the fitted tokenizer
training_sequences, validation_sequences, testing_sequences = (
    tokenizer.texts_to_sequences(split_sentences)
    for split_sentences in (training_sentences, validation_sentences, testing_sentences)
)

# Pad or truncate every split to the same fixed length
training_padded, validation_padded, testing_padded = (
    pad_sequences(split_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    for split_sequences in (training_sequences, validation_sequences, testing_sequences)
)

### Plot Utility

Before you define the models, you will define the function below so you can easily visualize the accuracy and loss history after training.

In [None]:
# Plot Utility
def plot_graphs(history, string):
    """Plot one training metric per epoch, with its validation curve if present.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (e.g. the return value of ``model.fit``).
        string: Name of the metric to plot, e.g. ``'accuracy'`` or ``'loss'``.

    Raises:
        KeyError: If ``string`` itself is not a key of ``history.history``.
    """
    val_string = 'val_' + string
    legend_labels = [string]

    plt.plot(history.history[string])
    # The validation series only exists when validation data was supplied to
    # training; skip it instead of failing with a KeyError.
    if val_string in history.history:
        plt.plot(history.history[val_string])
        legend_labels.append(val_string)

    plt.xlabel('Epochs')
    plt.ylabel(string)
    plt.legend(legend_labels)
    plt.show()

### Model 1: Flatten

In [None]:
embedding_dim = 16

# Build the model by applying each layer, in order, to the integer token input:
# embedding -> flatten -> hidden dense layer -> 5-way softmax output
inputs = Input(shape=(max_length,), dtype='int32')
X = inputs
for layer in (
    Embedding(vocab_size, embedding_dim),
    Flatten(),
    Dense(16, activation='relu'),
    Dense(5, activation='softmax'),
):
    X = layer(X)
model = Model(inputs=inputs, outputs=X)

# Compile the model and print the model summary
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
BATCH_SIZE = 128
NUM_EPOCHS = 10

# Fit the Flatten model on the training split, passing the validation split
# so that validation metrics are recorded in the returned history
history = model.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(validation_padded, validation_labels),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    verbose=2,
)

In [None]:
# Draw the accuracy curve first, then the loss curve
for metric in ('accuracy', 'loss'):
    plot_graphs(history, metric)

### Model 2: LSTM

In [None]:
embedding_dim = 16

# Build the model
inputs = Input(shape=(max_length,), dtype='int32')
# mask_zero=True treats index 0 as padding. The sequences are post-padded to
# max_length (pad_sequences pads with 0 unless told otherwise), so without a
# mask the recurrent layer would also step through all the trailing padding.
# vocab_size already covers index 0 because the tokenizer only emits indices
# below num_words.
X = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs)
X = Bidirectional(LSTM(32))(X)
X = Dense(16, activation='relu')(X)
X = Dense(5, activation='softmax')(X)
model_lstm = Model(inputs=inputs, outputs=X)

# Compile the model and print the model summary
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

In [None]:
BATCH_SIZE = 128
NUM_EPOCHS = 5

# Fit the LSTM model on the training split, passing the validation split
# so that validation metrics are recorded in the returned history
history_lstm = model_lstm.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(validation_padded, validation_labels),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    verbose=2,
)

In [None]:
# Print the TensorFlow version in use
print(tf.version.VERSION)

In [None]:
# Draw the accuracy curve first, then the loss curve
for metric in ('accuracy', 'loss'):
    plot_graphs(history_lstm, metric)

### Model 3: GRU

In [None]:
embedding_dim = 16

# Build the model
inputs = Input(shape=(max_length,), dtype='int32')
# mask_zero=True treats index 0 as padding. The sequences are post-padded to
# max_length (pad_sequences pads with 0 unless told otherwise), so without a
# mask the recurrent layer would also step through all the trailing padding.
# vocab_size already covers index 0 because the tokenizer only emits indices
# below num_words.
X = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs)
X = Bidirectional(GRU(32))(X)
X = Dense(16, activation='relu')(X)
X = Dense(5, activation='softmax')(X)
model_gru = Model(inputs=inputs, outputs=X)

# Compile the model and print the model summary
model_gru.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_gru.summary()

In [None]:
BATCH_SIZE = 128
NUM_EPOCHS = 10

# Fit the GRU model on the training split, passing the validation split
# so that validation metrics are recorded in the returned history
history_gru = model_gru.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(validation_padded, validation_labels),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    verbose=2,
)

In [None]:
# Draw the accuracy curve first, then the loss curve
for metric in ('accuracy', 'loss'):
    plot_graphs(history_gru, metric)

### Model 4: Convolution

In [None]:
embedding_dim = 16

# Build the model by applying each layer, in order, to the integer token input:
# embedding -> 1D convolution -> global average pooling -> hidden dense layer
# -> 5-way softmax output
inputs = Input(shape=(max_length,), dtype='int32')
X = inputs
for layer in (
    Embedding(vocab_size, embedding_dim),
    Conv1D(128, 5, activation='relu'),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(5, activation='softmax'),
):
    X = layer(X)
model_conv = Model(inputs=inputs, outputs=X)

# Compile the model and print the model summary
model_conv.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model_conv.summary()

In [None]:
BATCH_SIZE = 128
NUM_EPOCHS = 10

# Fit the convolutional model on the training split, passing the validation
# split so that validation metrics are recorded in the returned history
history_conv = model_conv.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(validation_padded, validation_labels),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    verbose=2,
)

In [None]:
# Draw the accuracy curve first, then the loss curve
for metric in ('accuracy', 'loss'):
    plot_graphs(history_conv, metric)