<a href="https://colab.research.google.com/github/nghoanglong/NLP-Sentiment-Analysis/blob/master/Sentiment_Analysis_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib as plt
import matplotlib.pyplot as plt
import nltk
import csv
import chardet
nltk.download('punkt')
from nltk.tree import Tree
from nltk.tokenize import word_tokenize

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so the
# dataset CSV files stored on Drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
class PreprocessData:
    """Helpers for loading, encoding/decoding and inspecting the datasets."""

    def load_dataset(self,
                     DATASET_REQ_PATH):
        """Load a CSV dataset and return it as a NumPy array.

        The file's text encoding is detected with ``chardet`` first and then
        passed to ``pandas.read_csv``.

        Args:
            DATASET_REQ_PATH: path of the CSV file to load.

        Returns:
            The parsed table converted with ``DataFrame.to_numpy()`` (the
            rest of this notebook expects rows of ``[sentence, label]``),
            or ``None`` when the file does not exist; in that case the
            error is printed.
        """
        try:
            # Read the raw bytes once so chardet can guess the encoding.
            with open(DATASET_REQ_PATH, 'rb') as f:
                result = chardet.detect(f.read())
            dataset = pd.read_csv(DATASET_REQ_PATH, encoding=result['encoding'])
            dataset = dataset.to_numpy()
            return dataset
        except FileNotFoundError as err:
            # A missing file raises FileNotFoundError (FileExistsError is only
            # raised when *creating* something that already exists).
            print(err)
            return None

    def transform_sentence(self,
                           sent_tokenized,
                           li_vocabs,
                           mode=True):
        """Encode or decode a single sentence.

        If ``mode`` is True, encode a tokenized sentence to numerical form:
            sent_tokenized: [token, token, token, ...]
            li_vocabs: {token: id, token: id, ...}; must contain '<OOV>'
            returns: array [id, id, id, ...]

        If ``mode`` is False, decode a numerical sentence back to tokens:
            sent_tokenized: [id, id, id, ...]
            li_vocabs: {id: token, id: token, ...}
            returns: array [token, token, token, ...]
        """
        oov_tok = "<OOV>"
        sent_transformed = None
        if mode:
            # Tokens missing from li_vocabs are mapped to the id of '<OOV>'.
            sent_transformed = np.array([li_vocabs.get(token, li_vocabs[oov_tok])
                                         for token in sent_tokenized])
        else:
            # Ids missing from li_vocabs are treated as padding values that
            # were added to the sequence; mark them and then drop them all.
            transformed = np.array([li_vocabs.get(idx, 'pad_value')
                                    for idx in sent_tokenized])
            sent_transformed = np.delete(transformed,
                                         np.where(transformed == 'pad_value'))
        return sent_transformed

    def visualize_sentence_length(self,
                                  dataset):
        """Draw a count plot of the length of every sequence in ``dataset``.

        Args:
            dataset: iterable of sized sequences (``len()`` is taken of each).
        """
        df = pd.DataFrame(np.array([len(sample) for sample in dataset]),
                          columns=['length'])
        _, axes = plt.subplots(figsize=(25, 5))
        sns.countplot(x='length', data=df, ax=axes)

In [None]:
# Load the pre-processed train / dev / test CSV files from Google Drive.
data = PreprocessData()
DATASET_DIR = '/content/gdrive/MyDrive/All Datasets/NLP-Sentiment-data/data pre-processed'
loaded_train_data = data.load_dataset(f'{DATASET_DIR}/train_csv.csv')
loaded_dev_data = data.load_dataset(f'{DATASET_DIR}/dev_csv.csv')
loaded_test_data = data.load_dataset(f'{DATASET_DIR}/test_csv.csv')

In [None]:
# Column 0 of every loaded table is the sentence text of that split.
train_data, dev_data, test_data = (
    table[:, 0]
    for table in (loaded_train_data, loaded_dev_data, loaded_test_data)
)

In [None]:
from sklearn import preprocessing

# Column 1 of every loaded table is the class label; cast it to int64.
train_data_label = loaded_train_data[:, 1].astype(np.int64)
dev_data_label = loaded_dev_data[:, 1].astype(np.int64)
test_data_label = loaded_test_data[:, 1].astype(np.int64)

# Fit the binarizer on the labels of all three splits together so that each
# split is transformed with the same set of classes.
lb = preprocessing.LabelBinarizer()
lb.fit(np.concatenate([train_data_label, dev_data_label, test_data_label]))

train_data_label, dev_data_label, test_data_label = (
    lb.transform(labels)
    for labels in (train_data_label, dev_data_label, test_data_label)
)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
oov_tok = '<OOV>'
MAX_TOKENS = 10000

# Build the vocabulary from the sentences of all three splits.
all_sentences = np.concatenate([train_data, dev_data, test_data])
tokenizer = Tokenizer(oov_token=oov_tok, num_words=MAX_TOKENS)
tokenizer.fit_on_texts(all_sentences)
li_vocabs = tokenizer.word_index

# Encode every sentence as a list of token ids.
train_sequences, dev_sequences, test_sequences = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (train_data, dev_data, test_data)
)

In [None]:
# Count plot of the encoded sentence lengths in the training split.
data.visualize_sentence_length(train_sequences)

In [None]:
# Count plot of the encoded sentence lengths in the dev split.
data.visualize_sentence_length(dev_sequences)

In [None]:
# Count plot of the encoded sentence lengths in the test split.
data.visualize_sentence_length(test_sequences)

In [None]:
# Pad (or truncate) every encoded sentence at its end so that all sequences
# have exactly max_length_seq ids.
# NOTE(review): 53 was presumably picked from the length plots of the three
# splits - confirm.
max_length_seq = 53
padding_type = 'post'
trungcating_type = 'post'

pad_options = dict(maxlen=max_length_seq,
                   padding=padding_type,
                   truncating=trungcating_type)
train_padded = pad_sequences(train_sequences, **pad_options)
dev_padded = pad_sequences(dev_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
# Build the model: embedding -> spatial dropout -> LSTM -> softmax classifier.
EMBEDDING_DIM = 64
# The Embedding layer needs input_dim = (largest token id) + 1.
# tokenizer.word_index is 1-based (id 0 is used for padding), and because the
# tokenizer was created with num_words=MAX_TOKENS it only emits ids below
# MAX_TOKENS. The largest id that can appear is therefore
# min(MAX_TOKENS - 1, len(li_vocabs)); using len(li_vocabs) alone is one row
# short for small vocabularies and needlessly large for big ones.
NUM_WORDS = min(MAX_TOKENS, len(li_vocabs) + 1)
model = keras.Sequential([
            tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_DIM, input_length=max_length_seq),
            tf.keras.layers.SpatialDropout1D(0.2),
            tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2),
            # One output unit per sentiment class.
            tf.keras.layers.Dense(5, activation='softmax')
])

In [None]:
# Print the model's layers, output shapes and parameter counts.
model.summary()

In [None]:
# Configure training: categorical cross-entropy loss, the Adam optimizer,
# and accuracy reported as the metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
num_epochs = 30
# Validate on the dev split while training. The test split must stay unseen
# until the final evaluation; using it as validation data would make the
# reported validation metrics an over-optimistic estimate of test performance.
history = model.fit(train_padded, train_data_label,
                    epochs=num_epochs,
                    validation_data=(dev_padded, dev_data_label),
                    verbose=2)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        string: name of the metric to plot; ``'val_' + string`` is plotted
            alongside it.
    """
    val_key = 'val_' + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

In [None]:
# Plot training accuracy against validation accuracy for every epoch.
plot_graphs(history, 'accuracy')