In [38]:
import numpy as np 
import pandas as pd

import re
import string
import os

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from spellchecker import SpellChecker
import enchant
import wordninja
from autocorrect import Speller

import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Carl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, Flatten
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, SpatialDropout1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

### Data Collection

In [3]:
df_train = pd.read_csv("../../Datasets/sentiment_analysis/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
df_train = df_train.drop(columns=["id"])
display(df_train.head(3)) 

Unnamed: 0,sentiment,review
0,1,"""With all this stuff going down at the moment ..."
1,1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,0,"""The film starts with a manager (Nicholas Bell..."


### Data Preprocessing - Including Lematisation and Tokenisation

In [4]:
stops = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
dictionary = enchant.Dict("en_US")
spell = SpellChecker()
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Carl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Carl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Carl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\Carl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\Carl\AppData\Roaming\nltk_data...
[nltk_

True

In [None]:
def data_preprocessing(text):

    # convert to lowercase
    text = text.lower()

    # remove special characters from head and tail of words
    text = ' '.join([re.sub(r'^[^\w]+|[^\w]+$', '', word) for word in text.split()])

    # remove stopwords
    text = ' '.join([word for word in text.split() if not word in stops]) 

    # remove URLs
    text = re.sub(r'http\S+', '', text) # URLs

    # remove HTML tags
    text = re.sub(r'<.*?>', '', text) # HTML tags

    # remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text).rstrip()

    # remove single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # remove double spaces
    text = re.sub(r'\s+', ' ', text, flags=re.I)

    return text

def data_tokenisation(text):
    tokens = text.split()
    return tokens

def split_token_if_concatenated(token):
    segments = wordninja.split(token)
    if len(segments) > 1:
        return segments
    else:
        return [token]
    
def get_corrected_tokens(df, token_col):
    all_tokens = {token for tokens in df[token_col] for token in tokens}
    misspelled_tokens = spell.unknown(all_tokens)
    corrected_tokens = {}
    for token in all_tokens:
        if token in misspelled_tokens:
            corrected_tokens[token] = spell.correction(token)
        else:
            corrected_tokens[token] = token
    return corrected_tokens

def data_lemmatisation(tokens):
    lemma_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemma_tokens

In [13]:
df_train["review_cleaned"] = df_train['review'].apply(data_preprocessing)
df_train["review_tokens"] = df_train['review_cleaned'].apply(data_tokenisation)
df_train["review_lemmatised"] = df_train['review_tokens'].apply(data_lemmatisation)
display(df_train.head(3)) 

Unnamed: 0,sentiment,review,review_cleaned,review_tokens,review_lemmatised
0,1,"""With all this stuff going down at the moment ...",stuff going moment mj started listening music ...,"[stuff, going, moment, mj, started, listening,...","[stuff, going, moment, mj, started, listening,..."
1,1,"""\""The Classic War of the Worlds\"" by Timothy ...",classic war worlds timothy hines entertaining ...,"[classic, war, worlds, timothy, hines, enterta...","[classic, war, world, timothy, hines, entertai..."
2,0,"""The film starts with a manager (Nicholas Bell...",film starts manager nicholas bell giving welco...,"[film, starts, manager, nicholas, bell, giving...","[film, start, manager, nicholas, bell, giving,..."


In [68]:
# df_test = df_train.head(3)
# all_tokens = {token for tokens in df_test['review_tokens'] for token in tokens}
# print(all_tokens)

# misspelled_tokens = spell.unknown(all_tokens)
# print(misspelled_tokens)

# df_test["review_correct_tokens"] = df_test['review_tokens'].apply(data_spellchecker)
# display(df_test)

In [15]:
df_train["num_words"] = df_train["review"].map(lambda val: len(val.split(" ")))
df_test["num_words"] = df_test["review"].map(lambda val: len(val.split(" ")))
print(max(df_train["num_words"]))
max_length = 5000
max_features = 5000

1449


In [18]:
df_train

Unnamed: 0,sentiment,review,num_words
0,1,stuff going moment mj ive started listening mu...,227
1,1,classic war worlds timothy hines entertaining ...,81
2,0,film starts manager nicholas bell giving welco...,234
3,0,must assumed praised film greatest filmed oper...,196
4,1,superbly trashy wondrously unpretentious 80s e...,214
...,...,...,...
24995,0,seems like consideration gone imdb reviews fil...,54
24996,0,dont believe made film completely unnecessary ...,99
24997,0,guy loser cant get girls needs build picked st...,66
24998,0,30 minute documentary buuel made early 1930s o...,99


In [16]:
y_train = df_train["sentiment"].values
y_test = df_test["sentiment"].values

X_train = df_train["review"]
X_test = df_test["review"]

KeyError: 'sentiment'

In [103]:
def tokenize(sentence):
  max_features = 6000
  tokenizer = Tokenizer(num_words=max_features)
  tokenizer.fit_on_texts(list(train_reviews))
  list_tokenized = tokenizer.texts_to_sequences(sentence)
  max_length = 360
  idv = pad_sequences(list_tokenized, maxlen=max_length)
  return list_tokenized, idv

list_tokenized_train, X_train = tokenize(train_reviews)
list_tokenized_test, X_test = tokenize(test_reviews)

In [104]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epochs, logs={}):
        if logs.get('accuracy') > 0.95:
            print('\n Stopped Training!\n')
            self.model.stop_training = True

def train_model(model, model_name, n_epochs, batch_size, X_data, y_data, validation_split):    
    checkpoint_path = model_name+"_cp-{epoch:04d}.weights.h5"
    checkpoint_dir = os.path.dirname(checkpoint_path)
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_path,
        save_weights_only=True,
        verbose=1
    )
    callbacks_earlystop = myCallback()

    history = model.fit(
        X_data,
        y_data,
        steps_per_epoch=batch_size,
        epochs=n_epochs,
        validation_split=validation_split,
        verbose=1,
        callbacks=[cp_callback, callbacks_earlystop]
    )
    return history


In [105]:
def generate_graph(history):
    plt.plot(history.history['accuracy'], 'b')
    plt.plot(history.history['val_accuracy'], 'r')
    plt.title('Model Accuracy'),
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [113]:
class Model_A():
    def __new__(self):
        inp = Input(shape=(360, ))
        embed_size = 128
        x = Embedding(max_features, embed_size)(inp)
        x = LSTM(60, return_sequences=True)(x)
        x = GlobalMaxPool1D()(x)
        x = Dropout(0.1)(x)
        x = Dense(50, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=inp, outputs=x)
        model.compile(
            loss='binary_crossentropy', 
            optimizer='SGD', 
            metrics=['accuracy']
        )
        return model
    
model_a = Model_A()
model_a.summary()

In [114]:
history_a = train_model(
    model = model_a, 
    model_name = "model_a", 
    n_epochs = 10, 
    batch_size = 64, 
    X_data = X_train, 
    y_data = y, 
    validation_split = 0.2
)

Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node functional_21_1/embedding_22_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\traitlets\config\application.py", line 1043, in launch_instance

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\ipykernel\kernelapp.py", line 725, in start

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\tornado\platform\asyncio.py", line 195, in start

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\asyncio\base_events.py", line 608, in run_forever

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\asyncio\base_events.py", line 1936, in _run_once

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\asyncio\events.py", line 84, in _run

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\ipykernel\kernelbase.py", line 513, in dispatch_queue

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\ipykernel\kernelbase.py", line 502, in process_one

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\ipykernel\kernelbase.py", line 409, in dispatch_shell

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\ipykernel\kernelbase.py", line 729, in execute_request

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\ipykernel\ipkernel.py", line 422, in do_execute

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\ipykernel\zmqshell.py", line 540, in run_cell

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3009, in run_cell

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3064, in _run_cell

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3269, in run_cell_async

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3448, in run_ast_nodes

  File "C:\Users\mia.jensen\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code

  File "C:\Users\mia.jensen\AppData\Local\Temp\ipykernel_36884\3622214616.py", line 1, in <module>

  File "C:\Users\mia.jensen\AppData\Local\Temp\ipykernel_36884\3529105120.py", line 17, in train_model

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 371, in fit

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 219, in function

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 132, in multi_step_on_iterator

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 113, in one_step_on_data

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 57, in train_step

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\layers\layer.py", line 910, in __call__

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\ops\operation.py", line 58, in __call__

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\models\functional.py", line 183, in call

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\ops\function.py", line 171, in _run_through_graph

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\models\functional.py", line 643, in call

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\layers\layer.py", line 910, in __call__

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\ops\operation.py", line 58, in __call__

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\utils\traceback_utils.py", line 156, in error_handler

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\layers\core\embedding.py", line 140, in call

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\ops\numpy.py", line 5442, in take

  File "c:\Users\mia.jensen\Anaconda3\envs\Python_3_11_11\Lib\site-packages\keras\src\backend\tensorflow\numpy.py", line 2222, in take

indices[134,265] = 5906 is not in [0, 5000)
	 [[{{node functional_21_1/embedding_22_1/GatherV2}}]] [Op:__inference_multi_step_on_iterator_5987]