<a href="https://colab.research.google.com/github/paulcodrea/reddit_humor/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# import os
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# from time import time
from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout 
from keras.layers import LSTM, Embedding
# from time import time
# from keras.callbacks import EarlyStopping
# from sklearn.preprocessing import MinMaxScaler
# from pathlib import Path

# train_test_split
from sklearn.model_selection import train_test_split
# import nltk
# import regex as re
# from collections import defaultdict

from nltk.tokenize import word_tokenize
# from gensim.models import Word2Vec

# import Tokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# from nltk.stem.snowball import EnglishStemmer

In [17]:
# We might have to change the following

config = {
    "learning_rate": 0.001,
    "epochs": 5, 
    "batch_size": 32,
    "train_p": 0.55,
    "val_p": 0.05,
    "LSTM_layer": [50, 100],
    "Dropout_layer": [0.15, 0.2],
    "activation": 'tanh',
    "timesteps": 1,
}

In [41]:
class LSTM_model:
    def __init__(self, path):
        self.path = path # Path to the dataset
        self.data = pd.DataFrame() # Dataframe to store the dataset

        self.context_window = 3 # Context window size
        self.w2v_feature_vector = []
        self.vocabulary_size = 0

        self.jokes_to_numerical = []
        self.model = None

    def read_dataset(self):
        """
        Reads the dataset from the given path.
        """
        ret = pd.read_csv(self.path)
        ret.drop(columns=['Unnamed: 0'], inplace=True)

        # randomize the data set and take the first 1000 rows
        ret = ret.sample(frac=1)#.head(1000)

        self.data = ret


    def preprocess_text(self):
        """
        Preprocesses the text data.
        """
        self.data['tokens'] = self.data['joke'].apply(word_tokenize) # tokenize the text but keep the punctuation

    # get the maximum size of tokens in the dataset and add to column
    def get_max_tokens(self):
        self.data['max_tokens'] = self.data['tokens'].apply(lambda x: len(x))


    def convert_jokes_to_numerical(self):
        """
        Converts the jokes to numerical values.
        """
        tokenizer = Tokenizer(num_words=None, split=' ')
        tokenizer.fit_on_texts(self.data['joke'].values)
        self.jokes_to_numerical = tokenizer.texts_to_sequences(self.data['joke'].values)

        # get vocabulary size
        self.vocabulary_size = len(tokenizer.word_index) + 1

    def pad_sequences(self, max_length):
        """
        Pads the sequences.
        """
        self.jokes_to_numerical = pad_sequences(self.jokes_to_numerical, maxlen=max_length, padding='post')


    def data_split(self):
        """
        Splits the data into train and validation sets.

        Constructs the LSTM model.
        """
        X = self.jokes_to_numerical
        y = self.data['funny']

        print("X shape: ", X.shape)
        print("y shape: ", y.shape)
        # split the data into train and validation sets and make them random
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # print(X_train[0])

        print("X_train:", len(X_train))
        print("X_test:", len(X_test))
        print("y_train:", len(y_train))
        print("y_test:", len(y_test))

        self.model = Sequential()
        self.model.add(Embedding(input_dim=self.vocabulary_size, output_dim=120, input_length=int(X.shape[1])))
        # self.model.add(LSTM(config['LSTM_layer'][0], activation=config['activation'], input_shape=(X.shape[1])))
        self.model.add(Dropout(config['Dropout_layer'][0]))
        self.model.add(LSTM(config['LSTM_layer'][1], activation=config['activation']))
        self.model.add(Dropout(config['Dropout_layer'][1]))
        self.model.add(Dense(units=1, activation='softmax'))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        self.model.fit(X_train, y_train, epochs=config['epochs'], batch_size=config['batch_size'], verbose='auto')#, validation_split=config['val_p'])

        # Evaluate the model
        scores = self.model.evaluate(X_test, y_test)
        print("Accuracy: %.2f%%" % (scores[1] * 100))


In [42]:
# SETTINGS for local machine - change this for Goolg Colab
path = "dataset/final_jokes(1283).csv" #"/content/drive/MyDrive/NLU_Humor-detection/final_jokes(1283).csv"

joke_model = LSTM_model(path)
joke_model.read_dataset()

joke_model.preprocess_text()
joke_model.get_max_tokens() # get the maximum number of tokens. Since we need the word2vec feature vector to be of the same size for all jokes. 
max_length_joke = joke_model.data['max_tokens'].max()
print("Max length of joke: ", max_length_joke)

Max length of joke:  405


## Tokenizer from keras

In [43]:
joke_model.convert_jokes_to_numerical()
print("Length of first line: ", len(joke_model.data['joke'][0]))

joke_model.pad_sequences(max_length_joke)
print("Length of feature vector after normalisation: ", len(joke_model.jokes_to_numerical[0]))

# print(joke_model.jokes_to_numerical[:1])

Length of first line:  186
Length of feature vector after normalisation:  405


In [44]:
joke_model.data_split()

X shape:  (1283, 405)
y shape:  (1283,)
X_train: 898
X_test: 385
y_train: 898
y_test: 385
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 52.47%
