<a href="https://colab.research.google.com/github/paulcodrea/reddit_humor/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import os
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# from time import time
from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout 
from keras.layers import LSTM
# from time import time
# from keras.callbacks import EarlyStopping
# from sklearn.preprocessing import MinMaxScaler
# from pathlib import Path

# import nltk
# import regex as re
# from collections import defaultdict

from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# import Tokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# from nltk.stem.snowball import EnglishStemmer

In [2]:
# Experiment hyperparameters; these values are expected to be tuned.
# NOTE(review): none of the cells visible here read this dict - presumably it
# is consumed by later model-building / training cells. Confirm before renaming keys.
config = dict(
    learning_rate=0.001,
    epochs=40,
    batch_size=4,
    train_p=0.55,
    val_p=0.05,
    LSTM_layer=[50, 100],
    Dropout_layer=[0.15, 0.2],
    activation='tanh',
    timesteps=1,
)

In [3]:
class LSTM_model:
    """
    Holds the joke dataset and derives two alternative numerical
    representations from it: word2vec vectors (construct_word2vec) and
    padded Keras token-id sequences (convert_jokes_to_numerical + pad_sequences).
    """

    def __init__(self, path):
        """
        Stores the dataset location. Nothing is read from disk here; call
        read_dataset() and assign its result to self.data.

        path: location of the CSV file, passed unchanged to pandas.read_csv.
        """
        self.path = path # Path to the dataset
        self.data = pd.DataFrame() # Dataframe to store the dataset

        self.context_window = 3 # Context window size
        self.w2v_feature_vector = [] # Flat list with one vector per token, filled by construct_word2vec()

        self.jokes_to_numerical = [] # Token-id sequences, filled by convert_jokes_to_numerical()

    def read_dataset(self):
        """
        Reads the dataset from the given path.

        The 'Unnamed: 0' column (presumably an index saved by an earlier
        to_csv call) is dropped when present. A file without that column is
        returned as-is instead of raising KeyError.

        Returns the loaded DataFrame. self.data is not modified; the caller
        is expected to assign the result.
        """
        ret = pd.read_csv(self.path)
        # errors='ignore' keeps this working for CSVs exported without an index column.
        return ret.drop(columns=['Unnamed: 0'], errors='ignore')

    def preprocess_text(self):
        """
        Preprocesses the text data.

        Adds a 'tokens' column holding word_tokenize() applied to each entry
        of the 'joke' column.
        """
        self.data['tokens'] = self.data['joke'].apply(word_tokenize) # tokenize the text but keep the punctuation

    def get_max_tokens(self):
        """
        Adds a 'max_tokens' column holding the number of tokens of each row.

        Despite the column name this is a per-row count; take .max() of the
        column to obtain the longest joke.
        """
        self.data['max_tokens'] = self.data['tokens'].apply(len)


    def construct_word2vec(self, max_length):
        """
        Constructs the word2vec model. (Feature vector)

        Every token position of every joke contributes one small "sentence"
        made of the token and its neighbours inside the context window; the
        Word2Vec model is trained on those windows. Afterwards
        self.w2v_feature_vector is rebuilt as a flat list containing one
        vector per token, in dataset order.

        max_length: dimensionality of the word vectors (passed to Word2Vec as
                    vector_size and used for the zero vector of unknown words).
        """
        self.w2v_feature_vector = []
        context_words = [] # Construct window list for word2vec

        half_window = self.context_window // 2
        # A positive window includes the centre word itself, hence the +1 on
        # the (exclusive) right edge; a non-positive window gets no extra slot.
        right_extra = 1 if self.context_window > 0 else 0

        for line in self.data['tokens']:
            for index in range(len(line)):
                # Clamp the window to the bounds of the current joke.
                left = max(index - half_window, 0)
                right = min(index + half_window + right_extra, len(line))
                context_words.append([line[i] for i in range(left, right)])

        # Create a word2vec model
        # context_words -> list of token windows, e.g. for tokens a..e and a window of 3:
        #                  [['a', 'b'], ['a', 'b', 'c'], ['b', 'c', 'd'], ['c', 'd', 'e'], ['d', 'e']]
        # vector_size = max_length -> dimension of each word vector
        # window = self.context_window -> window size used by Word2Vec itself
        # workers = 4 -> number of threads to use
        # min_count is not passed, so the gensim default applies; words below
        # that frequency are left out of the vocabulary.
        model = Word2Vec(context_words, vector_size=max_length, window=self.context_window, workers=4)

        for line in self.data['tokens']:
            for word in line:
                if word in model.wv.key_to_index:
                    self.w2v_feature_vector.append(model.wv.get_vector(word))
                else:
                    # if the word is not in the model, then add zero.
                    self.w2v_feature_vector.append(np.zeros(max_length))


    def convert_jokes_to_numerical(self):
        """
        Converts the jokes to numerical values.

        Fits a Keras Tokenizer on the raw 'joke' column and stores the
        resulting integer sequences in self.jokes_to_numerical.
        """
        tokenizer = Tokenizer(num_words=None, split=' ')
        tokenizer.fit_on_texts(self.data['joke'].values)
        self.jokes_to_numerical = tokenizer.texts_to_sequences(self.data['joke'].values)

    def pad_sequences(self, max_length):
        """
        Pads the sequences.

        Replaces self.jokes_to_numerical with the result of Keras'
        pad_sequences using maxlen=max_length and padding at the end ('post').
        """
        # Inside a method the bare name resolves to the module-level Keras
        # function, not to this method, so there is no recursion here.
        self.jokes_to_numerical = pad_sequences(self.jokes_to_numerical, maxlen=max_length, padding='post')


In [4]:
# SETTINGS for local machine - change this for Google Colab
# The trailing comment on the next line holds the Google Drive path to use instead when running on Colab.
path = "dataset/final_jokes(1283).csv" #"/content/drive/MyDrive/NLU_Humor-detection/final_jokes(1283).csv"

# read_dataset() only returns the frame, so it has to be assigned to .data explicitly.
joke_model = LSTM_model(path)
joke_model.data = joke_model.read_dataset()

# Adds the 'tokens' column, then the per-joke token count in 'max_tokens'.
joke_model.preprocess_text()
joke_model.get_max_tokens() # per-joke token counts; their maximum gives one common size for all jokes
# Longest joke in tokens; later cells pass it as the word2vec vector size and as the padding length.
max_length_joke = joke_model.data['max_tokens'].max()
print("Max length of joke: ", max_length_joke)

Max length of joke:  405


## FIRST METHOD: word2vec

In [5]:
# Train word2vec on the tokenised jokes and fill joke_model.w2v_feature_vector.
# The word-vector dimension is set to max_length_joke (the longest joke's token count).
joke_model.construct_word2vec(max_length_joke)
# Debug aid, left disabled: print(len(joke_model.data['tokens'][0]))

## SECOND METHOD: Tokenizer from keras

In [9]:
# Map every joke to a sequence of integer word ids with the Keras Tokenizer.
joke_model.convert_jokes_to_numerical()
# NOTE: this prints len() of the raw joke text (characters, assuming the column holds strings),
# not the number of tokens or ids.
print("Length of first line: ", len(joke_model.data['joke'][0]))

# Pad each id sequence at the end ('post') up to max_length_joke entries.
joke_model.pad_sequences(max_length_joke)
# "normalisation" in the message below refers to this padding step.
print("Length of feature vector after normalisation: ", len(joke_model.jokes_to_numerical[0]))

Length of first line:  186
Length of feature vector after normalisation:  405


In [10]:
# w2v_feature_vector is a flat per-token list, so index 0 is the vector of the
# first token of the first joke (all zeros if that token is not in the word2vec vocabulary).
print(joke_model.w2v_feature_vector[0])

[ 0.01190722  0.05554788  0.11459028 -0.1150763   0.22286381 -0.0155257
  0.16852403  0.02064565  0.01578335 -0.02260887  0.02366269 -0.09488297
  0.14833543  0.05778171 -0.19579011 -0.17012028  0.13127667 -0.08018813
 -0.06836197 -0.02995019 -0.12100121  0.02024477  0.13432936  0.05154734
  0.06854516  0.09863234 -0.12623179 -0.1973311  -0.00601147 -0.04878491
 -0.09182403 -0.0076752   0.04144758 -0.10919733  0.02863755  0.08945033
 -0.01912385 -0.11983074 -0.03082166 -0.10574423 -0.05163813 -0.00842052
  0.13482538  0.06638224 -0.04382432  0.01843091 -0.16098052  0.12893982
  0.04196787  0.10164028 -0.08608922 -0.07912943 -0.21943185 -0.09881731
 -0.02099738 -0.26242748  0.07640062  0.14756086  0.15768601 -0.09466252
  0.06271809 -0.08707754  0.09271543 -0.0210466  -0.07002075 -0.26492184
 -0.01147205  0.03388357  0.00128775 -0.16882087  0.10245993  0.28725275
  0.16894642  0.02838083  0.21739523  0.07468344  0.05968522  0.10323022
 -0.17828791  0.04917506  0.04343579  0.19628695  0.