In [1]:
!pip install gensim==4.2.0



In [2]:
import pandas as pd
import numpy as np

from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout 
from keras.layers import LSTM, Embedding

from sklearn.model_selection import train_test_split

import gensim
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# import Tokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [11]:
# Draft hyperparameter configuration (currently commented out and unused); the values may need tuning.

# config = {
#     "learning_rate": 0.001,
#     "epochs": 5, 
#     "batch_size": 32,
#     "train_p": 0.55,
#     "val_p": 0.05,
#     "LSTM_layer": [50, 100],
#     "Dropout_layer": [0.15, 0.2],
#     "activation": 'tanh',
#     "timesteps": 1,
# }

In [12]:
class LSTM_model:
    """Binary humor classifier: Word2Vec embeddings feeding a single LSTM layer.

    Expected call order (each step reads what the previous one wrote):

        model = LSTM_model(path)
        model.data = model.read_dataset()
        model.preprocess_text()      # adds data['tokens']
        model.get_max_tokens()       # adds data['max_tokens'], fills vocabulary
        model.w2v_model(model.data['max_tokens'].max())
    """

    def __init__(self, path):
        self.path = path  # Path to the CSV dataset
        self.data = pd.DataFrame()  # Dataset; assigned by the caller from read_dataset()

        self.context_window = 5  # Word2Vec context window size
        self.w2v_feature_vector = []  # Embedding matrix, filled by w2v_model()
        self.vocabulary_size = 0  # Number of distinct tokens, filled by get_vocabulary()
        self.vocabulary = []  # Sorted distinct tokens, filled by get_vocabulary()

        self.jokes_to_numerical = []  # Padded integer sequences, filled by w2v_model()
        self.model = None  # Trained Keras model, filled by w2v_model()


    def read_dataset(self, random_state=None):
        """
        Reads the dataset from ``self.path`` and returns it shuffled.

        Parameters
        ----------
        random_state : int or None, optional
            Seed for the row shuffle. ``None`` (the default) keeps the
            original unseeded behaviour; pass an int for a reproducible order.

        Returns
        -------
        pandas.DataFrame
            Columns ``['funny', 'joke']`` in that order, rows shuffled,
            index reset to 0..n-1.

        Raises
        ------
        KeyError
            If the CSV lacks a ``funny`` or ``joke`` column.
        """
        ret = pd.read_csv(self.path)

        # 'Unnamed: 0' is only present when the CSV was written with its index;
        # errors='ignore' keeps files without it loadable.
        ret = ret.drop(columns=['Unnamed: 0'], errors='ignore')

        # Rearrange the columns as funny, joke
        ret = ret[['funny', 'joke']]
        ret = ret.sample(frac=1, random_state=random_state).reset_index(drop=True)
        return ret


    def preprocess_text(self):
        """
        Tokenises ``self.data['joke']`` into ``self.data['tokens']``.

        Uses ``gensim.utils.simple_preprocess``, which lowercases the text and
        returns a list of tokens, ignoring tokens that are too short or too
        long (gensim defaults). It does NOT remove stop words.
        (``nltk.word_tokenize`` would be the alternative if punctuation has
        to be kept.)
        """
        # Missing jokes become empty strings so they yield an empty token list
        # instead of crashing the tokenizer on a float NaN.
        jokes = self.data['joke'].fillna('').astype(str)
        self.data['tokens'] = jokes.apply(gensim.utils.simple_preprocess)


    def get_vocabulary(self):
        """
        Builds the corpus vocabulary from ``self.data['tokens']``.

        Sets ``self.vocabulary`` to the sorted list of distinct tokens across
        all jokes and ``self.vocabulary_size`` to its length.
        """
        unique_tokens = set()
        for tokens in self.data['tokens']:
            unique_tokens.update(tokens)

        # Sorted so the result is deterministic across runs.
        self.vocabulary = sorted(unique_tokens)
        self.vocabulary_size = len(self.vocabulary)


    def get_max_tokens(self):
        """
        Stores each joke's token count in ``self.data['max_tokens']``.

        The column holds the per-joke length; callers take ``.max()`` of it to
        obtain the longest joke. Also refreshes the vocabulary.
        """
        self.data['max_tokens'] = self.data['tokens'].apply(len)

        self.get_vocabulary()


    def w2v_model(self, max_length, embedding_dim=None, test_size=0.3,
                  random_state=42, epochs=5, batch_size=100, w2v_epochs=10):
        """
        Trains Word2Vec embeddings and an LSTM classifier on top of them.

        Steps: split the jokes once into train/test, fit Word2Vec on the
        training jokes only, integer-encode and pad every joke, build a frozen
        embedding matrix from the Word2Vec vectors, then train and evaluate
        Embedding -> LSTM(128) -> Dense(1, sigmoid).

        Parameters
        ----------
        max_length : int
            Padded sequence length (jokes are post-padded to this length).
        embedding_dim : int or None, optional
            Word2Vec vector size. ``None`` (default) uses ``max_length``.
        test_size : float, optional
            Fraction of jokes held out for evaluation.
        random_state : int, optional
            Seed for the single train/test split.
        epochs, batch_size : int, optional
            Keras training settings.
        w2v_epochs : int, optional
            Number of Word2Vec training epochs.

        Returns
        -------
        list
            ``[loss, accuracy]`` from ``model.evaluate`` on the held-out set.
            The trained model is stored in ``self.model``.

        Raises
        ------
        ValueError
            If ``max_length`` or ``embedding_dim`` is not positive.
        """
        # Plain ints: the value usually arrives as a numpy integer from .max().
        max_length = int(max_length)
        embedding_dim = max_length if embedding_dim is None else int(embedding_dim)
        if max_length < 1 or embedding_dim < 1:
            raise ValueError("max_length and embedding_dim must be positive")

        X = self.data['tokens']
        y = self.data['funny']

        print("X shape: ", X.shape)
        print("y shape: ", y.shape)

        # One split, by position, shared by Word2Vec and the classifier so the
        # embeddings never see jokes from the evaluation set.
        positions = np.arange(len(self.data))
        train_idx, test_idx = train_test_split(
            positions, test_size=test_size, random_state=random_state)

        # Passing the corpus to the constructor builds the vocabulary and
        # trains; no separate .train() call is needed.
        train_sentences = X.iloc[train_idx].tolist()
        w2v_model = Word2Vec(sentences=train_sentences, vector_size=embedding_dim,
                             min_count=1, window=self.context_window,
                             epochs=w2v_epochs)

        vocab = w2v_model.wv.index_to_key
        print("Vocabulary size: ", len(vocab))

        word_vec = {word: w2v_model.wv.get_vector(word) for word in vocab}
        print("The no of key-value pairs : ",len(word_vec))

        # Integer-encode every joke. The index is built on all jokes; words that
        # only occur in the test split simply keep an all-zero embedding below.
        tokeniser = Tokenizer()
        tokeniser.fit_on_texts(X)
        vocab_size = len(tokeniser.word_index) + 1  # +1: index 0 is the padding id
        print("Vocabulary size: ", vocab_size)

        # Pad the sequences to the same length
        sequences = tokeniser.texts_to_sequences(X)
        data_padded = pad_sequences(sequences, maxlen=max_length, padding='post')
        print("Data padded shape: ", data_padded.shape)

        # Row i of the matrix is the Word2Vec vector of the word with tokenizer
        # index i; rows without a Word2Vec vector stay zero.
        embedding_matrix = np.zeros((vocab_size, embedding_dim))
        for word, i in tokeniser.word_index.items():
            embedding_vector = word_vec.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        labels = y.to_numpy()
        x_train, x_test = data_padded[train_idx], data_padded[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]

        # Print size of the train and test sets
        print("X_train:", len(x_train))
        print("X_test:", len(x_test))
        print("y_train:", len(y_train))
        print("y_test:", len(y_test))

        # Building the model
        model = Sequential()
        model.add(Embedding(vocab_size, embedding_dim, input_length=max_length,
                            weights=[embedding_matrix], trainable=False))

        model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.summary()

        # Train the model. The held-out set doubles as validation data; it is
        # only monitored, not used for model selection.
        model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                  validation_data=(x_test, y_test))

        # Evaluate the model
        scores = model.evaluate(x_test, y_test)
        print("Accuracy: %.2f%%" % (scores[1] * 100))

        # Keep the artefacts on the instance so later cells can reuse them.
        self.w2v_feature_vector = embedding_matrix
        self.jokes_to_numerical = data_padded
        self.model = model
        return scores

In [13]:
# Dataset location. The active value is the Google Colab (Drive-mounted) path;
# switch to the commented relative path when running on a local machine.
# path = "dataset/final_jokes(1283).csv"
path = "/content/drive/MyDrive/NLU_Humor-detection/final_jokes(1283).csv"

# Load the jokes and tokenise them.
joke_model = LSTM_model(path)
joke_model.data = joke_model.read_dataset()
joke_model.preprocess_text()

# Record the token count of every joke, then take the longest one: w2v_model
# uses it as both the padded sequence length and the Word2Vec vector size, so
# every joke ends up with an input of the same size.
joke_model.get_max_tokens()
max_length_joke = joke_model.data.loc[:, 'max_tokens'].max()

# Train the embeddings and the LSTM classifier, and print the evaluation.
joke_model.w2v_model(max_length_joke)



X shape:  (1283,)
y shape:  (1283,)
Vocabulary size:  3605
The no of key-value pairs :  3605
Vocabulary size:  4647
Data padded shape:  (1283, 332)
X_train: 898
X_test: 385
y_train: 898
y_test: 385
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 332, 332)          1542804   
                                                                 
 lstm_3 (LSTM)               (None, 128)               236032    
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,778,965
Trainable params: 236,161
Non-trainable params: 1,542,804
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 51.95%
