In [14]:
# !pip install gensim==4.2.0

In [15]:
import pandas as pd
import numpy as np
import pickle

from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout 
from keras.layers import LSTM, Embedding

from sklearn.model_selection import train_test_split

import gensim
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# import Tokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import precision_score, recall_score, f1_score

In [16]:
# from google.colab import drive
# drive.mount('/content/drive')

In [17]:
# We might have to change the following
config = {
    "learning_rate": 0.001,
    "epochs": 3, 
    "batch_size": 70,
    "test_p": 0.2,
    "val_p": 0.1,
    "LSTM_layer": [50, 100],
    "Dropout_layer": [0.15, 0.2],
    "activation": 'softmax',
    ##################### SAVE FOR LIVE DEMO #############################
    "model_path": "model/2c_model.h5",
    "tokenizer_path": "model/2c_tokenizer.pickle",
    "data_path": "model/2c_data.csv",
}

In [18]:
class LSTM_model:
    def __init__(self, path):
        self.path = path # Path to the dataset
        self.data = pd.DataFrame() # Dataframe to store the dataset
        self.tokenizer = Tokenizer(num_words=None, split=' ') # Tokenizer to tokenize the text

        self.context_window = 5 # Context window size
        self.w2v_feature_vector = []
        self.vocabulary_size = 0
        self.vocabulary = []

        self.jokes_to_numerical = []
        self.model = None
        self.word_vec = {}


    def read_dataset(self):
        """
        Reads the dataset from the given path.
        """
        ret = pd.read_csv(self.path)
        ret.drop(columns=['Unnamed: 0'], inplace=True)

        # rearange the columns as funny, joke
        ret = ret[['funny', 'joke']]
        ret = ret.sample(frac=1).reset_index(drop=True)
        # return ret
        self.data = ret

    def preprocess_text(self):
        """
        Preprocesses the text data.
        """
        # gensim.utils.simple_preprocess. 
        # This will remove all punctuation, remove stop words and tokenize the given sentence.
        self.data['tokens'] = self.data['joke'].apply(lambda x: gensim.utils.simple_preprocess(x))


    def get_vocabulary(self):
        """
        Gets the vocabulary.
        """
        self.vocabulary_size = len(self.data['tokens'].apply(set).apply(len))
        self.vocabulary = self.data['tokens'].apply(set).apply(list)


    def get_max_tokens(self):
        """
        Gets the maximum number of tokens in a joke.
        """
        self.data['max_tokens'] = 0
        # create a new column in the dataframe with max tokens per row. and count only if the token isalpha()
        for index, row in self.data.iterrows():
            count = 0
            for token in row['tokens']:
                if token.isalpha():
                    count += 1
            self.data['max_tokens'][index] = count
        
        # Get the Vocabulary
        self.get_vocabulary()


    def w2v_model(self, max_length):
        """
        Splits the data into train and validation sets.
        Constructs the word2vec model. (Feature vector)
        """
        X = self.data['tokens']
        y = self.data['funny']

        print("X shape: ", X.shape)
        print("y shape: ", y.shape)
        # split the data into train and validation sets and make them random
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

        w2v_model = Word2Vec(X_train, vector_size=max_length, min_count=1, window=self.context_window)

        w2v_model.train(X_train, total_examples=len(X_train), epochs=10)

        vocab = w2v_model.wv.index_to_key
        print("Vocabulary size: ", len(vocab))

        # self.word_vec = {}
        for word in vocab:
            self.word_vec[word] = w2v_model.wv.get_vector(word)
        
        # print(word_vec['like'])
        print("The no of key-value pairs : ",len(self.word_vec))


    def LSTM_model(self, max_length):
        # Create a padded sequence of the joke
        # tokeniser = Tokenizer()
        self.tokeniser.fit_on_texts(self.data['tokens'])
        vocab_size = len(self.tokeniser.word_index) + 1
        print("Vocabulary size: ", vocab_size)

        # pad the sequences to the same length
        ret = self.tokeniser.texts_to_sequences(self.data['tokens'])

        data_padded = pad_sequences(ret, maxlen=max_length, padding='post')
        print("Data padded shape: ", data_padded.shape)

        # Create the embeddings matrix
        embedding_matrix = np.zeros((vocab_size, max_length))
        for word, i in self.tokeniser.word_index.items():
            embedding_vector = self.word_vec.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        # print(embedding_matrix[15])

        Y = self.data['funny']
        x_train, x_test, y_train, y_test = train_test_split(data_padded, 
                                                            Y, test_size=config['test_p'], 
                                                            random_state=42)

        # Building the model
        self.model = Sequential()
        self.model.add(Embedding(input_dim=vocab_size, output_dim=max_length, input_length=max_length, 
                            weights=[embedding_matrix], trainable=False))
        self.model.add(Dropout(config['Dropout_layer'][0]))
        self.model.add(LSTM(config['LSTM_layer'][1], activation=config['activation'])) #dropout=config['Dropout_layer'][1], recurrent_dropout=0.2))
        self.model.add(Dropout(config['Dropout_layer'][1]))
        self.model.add(Dense(units=1, activation=config['activation']))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        self.model.summary()

        # Train the model
        self.model.fit(x_train, y_train,epochs=config['epochs'], batch_size=config['batch_size'], verbose='auto', validation_split=config['val_p'])
        self.model.save(config['model_path']) # save the model
        self.evaluate_model(x_test, y_test, max_length) # evaluate the model

    def evaluate_model(self, X_test, y_test, max_length):
        """
        Evaluates the model.
        """
        scores = self.model.evaluate(X_test, y_test)
        print("Accuracy: %.2f%%" % (scores[1] * 100))

        # Print Precision and Recall
        y_pred = self.model.predict(X_test)
        y_pred = np.round(y_pred)

        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print("Precision: %.2f%%" % (precision * 100))
        print("Recall: %.2f%%" % (recall * 100))
        print("F1-Score: %.2f%%" % (f1 * 100))

        # Save data
        self.save_data(max_length, scores, precision, recall, f1)

    def save_data(self, max_length, accuracy, precision, recall, f1):
        """
        Saves the data.
        """
        # Add in dataframe master_df max_len, accuracy, precision, recall, f1-score
        ret = pd.DataFrame(columns=['max_len', 'accuracy', 'precision', 'recall', 'f1-score'])
        ret.loc[0] = [max_length, accuracy, precision, recall, f1]
        ret.to_csv(config['data_path'])
        
        # save the tokenizer
        with open(config['tokenizer_path'], 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Read Data and pre-process it.

In [19]:
# SETTINGS for local machine - change this for Goolg Colab
path = "dataset/final_jokes(1283).csv"
# path = "/content/drive/MyDrive/NLU_Humor-detection/final_jokes(1283).csv"

joke_model = LSTM_model(path)
# joke_model.data = 
joke_model.read_dataset()
joke_model.preprocess_text()

joke_model.get_max_tokens() # get the maximum number of tokens. Since we need the word2vec feature vector to be of the same size for all jokes. 
max_length_joke = joke_model.data['max_tokens'].max()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['max_tokens'][index] = count


### Construct the word2vec embedding vector and train the LSTM model

In [20]:
joke_model.w2v_model(max_length_joke)
joke_model.LSTM_model(max_length_joke)

X shape:  (1283,)
y shape:  (1283,)
Vocabulary size:  3547
The no of key-value pairs :  3547
Vocabulary size:  4647
Data padded shape:  (1283, 332)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 332, 332)          1542804   
                                                                 
 dropout_4 (Dropout)         (None, 332, 332)          0         
                                                                 
 lstm_2 (LSTM)               (None, 100)               173200    
                                                                 
 dropout_5 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 1,716,105
Trainable para

AttributeError: 'LSTM_model' object has no attribute 'tokenizer'