<a href="https://colab.research.google.com/github/paulcodrea/reddit_humor/blob/main/2a_keras-tokenize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
import pandas as pd
import numpy as np
# import emoji
import re
import pickle

import nltk
nltk.download('punkt')

from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout 
from keras.layers import LSTM, Embedding

from sklearn.model_selection import train_test_split

from nltk.tokenize import word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import precision_score, recall_score, f1_score


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paulc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [76]:
# from google.colab import drive
# drive.mount('/content/drive')

In [77]:
# We might have to change the following

config = {
    "learning_rate": 0.001,
    "epochs": 5, 
    "batch_size": 50,
    "test_p": 0.2,
    "val_p": 0.1,
    "LSTM_layer": [50, 100],
    "Dropout_layer": [0.15, 0.2],
    "activation": 'softmax',
    ##################### SAVE FOR LIVE DEMO #############################
    "model_path": './model/2a_model.h5',
    "tokenizer_path": './model/2a_tokenizer.pickle',
    "data_path": "model/2a_data.csv",
}

In [78]:
class LSTM_model:
    def __init__(self, path):
        """
        Initializes the class.
        """
        self.path = path # Path to the dataset
        self.data = pd.DataFrame() # Dataframe to store the dataset
        # self.master_df = pd.DataFrame()

        # self.context_window = 3 # Context window size
        # self.w2v_feature_vector = []
        self.vocabulary_size = 0
        self.tokenizer = Tokenizer(num_words=None, split=' ')

        self.jokes_to_numerical = []
        self.model = None


    def read_dataset(self):
        """
        Reads the dataset from the given path.
        """
        ret = pd.read_csv(self.path)
        ret.drop(columns=['Unnamed: 0'], inplace=True)

        # randomize the data set and take the first 10 rows
        ret = ret.sample(frac=1, ignore_index=True)#.head(10)
        self.data = ret

        
    def preprocess_text(self):
        """
        Preprocesses the text data.
        """
        emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags=re.UNICODE)


        # Remove URLs.
        self.data['text'] = self.data['joke'].apply(lambda x: ' '.join(word for word in x.split() if not word.startswith('http')))
        self.data['tokens'] = self.data['text'].apply(word_tokenize)
        self.data['tokens'] = self.data['tokens'].apply(lambda x: ' '.join(word for word in x if not re.search(emoji_pattern, word))) # remove emojis
        self.data['tokens'] = self.data['tokens'].apply(word_tokenize) # tokenize the text again


    def get_max_tokens(self):
        """
        Returns the maximum number of tokens in the dataset.
        """
        self.data['max_tokens'] = 0
        # create a new column in the dataframe with max tokens per row. and count only if the token isalpha()
        for index, row in self.data.iterrows():
            count = 0
            for token in row['tokens']:
                if token.isalpha():
                    count += 1
            self.data['max_tokens'][index] = count

    def set_vocabulary_size(self, size):
        """
        Sets the vocabulary size.
        """
        self.vocabulary_size = size


    def convert_jokes_to_numerical(self):
        """
        Converts the jokes to numerical values.
        """
        self.tokenizer.fit_on_texts(self.data['joke'].values)
        self.jokes_to_numerical = self.tokenizer.texts_to_sequences(self.data['joke'].values)

        # get vocabulary size
        self.vocabulary_size = len(self.tokenizer.word_index) + 1


    def pad_sequences(self, max_length):
        """
        Pads the sequences.
        """
        self.jokes_to_numerical = pad_sequences(self.jokes_to_numerical, maxlen=max_length, padding='post')


    def gen_pad_sequences(self, text, max_length):
        """
        Generates padded sequences.
        """
        return pad_sequences(text, maxlen=max_length, padding='post')

    def split_dataset(self, X_dataset, y_dataset):
        """
        Splits the dataset into training and testing sets.
        """
        X = X_dataset
        y = y_dataset
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config['test_p'], random_state=42)

        return X_train, X_test, y_train, y_test


    def LSTM_model(self, max_length):
        """
        Splits the data into train and validation sets.
        Constructs the LSTM model.
        """
        X_train, X_test, y_train, y_test = self.split_dataset(X_dataset=self.jokes_to_numerical, y_dataset=self.data['funny'])

        self.model = Sequential()
        self.model.add(Embedding(input_dim=self.vocabulary_size, output_dim=max_length, input_length=int(X_train.shape[1])))
        self.model.add(Dropout(config['Dropout_layer'][0]))
        self.model.add(LSTM(config['LSTM_layer'][1], activation=config['activation']))
        self.model.add(Dropout(config['Dropout_layer'][1]))
        self.model.add(Dense(units=1, activation=config['activation']))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        self.model.fit(X_train, y_train, epochs=config['epochs'], batch_size=config['batch_size'], verbose='auto', validation_split=config['val_p'])

        self.model.save(config['model_path']) # save the model
        self.evaluate_model(X_test, y_test, max_length) # evaluate the model


    def evaluate_model(self, X_test, y_test, max_length):
        """
        Evaluates the model.
        """
        scores = self.model.evaluate(X_test, y_test)
        print("Accuracy: %.2f%%" % (scores[1] * 100))

        # Print Precision and Recall
        y_pred = self.model.predict(X_test)
        y_pred = np.round(y_pred)

        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print("Precision: %.2f%%" % (precision * 100))
        print("Recall: %.2f%%" % (recall * 100))
        print("F1-Score: %.2f%%" % (f1 * 100))

        # Save data
        self.save_data(max_length, scores[1], precision, recall, f1)

    def save_data(self, max_length, accuracy, precision, recall, f1):
        """
        Saves the data.
        """
        # Add in dataframe master_df max_len, accuracy, precision, recall, f1-score
        ret = pd.DataFrame(columns=['max_len', 'accuracy', 'precision', 'recall', 'f1-score'])
        ret.loc[0] = [max_length, accuracy, precision, recall, f1]
        ret.to_csv(config['data_path'])
        
        # save the tokenizer
        with open(config['tokenizer_path'], 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Read data and pre-process it.

In [79]:
# SETTINGS for local machine - change this for Goolg Colab
path = "dataset/final_jokes(1283).csv" 
#path = "/content/drive/MyDrive/NLU_Humor-detection/final_jokes(1283).csv"

joke_model = LSTM_model(path)
joke_model.read_dataset()

In [80]:
# Preprocess the text. THIS SHOULD BE REMOVED AFTER THE DATASET IS CREATED.
joke_model.preprocess_text()
joke_model.get_max_tokens()
max_length_joke = joke_model.data['max_tokens'].max()

# Padding the embedding vector
joke_model.convert_jokes_to_numerical()
joke_model.pad_sequences(max_length_joke)

# Set vocab size
# joke_model.set_vocabulary_size(len(vocab))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['max_tokens'][index] = count


### Construct the embeddings vector for the LSTM model

In [81]:
# print("_______________________________________________________________________")
print("Max length of joke: ", max_length_joke)
print("Vocabulary size: ", joke_model.vocabulary_size)
# print("\nHead of dataframe: \n", joke_model.data.head())
# print("_______________________________________________________________________")

Max length of joke:  329
Vocabulary size:  4951


### Train the model & Save the model (for live demo)

In [82]:
joke_model.LSTM_model(max_length_joke)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 61.09%
Precision: 61.09%
Recall: 100.00%
F1-Score: 75.85%
