<a href="https://colab.research.google.com/github/paulcodrea/reddit_humor/blob/main/2a_keras-tokenize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
import emoji

import nltk
nltk.download('punkt')

from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout 
from keras.layers import LSTM, Embedding

from sklearn.model_selection import train_test_split

from nltk.tokenize import word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import precision_score, recall_score, f1_score


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paulc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# We might have to change the following

config = {
    "learning_rate": 0.001,
    "epochs": 3, 
    "batch_size": 70,
    "test_p": 0.2,
    "val_p": 0.1,
    "LSTM_layer": [50, 100],
    "Dropout_layer": [0.15, 0.2],
    "activation": 'softmax',
}

In [16]:
class LSTM_model:
    def __init__(self, path):
        """
        Initializes the class.
        """
        self.path = path # Path to the dataset
        self.data = pd.DataFrame() # Dataframe to store the dataset

        # self.context_window = 3 # Context window size
        # self.w2v_feature_vector = []
        self.vocabulary_size = 0

        self.jokes_to_numerical = []
        self.model = None


    def read_dataset(self):
        """
        Reads the dataset from the given path.
        """
        ret = pd.read_csv(self.path)
        ret.drop(columns=['Unnamed: 0'], inplace=True)

        # randomize the data set and take the first 10 rows
        ret = ret.sample(frac=1, ignore_index=True)#.head(10)
        self.data = ret


    def emoji_tokenizer(self, text):
        """
        Tokenizes the text. Removes the emojis.
        """
        return [c for c in text if c not in emoji.UNICODE_EMOJI]

        
    def preprocess_text(self):
        """
        Preprocesses the text data.
        """
        # Remove URLs.
        self.data['text'] = self.data['joke'].apply(lambda x: ' '.join(word for word in x.split() if not word.startswith('http')))

        # Remove rows if text contains "[ Removed" or "[Removed".
        self.data = self.data[self.data['text'].str.contains("[ Removed") == False]
        self.data = self.data[self.data['text'].str.contains("[Removed") == False]

        # Remove emojis.
        self.data['text'] = self.data['text'].apply(self.emoji_tokenizer)

        self.data['tokens'] = self.data['text'].apply(word_tokenize)
        # self.data['tokens'] = self.data['joke'].apply(word_tokenize) # tokenize the text but keep the punctuation


    def get_max_tokens(self):
        """
        Returns the maximum number of tokens in the dataset.
        """
        self.data['max_tokens'] = 0
        # create a new column in the dataframe with max tokens per row. and count only if the token isalpha()
        for index, row in self.data.iterrows():
            count = 0
            for token in row['tokens']:
                if token.isalpha():
                    count += 1
            self.data['max_tokens'][index] = count


    def convert_jokes_to_numerical(self):
        """
        Converts the jokes to numerical values.
        """
        tokenizer = Tokenizer(num_words=None, split=' ')
        tokenizer.fit_on_texts(self.data['joke'].values)
        self.jokes_to_numerical = tokenizer.texts_to_sequences(self.data['joke'].values)

        # get vocabulary size
        self.vocabulary_size = len(tokenizer.word_index) + 1


    def pad_sequences(self, max_length):
        """
        Pads the sequences.
        """
        self.jokes_to_numerical = pad_sequences(self.jokes_to_numerical, maxlen=max_length, padding='post')


    def LSTM_model(self, max_length):
        """
        Splits the data into train and validation sets.
        Constructs the LSTM model.
        """
        X = self.jokes_to_numerical
        y = self.data['funny']

        print("X shape: ", X.shape)
        print("y shape: ", y.shape)

        # split the data into train and validation sets and make them random
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config['test_p'], random_state=42)

        print("X_train:", len(X_train))
        print("X_test:", len(X_test))
        print("y_train:", len(y_train))
        print("y_test:", len(y_test))

        self.model = Sequential()
        self.model.add(Embedding(input_dim=self.vocabulary_size, output_dim=max_length, input_length=int(X.shape[1])))
        self.model.add(Dropout(config['Dropout_layer'][0]))
        self.model.add(LSTM(config['LSTM_layer'][1], activation=config['activation']))
        self.model.add(Dropout(config['Dropout_layer'][1]))
        self.model.add(Dense(units=1, activation=config['activation']))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        self.model.fit(X_train, y_train, epochs=config['epochs'], batch_size=config['batch_size'], verbose='auto', validation_split=config['val_p'])

        # evaluate the model
        self.evaluate_model(X_test, y_test)


    def evaluate_model(self, X_test, y_test):
        """
        Evaluates the model.
        """
        scores = self.model.evaluate(X_test, y_test)
        print("Accuracy: %.2f%%" % (scores[1] * 100))

        # Print Precision and Recall
        y_pred = self.model.predict(X_test)
        y_pred = np.round(y_pred)
        print("Precision: %.2f%%" % (precision_score(y_test, y_pred) * 100))
        print("Recall: %.2f%%" % (recall_score(y_test, y_pred) * 100))
        print("F1-Score: %.2f%%" % (f1_score(y_test, y_pred) * 100))


### Read data and pre-process it.

In [15]:
# SETTINGS for local machine - change this for Goolg Colab
path = "dataset/final_jokes(1283).csv" 
# path = "/content/drive/MyDrive/NLU_Humor-detection/final_jokes(1283).csv"

joke_model = LSTM_model(path)
joke_model.read_dataset()

In [6]:
joke_model.preprocess_text()
joke_model.get_max_tokens() 
max_length_joke = joke_model.data['max_tokens'].max()
print("_______________________________________________________________________")
print("\nMax length of joke: ", max_length_joke)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['max_tokens'][index] = count


_______________________________________________________________________

Max length of joke:  329


### Construct the embeddings vector for the LSTM model

In [7]:
joke_model.convert_jokes_to_numerical()

joke_model.pad_sequences(max_length_joke)
print("Length of feature vector after normalisation: ", 
      len(joke_model.jokes_to_numerical[0]))


# joke_model.jokes_to_numerical[100]

Length of feature vector after normalisation:  329


In [8]:
print(joke_model.vocabulary_size)

4951


### Train the model

In [9]:
joke_model.LSTM_model(max_length_joke)

X shape:  (1283, 329)
y shape:  (1283,)
X Train shape (1026, 329)
y train shape (1026,)
X_train: 1026
X_test: 257
y_train: 1026
y_test: 257
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 56.42%
Precision: 56.42%
Recall: 100.00%
F1-Score: 72.14%
