In [20]:
import os
import pickle

import nltk
import numpy as np
import pandas as pd

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paulc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
# Run settings - these may need adjusting.

config = dict(
    # Split proportions. Only "test_p" is read by the code in this notebook
    # (as the test_size of train_test_split).
    train_p=0.7,
    test_p=0.2,
    val_p=0.1,
    ##################### SAVE FOR LIVE DEMO #############################
    # Where the pickled model, the pickled tokenizer and the metrics CSV are written.
    model_path='./model/3_model.pickle',
    tokenizer_path='./model/3_tokenizer.pickle',
    data_path="model/3_data.csv",
)

In [27]:
class Random_Forest:
    """
    Random forest classifier trained on padded Keras token-id sequences.

    Typical call order: read_dataset() -> convert_jokes_to_numerical() ->
    pad_sequences(max_length) -> run_model(max_length). Training also writes
    the fitted model, the tokenizer and a one-row metrics CSV to the paths
    listed in the module-level ``config`` dict.
    """

    def __init__(self, path):
        """
        Initializes the class.

        Args:
            path: Location of the CSV dataset (anything ``pd.read_csv`` accepts).
        """
        self.path = path # Path to the dataset
        self.data = pd.DataFrame() # Dataframe to store the dataset

        self.vocabulary_size = 0
        self.tokenizer = Tokenizer(num_words=None, split=' ')

        self.jokes_to_numerical = []
        self.model = None


    def read_dataset(self):
        """
        Reads the dataset from ``self.path`` into ``self.data`` and prints the
        class balance.

        The ``'Unnamed: 0'`` column (the index column ``DataFrame.to_csv``
        writes by default) is dropped when present. Rows whose ``funny`` value
        equals 0 are counted as class 0; every other row is counted as class 1.

        Raises:
            KeyError: If the CSV has no ``funny`` column.
        """
        ret = pd.read_csv(self.path)
        # errors='ignore': a CSV saved with index=False has no such column, and
        # that must not abort loading.
        ret = ret.drop(columns=['Unnamed: 0'], errors='ignore')

        # Vectorized count instead of a Python-level loop over iterrows().
        count_zero = int((ret['funny'] == 0).sum())
        count_one = len(ret) - count_zero
        print("The input has {} jokes with 0 and {} jokes with 1".format(count_zero, count_one))

        self.data = ret


    def get_max_tokens(self):
        """
        Adds a ``max_tokens`` column holding, for every row, the number of
        entries of ``tokens`` for which ``str.isalpha()`` is true.

        Returns:
            The largest per-row count, or 0 when the dataset has no rows.

        Raises:
            KeyError: If the dataset has rows but no ``tokens`` column.
        """
        if len(self.data) == 0:
            self.data['max_tokens'] = 0
            return 0

        # NOTE(review): if 'tokens' was loaded from CSV each value is probably
        # a string, in which case this counts alphabetic characters rather than
        # tokens - confirm the column really holds lists of strings.
        counts = [
            sum(1 for token in tokens if token.isalpha())
            for tokens in self.data['tokens']
        ]
        # Assign the whole column at once. The previous chained assignment
        # (self.data['max_tokens'][index] = ...) may write to a temporary copy
        # and is a no-op under pandas copy-on-write.
        self.data['max_tokens'] = counts
        return max(counts)


    def set_vocabulary_size(self, size):
        """
        Sets the vocabulary size.
        """
        self.vocabulary_size = size


    def convert_jokes_to_numerical(self):
        """
        Fits the tokenizer on the ``clean_text`` column, stores the resulting
        integer sequences in ``self.jokes_to_numerical`` and updates
        ``self.vocabulary_size``.
        """
        texts = self.data['clean_text'].values
        self.tokenizer.fit_on_texts(texts)
        self.jokes_to_numerical = self.tokenizer.texts_to_sequences(texts)

        # +1 because Keras word indices start at 1 (0 is the padding value).
        self.vocabulary_size = len(self.tokenizer.word_index) + 1


    def pad_sequences(self, max_length):
        """
        Post-pads ``self.jokes_to_numerical`` in place to ``max_length``.
        """
        self.jokes_to_numerical = self.gen_pad_sequences(self.jokes_to_numerical, max_length)


    def gen_pad_sequences(self, text, max_length):
        """
        Returns ``text`` (a list of integer sequences) post-padded to ``max_length``.
        """
        # Inside a method this name resolves to the imported Keras function,
        # not to the method of the same name on this class.
        return pad_sequences(text, maxlen=max_length, padding='post')


    def split_dataset(self, X_dataset, y_dataset, test_size=None):
        """
        Splits the dataset into training and testing sets.

        Args:
            X_dataset: Feature matrix (array-like); converted to float32.
            y_dataset: Labels (array-like); converted to float32.
            test_size: Fraction of samples held out for testing. Defaults to
                ``config['test_p']`` when omitted.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``.
        """
        if test_size is None:
            test_size = config['test_p']

        X = np.array(X_dataset, dtype=np.float32)
        y = np.array(y_dataset, dtype=np.float32)

        # Fixed random_state keeps the split reproducible between runs.
        return tuple(train_test_split(X, y, test_size=test_size, random_state=42))


    def random_forest(self, X_train, X_test, y_train, y_test, max_length_joke):
        """
        Trains a random forest model, prints its test-set metrics, stores it in
        ``self.model`` and persists everything through ``save_data``.

        Args:
            X_train, y_train: Training features and labels.
            X_test, y_test: Held-out features and labels used for the metrics.
            max_length_joke: Padding length; only recorded in the metrics CSV.
        """
        model = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=42)
        model.fit(X_train, y_train)

        # predict the test set
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print("Accuracy: {}".format(accuracy))
        print("Precision: {}".format(precision))
        print("Recall: {}".format(recall))
        print("F1: {}".format(f1))

        self.model = model
        self.save_data(max_length_joke, accuracy, precision, recall, f1)


    def run_model(self, max_length):
        """
        Splits the data into train and test sets. Trains a random forest model.

        Args:
            max_length: Padding length used for the sequences; recorded with the metrics.
        """
        X_train, X_test, y_train, y_test = self.split_dataset(X_dataset=self.jokes_to_numerical, y_dataset=self.data['funny'])
        self.random_forest(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, max_length_joke=max_length)


    def save_data(self, max_length, accuracy, precision, recall, f1):
        """
        Saves the metrics (CSV), the trained model and the tokenizer (pickles)
        to the paths given in ``config``.

        Missing parent directories are created first, so a fresh checkout
        without a ``model/`` folder does not fail after training has finished.
        """
        for key in ('data_path', 'model_path', 'tokenizer_path'):
            parent = os.path.dirname(config[key])
            if parent:  # empty when the path has no directory component
                os.makedirs(parent, exist_ok=True)

        # One-row frame: max_len, accuracy, precision, recall, f1-score
        ret = pd.DataFrame(columns=['max_len', 'accuracy', 'precision', 'recall', 'f1-score'])
        ret.loc[0] = [max_length, accuracy, precision, recall, f1]
        ret.to_csv(config['data_path'])

        with open(config['model_path'], 'wb') as handle:
            pickle.dump(self.model, handle)

        # save the tokenizer
        with open(config['tokenizer_path'], 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [28]:
# Dataset location for the local machine - change this path when running on Google Colab.
path = "dataset/final_jokes(2918).csv" 

# Build the model wrapper and load the CSV; read_dataset() also prints how many
# rows carry label 0 and how many carry label 1.
joke_model = Random_Forest(path)
joke_model.read_dataset()

The input has 1459 jokes with 0 and 1459 jokes with 1


In [29]:
# Padding length: the largest value in the dataset's 'token_count' column.
# NOTE(review): 'token_count' comes from the CSV, not from the Keras tokenizer -
# confirm both measure length the same way, otherwise sequences may be cut or over-padded.
max_length_joke = joke_model.data['token_count'].max()

# Turn each joke into a sequence of token ids, then post-pad every sequence to max_length_joke.
joke_model.convert_jokes_to_numerical()
joke_model.pad_sequences(max_length_joke)


print("Max length of joke: ", max_length_joke)
print("Vocabulary size: ", joke_model.vocabulary_size)

Max length of joke:  134
Vocabulary size:  5777


In [30]:
# Split the padded sequences, train and evaluate the random forest, then save the
# model, tokenizer and metrics to the paths in config.
joke_model.run_model(max_length_joke)

Accuracy: 0.660958904109589
Precision: 0.6212534059945504
Recall: 0.794425087108014
F1: 0.6972477064220183
