<a href="https://colab.research.google.com/github/paulcodrea/reddit_humor/blob/main/live_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
!pip3 install pickle5
!pip3 install termcolor



In [38]:
## THIS iS A PYTHON SCRIPT TO RUN THE CODE FOR LIVE DEMO
import os
import re
import math
import pickle5 as pickle
import nltk
import numpy as np
import pandas as pd

from termcolor import colored
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
class humour_live_demo:
    def __init__(self, path):
        self.path = path
        self.tokenizer = None
        self.model = None
        self.data_df = pd.DataFrame()
        
        self.max_length = None
        self.corpus_size = None
        self.input_vec = []


    def pre_process(self, text):
        """
        Pre-processes the input and returns the output. 
        Removes stopwords, punctuation, and emojis.
        """
        text = re.sub(r'http\S+', '', text) # remove links
        text = re.sub(r'[^\w\s]','', text) # remove punctuation

        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "]+", flags=re.UNICODE)

        text = emoji_pattern.sub(r'', text) # remove emoji
        text = ' '.join([word for word in word_tokenize(text) if word not in stopwords]) # remove stopwords
        text = text.lower()
        ret = word_tokenize(text)

        return ret


    def process_input(self, text):
        """
        Processes the input and returns the output numerical vector.
        """
        input = self.pre_process(text)
        input = [word for word in input if word.isalpha()]
        input = [word for word in input if not word.startswith("http")]
        print("Clean input: ", ' '.join(input))

        input_numerical = self.tokenizer.texts_to_sequences(input) # Convert to numerical
        input_numerical = [item for sublist in input_numerical for item in sublist] # Flatten the list
        input_numerical = np.array([input_numerical]) # Convert to numpy array
        input_numerical = pad_sequences(input_numerical, maxlen=int(self.max_length)) # Pad the input
        input_numerical = np.array(input_numerical, dtype=np.float32)
        
        self.input_vec = input_numerical # Set the input vector


    def process_tf_idf(self, text):
        """
        Processes the tf-idf dataframe and returns the output. 
        """
        input = self.pre_process(text)

        # Get the tf-idf vector
        # Create the word frequency.
        word_freq = {}
        for word in input:
            word_freq[word] = 0
        for word in input:
            word_freq[word] = word_freq[word] + 1

        # Create the tf vector
        tf = {}
        for word in input:
            tf[word] = word_freq[word] / len(input)

        # Computing idf
        idf = {}
        for word in input:
            if word in self.tokenizer.keys():
                idf[word] = math.log((self.corpus_size + 1)/ (len(self.tokenizer[word]) + 1))
            else:
                idf[word] = 0

        # compute tf-idf
        tf_idf = []
        for word in input:
            tf_idf.append(tf[word] * idf[word])

        input_numerical = np.array([tf_idf]) # Convert to numpy array
        input_numerical = pad_sequences(input_numerical, maxlen=self.max_length,
                                        dtype=np.float32) # Pad the input

        self.input_vec = input_numerical


    def run_model(self, model_name, dataset_name):
        """
        Runs the model and returns the output
        """
        output = self.model.predict(self.input_vec)
        output = np.round(output)
        print("\nModel predicts: ", int(output))

        self.legend(model_name, dataset_name)
        # print(legend)

        if int(output) == 0:
            print(colored("The input is predicted as - not a joke", "red"))
        else:
            print(colored("The input is predicted as - a joke", "green"))


    def legend(self, model_name, dataset_name):
        """
        Prints the legend. This is to explain the model to the user.
        """
        final_name = ""
        # if model contains the word "tf-idf"
        if "2a" in model_name:
            final_name = "LSTM trained on word_id embedding"
        elif "2b" in model_name:
            final_name = "LSTM trained on tf-idf embedding"
        elif "2c" in model_name:
            final_name = "LSTM trained on word2vec embedding"
        elif "3" in model_name:
            final_name = "Random Forest trained on word_id embedding"
        else:
            final_name = "Model not found"
        
        if "1a" in dataset_name:
            final_name = final_name + " and dataset contains dadjokes, BadJokes (no stopwords and case folding)"
        elif "1b" in dataset_name:
            final_name = final_name + " and dataset contains dadjokes, BadJokes (with stopwords and case folding)"
        elif "2a" in dataset_name:
            final_name = final_name + " and dataset contains dadjokes, facts (no stopwords and no case folding)"
        elif "2b" in dataset_name:
            final_name = final_name + " and dataset contains dadjokes, facts (with stopwords and case folding)"
        
        print(colored(final_name, "red"))

    def run_script(self, input):
        """
        This is a function to run the script.
        """
        print("\n______________________________________________________________")
        for file in os.listdir(self.path):
            if file.endswith(".h5"):
                # THIS IS FOR LSTM, word_id and word2vec-------------------------
                if file.startswith("2a") or file.startswith("2c"):
                    model_name = file[:2] # parse file name to get first 2 characters from file name
                    dataset_name = file[2:-8] # parse file name to get the dataset name

                    # 1. Read Model
                    try: 
                        self.model = load_model(self.path + file)
                        print("\n\nModel loaded: " + file)
                    except Exception as e:
                        print(e)
                        print("Model not loaded")
                        continue

                    # 2. Read Data 
                    try:
                        self.data_df = pd.read_csv(self.path + model_name 
                                                          + dataset_name + 'data.csv')
                        print("Data loaded: " + model_name + '_data.csv')
                    except:
                        print("No data found")
                        continue

                    # 3. Read Pickle file
                    try:
                        with open(self.path + model_name + dataset_name 
                                  + 'tokenizer.pickle', 'rb') as handle:
                            self.tokenizer = pickle.load(handle)
                        print("Tokenizer loaded: " + model_name + '_tokenizer.pickle')
                    except:
                        print("No tokenizer found")
                        continue
                    
                    # Save Max Length
                    self.max_length = int(self.data_df['max_len'][0])

                    # 4. Process the input
                    self.process_input(ret)
                    self.run_model(model_name, dataset_name)

                 # THIS IS FOR LSTM tf-idf ------------------------------------
                elif file.startswith("2b-") and file.endswith("model.h5"):
                    model_name = file[:2]
                    dataset_name = file[2:-8] # parse file name to get the dataset name


                    # 1. Read Model
                    try:
                        self.model = load_model(self.path + file)
                        print("\n\nModel loaded: " + file)
                    except Exception as e:
                        print(e)
                        print("Model not loaded")
                        continue

                    # 2. Read Data
                    try:
                        self.data_df = pd.read_csv(self.path + model_name 
                                                          + dataset_name + 'data.csv')
                        print("Data loaded: " + model_name + '_data.csv')
                    except:
                        print("No data found")
                        continue

                    # 3. Read Pickle file
                    try:
                        with open(self.path + model_name + dataset_name 
                                  + 'tokenizer.pickle', 'rb') as handle:
                            self.tokenizer = pickle.load(handle)
                        print("Tokenizer loaded: " + model_name + dataset_name
                              + 'tokenizer.pickle')
                    except:
                        print("No tokenizer found")
                        continue

                    # Save Max Length
                    self.max_length = int(self.data_df['max_len'][0])
                    self.corpus_size = int(self.data_df['corpus_size'][0])

                    # 4. Process the input
                    self.process_tf_idf(ret)
                    self.run_model(model_name, dataset_name)


            # THIS IS FOR RANDOM FOREST ----------------------------------------
            elif file.startswith("3-") and file.endswith("model.pickle"):
                model_name = file[:1]
                dataset_name = file[1:-12] # parse file name to get the dataset name

                # 1. Read Model
                try: 
                    with open(self.path + model_name + dataset_name 
                              + 'model.pickle', 'rb') as handle:
                        self.model = pickle.load(handle)
                    print("\n\nModel loaded: " + file)
                except Exception as e:
                    print(e)
                    print("Model not loaded")
                    continue

                # 2. Read Data 
                try:
                    self.data_df = pd.read_csv(self.path + model_name 
                                                      + dataset_name + 'data.csv')
                    print("Data loaded: " + model_name + '_data.csv')
                except:
                    print("No data found")
                    continue

                # 3. Read Pickle file
                try:
                    with open(self.path + model_name + dataset_name 
                              + 'tokenizer.pickle', 'rb') as handle:
                        self.tokenizer = pickle.load(handle)
                    print("Tokenizer loaded: " + model_name + '_tokenizer.pickle')
                except:
                    print("No tokenizer found")
                    continue
                
                # Save Max Length
                self.max_length = int(self.data_df['max_len'][0])

                # 4. Process the input
                self.process_input(ret)
                self.run_model(model_name, dataset_name)

        print("______________________________________________________________")
        print("\n")

### Add the path to the models

In [55]:
path = "/content/drive/MyDrive/NLU_Humour-detection/COMP34812/best_models/"
live_session = humour_live_demo(path)

### Get input from the user and predict if the input is funny or not

In [56]:
while True:
    ret = input("Enter a sentence: ") # Read input from user

    # Check if the user wants to exit
    if ret == "exit":
      print("End of model...")
      break

    live_session.run_script(ret)

Enter a sentence: Helvetica and Times New Roman walk into a bar. “Get out of here!” shouts the bartender. “We don’t serve your type.”

______________________________________________________________


Model loaded: 2a-dataset_2a_model.h5
Data loaded: 2a_data.csv
Tokenizer loaded: 2a_tokenizer.pickle
Clean input:  helvetica times new roman walk bar get shouts bartender we dont serve type

Model predicts:  1
[31mLSTM trained on word_id embedding and dataset contains dadjokes, facts (no stopwords and no case folding)[0m
[32mThe input is predicted as - a joke[0m


Model loaded: 2c-dataset_2b_model.h5
Data loaded: 2c_data.csv
Tokenizer loaded: 2c_tokenizer.pickle
Clean input:  helvetica times new roman walk bar get shouts bartender we dont serve type

Model predicts:  0
[31mLSTM trained on word2vec embedding and dataset contains dadjokes, facts (with stopwords and no case folding)[0m
[31mThe input is predicted as - not a joke[0m


Model loaded: 3-dataset_2a_model.pickle
Data loaded: 

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations



Model predicts:  1
[31mRandom Forest trained on word_id embedding and dataset contains dadjokes, facts (no stopwords and no case folding)[0m
[32mThe input is predicted as - a joke[0m


Model loaded: 2b-dataset_2a_model.h5
Data loaded: 2b_data.csv
Tokenizer loaded: 2b-dataset_2a_tokenizer.pickle

Model predicts:  0
[31mLSTM trained on tf-idf embedding and dataset contains dadjokes, facts (no stopwords and no case folding)[0m
[31mThe input is predicted as - not a joke[0m
______________________________________________________________


Enter a sentence: exit
End of model...
