<a href="https://colab.research.google.com/github/paulcodrea/reddit_humor/blob/main/2b_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating a tf-idf vector to represent every joke in the corpus

# Loading data from the csv:

In [101]:
# tf-idf

import csv
import nltk
from nltk import word_tokenize, RegexpTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
import numpy as np


with open('drive/MyDrive/Humour_Detection_Dataset/final_jokes(1283).csv', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

print("Data successfuly read!")

print("This is the first line from the data file: \n", data[1])
print("This is the formatted joke: \n", data[1][1])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Data successfuly read!
This is the first line from the data file: 
 ['0', "My 5 year old just ran out of her room to tell me this joke she just thought up: what did the cow say after he was fed? Moooooooooore!\n\nI've never been this proud of anything in my life.", '1']
This is the formatted joke: 
 My 5 year old just ran out of her room to tell me this joke she just thought up: what did the cow say after he was fed? Moooooooooore!

I've never been this proud of anything in my life.


# Helper functions

In [102]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

def removeStopWords( words):
        """
        Remove the stop-words from a list.
        """
        filtered_words = []
        for w in words:
            if w not in stopWords:
                filtered_words.append(w)
        return filtered_words

def process_document(document: list, lower = True, stem = False, remove_stop_words=True) -> list:
        """
        Pre-process (lower, remove non-alphabetic words, and stem) a document 
        and return a list of its terms.
        """
        list_jokes_tokenised = []
        list_tokens = []
        for line in document:
          joke = line[1]
          tokenizer = RegexpTokenizer(r'\w+')
          tokens = tokenizer.tokenize(joke)
          if(lower):
              tokens = [w.lower() for w in tokens]
          if(stem):
              tokens = [stemmer.stem(w) for w in tokens]
          if(remove_stop_words):
              tokens = removeStopWords(tokens)
          list_jokes_tokenised.append(tokens)
          list_tokens = list_tokens + tokens
        return list_tokens, list_jokes_tokenised

def term_frequency_corpus(tokens: list):
  """
  Returns the a dictionary where the keys are the vocabulary words and the values
  are the frequency of the word in the whole corpus.
  """
  vocab_freq = {}
  vocabulary = list(dict.fromkeys(tokens))
  for word in vocabulary:
    vocab_freq[word] = 0
  for token in tokens:
    vocab_freq[token] = vocab_freq[token] + 1
  return vocab_freq

def term_frequency(vocab: dict, jokes: list):
  """
  Returns a dictionary where the keys are the vocabulary words and the values are
  a list that represents the frequency of the word in each joke divided by the 
  length of the joke.
  """
  dict_term_freq = {}
  for word in vocab.keys():
    dict_term_freq[word] = []
  for word in vocab.keys():
    for joke in jokes:
      freq = 0
      for word_index in range(len(joke)):
        if word == joke[word_index]:
          freq = freq + 1
      dict_term_freq[word].append(freq / len(joke))
  return dict_term_freq



def document_frequency(vocabulary: dict, jokes: list):
  """
  Returns a dictionary where the keys are all words in the vocabulary and the 
  values the list of jokes the word appears in.
  """
  doc_freq = {}
  for word in vocabulary.keys():
    doc_freq[word] = []
    for joke_index in range(len(jokes)):
      if word in jokes[joke_index]:
        doc_freq[word].append(joke_index)
  return doc_freq



In [103]:

list_tokens, list_jokes_tokenized = process_document(data, lower = True, stem = False, remove_stop_words=False)
print("The data has been changed into tokens!")
# get rid of the first row (titles)
list_tokens = list_tokens[1:]
list_jokes_tokenized = list_jokes_tokenized[1:]

print("This is the first token from the corpus, pre-processed: \n", list_tokens[0])
print("This is the first item from the tokenised list of jokes:\n", list_jokes_tokenized[0])
print("This is how many jokes the data set has:\n",len(list_jokes_tokenized))


The data has been changed into tokens!
This is the first token from the corpus, pre-processed: 
 my
This is the first item from the tokenised list of jokes:
 ['my', '5', 'year', 'old', 'just', 'ran', 'out', 'of', 'her', 'room', 'to', 'tell', 'me', 'this', 'joke', 'she', 'just', 'thought', 'up', 'what', 'did', 'the', 'cow', 'say', 'after', 'he', 'was', 'fed', 'moooooooooore', 'i', 've', 'never', 'been', 'this', 'proud', 'of', 'anything', 'in', 'my', 'life']
This is how many jokes the data set has:
 1283


at this point we have:

list_tokens -> to be used to generate the vocab

list_jokes_tokenized -> to be used to generate embeddings!

# TF calculation

Create TF for each word in the vocabulary:

The number of times a word appears in a document divded by the total number of words in the document. Every document has its own term frequency.

In [104]:
# take 2
# create TF (term frequency)

# the frequency of each word from the vocabulary in the corpus 
vocab_freq = term_frequency_corpus(list_tokens)
print("The vocabulary with the associated frequency for each word has been generated!")
print("This is how many times the word 'joke' is in our corpus\n", vocab_freq["joke"])

  

The vocabulary with the associated frequency for each word has been generated!
This is how many times the word 'joke' is in our corpus
 58


In [105]:
vocabulary_size = len(vocab_freq)
print("This is the length of our vocabulary:\n", vocabulary_size)

This is the length of our vocabulary:
 4708


In [106]:
# Calculate tf number of times a word appears in a doc / total number of words in the doc
tf = term_frequency(vocab_freq, list_jokes_tokenized)
print("Term frequency matrix/dictionary was generated!")
print("This is the term frequency vector for word 'joke':\n", tf["joke"])

Term frequency matrix/dictionary was generated!
This is the term frequency vector for word 'joke':
 [0.025, 0.0, 0.0, 0.025, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.019230769230769232, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.038461538461538464, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07142857142857142, 0.0, 0.08333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

# IDF calculation

Calculating IDF for each word:

The log of the number of documents divided by the number of documents that contain the word w. The IDF is computed once for all documents.



In [107]:

import math

# create a list of all the docs in which the vocab words occur -> to be used in idf
dict_document_freq = document_frequency(vocab_freq, list_jokes_tokenized)
print("A list of all documents in which the words from our vocab appear in have been generated!")
print("The list of all documents that contain the word 'Joke' looks like this:\n",dict_document_freq["joke"] )



A list of all documents in which the words from our vocab appear in have been generated!
The list of all documents that contain the word 'Joke' looks like this:
 [0, 3, 36, 54, 92, 133, 135, 163, 213, 222, 228, 243, 256, 291, 355, 467, 493, 557, 568, 590, 597, 599, 617, 664, 728, 732, 753, 818, 822, 832, 833, 880, 909, 958, 963, 975, 1003, 1036, 1044, 1051, 1060, 1084, 1089, 1096, 1106, 1160, 1179, 1186, 1200, 1213, 1241, 1271, 1273]


In [108]:

idf = {}
for word in dict_document_freq.keys():
  idf[word] = math.log(len(list_jokes_tokenized)/len(dict_document_freq[word]))
print("A dictionary with each word's tf-idf values has been generated!")

A dictionary with each word's tf-idf values has been generated!


# TF-IDF calculation

TF-IDF is simply the TF multiplied by IDF.

The output will be a list of tf-idf values (word x joke)

In [109]:
dict_tf_idf = {}

for word in vocab_freq.keys():
  dict_tf_idf[word] = []
  for tf_index in range(len(tf[word])):
    dict_tf_idf[word].append(tf[word][tf_index] * idf[word])

print("The tf-idf values have beed generated! ")
print("this is how the tf-idf vector looks like for the word 'joke':\n",dict_tf_idf["joke"])
print("This is the length of each tf-idf vector. Each value represent's the word's tf-idf for each sentence:\n", len(dict_tf_idf["joke"]))

The tf-idf values have beed generated! 
this is how the tf-idf vector looks like for the word 'joke':
 [0.07966661127658786, 0.0, 0.0, 0.07966661127658786, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.24512803469719344, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06128200867429836, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12256401734859672, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2276188893616796, 0.0, 0.2655553709219595, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

# Representing each joke with tf-idf values

Representing each joke by and embedding vector 

In [110]:
jokes_as_tf_idf = []
max_joke_size = 0
print(type(list_jokes_tokenized))
for joke_index in range(len(list_jokes_tokenized)):
  joke = list_jokes_tokenized[joke_index]
  tf_idf_list = []
  for word in joke:
    tf_idf_list.append(dict_tf_idf[word][joke_index])
  if len(joke) > max_joke_size:
    max_joke_size = len(tf_idf_list)
  tf_idf = np.asarray(tf_idf_list)
  jokes_as_tf_idf.append(tf_idf)
jokes_as_tf_idf = np.asarray(jokes_as_tf_idf)
print(type(jokes_as_tf_idf))
print(type(jokes_as_tf_idf[0]))
print("The jokes have been embedded by their tf-idf values!")
print("This is the max joke length for our corpus: ", max_joke_size)
print("This is how the first joke looks like before the embedding step: \n", list_jokes_tokenized[0])
print("This is how the tf-idf embedding of the first joke looks like:\n",jokes_as_tf_idf[0])

<class 'list'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
The jokes have been embedded by their tf-idf values!
This is the max joke length for our corpus:  347
This is how the first joke looks like before the embedding step: 
 ['my', '5', 'year', 'old', 'just', 'ran', 'out', 'of', 'her', 'room', 'to', 'tell', 'me', 'this', 'joke', 'she', 'just', 'thought', 'up', 'what', 'did', 'the', 'cow', 'say', 'after', 'he', 'was', 'fed', 'moooooooooore', 'i', 've', 'never', 'been', 'this', 'proud', 'of', 'anything', 'in', 'my', 'life']
This is how the tf-idf embedding of the first joke looks like:
 [0.08402464 0.12399329 0.11122265 0.10809358 0.15230413 0.14426655
 0.07784763 0.08816644 0.09228051 0.13412992 0.03487262 0.10666462
 0.06096144 0.13068308 0.07966661 0.08014282 0.15230413 0.11122265
 0.07615206 0.03177131 0.05685386 0.02131269 0.10960919 0.07200726
 0.09947256 0.05158015 0.04801286 0.16159523 0.17892391 0.02864223
 0.0974715  0.09307423 0.10531293 0.13068308 0.14426655 0.08816644

  del sys.path[0]


In [111]:
jokes_as_tf_idf.shape

(1283,)

# split train - test data

select random indices for train

In [112]:
import random

indices = list(range(1, len(jokes_as_tf_idf)))

number_of_training_samples = int(len(jokes_as_tf_idf) * 0.7)
training_index_list = random.sample(indices,number_of_training_samples)

testing_index_list = list(set(indices) ^ set(training_index_list))  # we r missing one joke (bc of the int())
random.shuffle(training_index_list)
random.shuffle(testing_index_list)

In [113]:
print(jokes_as_tf_idf[0])

[0.08402464 0.12399329 0.11122265 0.10809358 0.15230413 0.14426655
 0.07784763 0.08816644 0.09228051 0.13412992 0.03487262 0.10666462
 0.06096144 0.13068308 0.07966661 0.08014282 0.15230413 0.11122265
 0.07615206 0.03177131 0.05685386 0.02131269 0.10960919 0.07200726
 0.09947256 0.05158015 0.04801286 0.16159523 0.17892391 0.02864223
 0.0974715  0.09307423 0.10531293 0.13068308 0.14426655 0.08816644
 0.12693787 0.04597091 0.08402464 0.13868796]


Pad the jokes

In [118]:
new_arr = np.zeros((len(jokes_as_tf_idf), max_joke_size))

for idx, joke in enumerate(jokes_as_tf_idf):
  len_joke = len(joke)
  joke_x = jokes_as_tf_idf[idx]
  new_arr[idx] = np.append(joke, [0] * (max_joke_size - len_joke))

print(new_arr.shape)
jokes_as_tf_idf = new_arr

(1283, 347)


In [115]:
# print(jokes_as_tf_idf[0])

In [121]:
def generate_train_test_data(jokes_as_tf_idf, index_list, corpus):
  returned_data = np.zeros(shape=(len(index_list), max_joke_size))
  labels = np.zeros(shape=(len(index_list), 1))
  index_returned_data = 0
  for index in index_list:
    returned_data[index_returned_data] = jokes_as_tf_idf[index]
    labels[index_returned_data] = corpus[index][2]
    index_returned_data += 1

  return returned_data, labels
 

In [123]:

X_train, y_train = generate_train_test_data(jokes_as_tf_idf, training_index_list, data)
X_test, y_test = generate_train_test_data(jokes_as_tf_idf, testing_index_list, data)

# print(X_train[0])
# print(y_train[0])
# print(X_test[0])
# print(y_test[0])

# Trying it on the LSTM

In [124]:
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')

from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout 
from keras.layers import LSTM, Embedding

from sklearn.model_selection import train_test_split

from nltk.tokenize import word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import precision_score, recall_score, f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [125]:
config = {
    "learning_rate": 0.001,
    "epochs": 3, 
    "batch_size": 70,
    "test_p": 0.2,
    "val_p": 0.1,
    "LSTM_layer": [50, 100],
    "Dropout_layer": [0.15, 0.2],
    "activation": 'softmax',
}

In [128]:
def LSTM_model(max_length, X_train, X_test, y_train, y_test, vocabulary_size):
  """
  Splits the data into train and validation sets.

  Constructs the LSTM model.
  """

  # print("X shape: ", X.shape)
  # print("y shape: ", y.shape)

  # split the data into train and validation sets and make them random
  # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config['test_p'], random_state=42)

  print("X_train:", len(X_train))
  print("X_test:", len(X_test))
  print("y_train:", len(y_train))
  print("y_test:", len(y_test))

  print(X_train.shape)
  print(y_train.shape)

  model = Sequential()
  model.add(Embedding(input_dim=vocabulary_size, output_dim=max_length, input_length=int(X_train.shape[1])))
  model.add(Dropout(config['Dropout_layer'][0]))
  model.add(LSTM(config['LSTM_layer'][1], activation=config['activation']))
  model.add(Dropout(config['Dropout_layer'][1]))
  model.add(Dense(units=1, activation=config['activation']))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  model.fit(X_train, y_train, epochs=config['epochs'], batch_size=config['batch_size'], verbose='auto', validation_split=config['val_p'])

  # Evaluate the model
  scores = model.evaluate(X_test, y_test)
  print("Accuracy: %.2f%%" % (scores[1] * 100))

  # Print Precision and Recall
  y_pred = model.predict(X_test)
  y_pred = np.round(y_pred)
  print("Precision: %.2f%%" % (precision_score(y_test, y_pred) * 100))
  print("Recall: %.2f%%" % (recall_score(y_test, y_pred) * 100))
  print("F1-Score: %.2f%%" % (f1_score(y_test, y_pred) * 100))

In [129]:
LSTM_model(max_joke_size, X_train, X_test, y_train, y_test, len(vocab_freq))

X_train: 898
X_test: 384
y_train: 898
y_test: 384
(898, 347)
(898, 1)
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 52.86%
Precision: 52.86%
Recall: 100.00%
F1-Score: 69.17%
