<a href="https://colab.research.google.com/github/paulcodrea/reddit_humor/blob/main/2b_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating a tf-idf embedding vector to represent every joke in the corpus

In [21]:
import csv
import nltk
from nltk import word_tokenize, RegexpTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
import numpy as np
import pandas as pd
from nltk.stem.porter import *
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading data from the csv:

In [22]:
path = 'drive/MyDrive/Humour_Detection_Dataset/final_jokes(2918).csv'
data = pd.read_csv(path)

print("Data successfuly read!")

print("This is the first line from the data file: \n", data.head(1))


Data successfuly read!
This is the first line from the data file: 
    Unnamed: 0                                     joke  \
0           0  What vocal group always kills? The OJs.   

                                              tokens  token_count  funny  \
0  ['What', 'vocal', 'group', 'always', 'kills', ...            7      0   

                               clean_text  
0  what vocal group always kills the ojs   


# Helper functions

The functions below enable the data from the csv to be transformed in variables used to compute the tf and idf.

In [23]:
def tokenise_clean_text(data_set):
  """
  Tokenises the data from the CSV file.
  """
  arr = []
  for line in data_set['clean_text']:
    arr.append(line.split(' '))
  arr = [x[:(len(x) - 1)] for x in arr]
  return arr

def return_vocab(document) -> list:
  """
  Retuns the vocabulary of a document.
  """
  computed_vocab = []
  for line in document:
    for word in line:
      computed_vocab.append(word)
  computed_vocab = list(dict.fromkeys(computed_vocab))
  return computed_vocab

def term_frequency_corpus(vocabulary, list_jokes):
  """
  Returns the a dictionary where the keys are the vocabulary words and the values
  are the frequency of the word in the whole corpus.
  """
  vocab_freq = {}
  for word in vocabulary:
    vocab_freq[word] = 0
  for joke in list_jokes:
    for token in joke:
      vocab_freq[token] = vocab_freq[token] + 1
  return vocab_freq

def term_frequency(vocab: dict, jokes: list):
  """
  Returns a dictionary where the keys are the vocabulary words and the values are
  a list that represents the frequency of the word in each joke divided by the 
  length of the joke.
  """
  dict_term_freq = {}
  for word in vocab.keys():
    dict_term_freq[word] = []
  for word in vocab.keys():
    for joke in jokes:
      freq = 0
      for word_index in range(len(joke)):
        if word == joke[word_index]:
          freq = freq + 1
      dict_term_freq[word].append(freq / len(joke))
  return dict_term_freq

def document_frequency(vocabulary: dict, jokes: list):
  """
  Returns a dictionary where the keys are all words in the vocabulary and the 
  values the list of jokes the word appears in.
  """
  doc_freq = {}
  for word in vocabulary.keys():
    doc_freq[word] = []
    for joke_index in range(len(jokes)):
      if word in jokes[joke_index]:
        doc_freq[word].append(joke_index)
  return doc_freq



In [24]:

list_jokes_tokenized = tokenise_clean_text(data)
vocabulary = return_vocab(list_jokes_tokenized)
print("The vocabulary has been created!")
print("This is the len of the vocabulary:\n ", len(vocabulary))

print("This is the first token from the corpus, pre-processed: \n", vocabulary[0])
print("This is the first item from the tokenised list of jokes:\n", list_jokes_tokenized[0])
print("This is how many jokes the data set has:\n",len(list_jokes_tokenized))


The vocabulary has been created!
This is the len of the vocabulary:
  5778
This is the first token from the corpus, pre-processed: 
 what
This is the first item from the tokenised list of jokes:
 ['what', 'vocal', 'group', 'always', 'kills', 'the', 'ojs']
This is how many jokes the data set has:
 2918


at this point we have:

vocabulary -> all tokens that appear in the document

list_jokes_tokenized -> all jokes represented by a list of words. To be used to generate embeddings!

# Term Frequency (TF) calculation

Create TF for each word in the vocabulary:

The number of times a word appears in a document divded by the total number of words in the document. Every document has its own term frequency.

In [25]:
# the frequency of each word from the vocabulary in the corpus 
vocab_freq = term_frequency_corpus(vocabulary, list_jokes_tokenized)
print("The vocabulary with the associated frequency for each word has been generated!")
print("This is how many times the word 'joke' is in our corpus\n", vocab_freq["joke"])

  

The vocabulary with the associated frequency for each word has been generated!
This is how many times the word 'joke' is in our corpus
 127


In [26]:
vocabulary_size = len(vocab_freq)
print("This is the length of our vocabulary:\n", vocabulary_size)

This is the length of our vocabulary:
 5778


In [27]:
# Calculate tf number of times a word appears in a doc / total number of words in the doc
tf = term_frequency(vocab_freq, list_jokes_tokenized)
print("Term frequency matrix/dictionary was generated!")
print("This is the term frequency vector for word 'joke':\n", tf["joke"])

Term frequency matrix/dictionary was generated!
This is the term frequency vector for word 'joke':
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

# IDF calculation

Calculating IDF for each word:

The log of the number of documents divided by the number of documents that contain the word w. The IDF of a word is computed once for all documents.



In [28]:

import math

# create a list of all the docs in which the vocab words occur -> to be used in idf
dict_document_freq = document_frequency(vocab_freq, list_jokes_tokenized)
print("A list of all documents in which the words from our vocab appear in have been generated!")
print("The list of all documents that contain the word 'Joke' looks like this:\n",dict_document_freq["joke"] )



A list of all documents in which the words from our vocab appear in have been generated!
The list of all documents that contain the word 'Joke' looks like this:
 [38, 50, 91, 135, 193, 196, 218, 219, 228, 286, 315, 329, 339, 354, 381, 393, 440, 533, 564, 592, 699, 704, 709, 712, 715, 742, 793, 796, 805, 823, 858, 860, 866, 877, 929, 961, 984, 999, 1082, 1115, 1132, 1150, 1153, 1164, 1193, 1217, 1235, 1264, 1265, 1280, 1326, 1389, 1412, 1414, 1434, 1509, 1512, 1513, 1523, 1588, 1608, 1661, 1668, 1669, 1701, 1719, 1724, 1787, 1854, 1864, 1909, 1918, 1924, 1929, 1961, 1997, 2015, 2027, 2029, 2042, 2074, 2080, 2103, 2181, 2190, 2380, 2381, 2428, 2434, 2441, 2446, 2476, 2492, 2513, 2543, 2684, 2716, 2717, 2735, 2751, 2756, 2759, 2762, 2783, 2820, 2844, 2861, 2879, 2888]


In [29]:

idf = {}
for word in dict_document_freq.keys():
  idf[word] = math.log(len(list_jokes_tokenized)/len(dict_document_freq[word]))
print("A dictionary with each word's tf-idf values has been generated!")

A dictionary with each word's tf-idf values has been generated!


# TF-IDF calculation

TF-IDF is simply the TF multiplied by IDF.

The output will be a list of tf-idf values (word x joke)

In [30]:
dict_tf_idf = {}

for word in vocab_freq.keys():
  dict_tf_idf[word] = []
  for tf_index in range(len(tf[word])):
    dict_tf_idf[word].append(tf[word][tf_index] * idf[word])

print("The tf-idf values have beed generated! ")
print("this is how the tf-idf vector looks like for the word 'joke':\n",dict_tf_idf["joke"])
print("This is the length of each tf-idf vector. Each value represent's the word's tf-idf for each sentence:\n", len(dict_tf_idf["joke"]))

The tf-idf values have beed generated! 
this is how the tf-idf vector looks like for the word 'joke':
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6574611693707175, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8218264617133968, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16436529234267938, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.6436529234267936, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

# Representing each joke with tf-idf values

Representing each joke by an embedding vector.

In [31]:
jokes_as_tf_idf = []
max_joke_size = 0


for joke_index in range(len(list_jokes_tokenized)):
  joke = list_jokes_tokenized[joke_index]
  tf_idf_list = []
  for word in joke:
    tf_idf_list.append(dict_tf_idf[word][joke_index])
  tf_idf = np.asarray(tf_idf_list)
  jokes_as_tf_idf.append(tf_idf)

  # calculate max_joke size to use in the padding step
  if len(joke) > max_joke_size:
    max_joke_size = len(tf_idf_list)
    
jokes_as_tf_idf = np.asarray(jokes_as_tf_idf)

print("The jokes have been embedded by their tf-idf values!")
print("This is the max joke length for our corpus: ", max_joke_size)
print("This is how the first joke looks like before the embedding step: \n", list_jokes_tokenized[0])
print("This is how the tf-idf embedding of the first joke looks like:\n",jokes_as_tf_idf[0])

The jokes have been embedded by their tf-idf values!
This is the max joke length for our corpus:  134
This is how the first joke looks like before the embedding step: 
 ['what', 'vocal', 'group', 'always', 'kills', 'the', 'ojs']
This is how the tf-idf embedding of the first joke looks like:
 [0.18756714 1.04078665 0.79725121 0.55021705 0.8427446  0.38434096
 1.04078665]




In [32]:
jokes_as_tf_idf.shape

(2918,)

# split train - test data

Pad the jokes. - at the end

In [33]:
new_arr = np.zeros((len(jokes_as_tf_idf), max_joke_size))
old_arr = jokes_as_tf_idf
for idx, joke in enumerate(jokes_as_tf_idf):
  len_joke = len(joke)
  joke_x = jokes_as_tf_idf[idx]
  new_arr[idx] = np.append(joke, [0] * (max_joke_size - len_joke))

print(new_arr.shape)
jokes_as_tf_idf = new_arr

(2918, 134)


In [34]:
def generate_train_test_data(jokes_list, corpus):
  train_max_index = int(len(jokes_list) * 0.8)
  X_train = np.zeros(shape=(train_max_index, max_joke_size))
  X_test = np.zeros(shape=(len(jokes_list) - train_max_index, max_joke_size))
  index_returned_data = 0
  for index in range(0, train_max_index):
    X_train[index_returned_data] = jokes_as_tf_idf[index]
  y_train = corpus["funny"][:train_max_index]
  index_returned_data = 0
  for index in range(train_max_index, len(jokes_as_tf_idf)):
    X_test[index_returned_data] = jokes_as_tf_idf[index]
  y_test = corpus["funny"][train_max_index:]
  return X_train, X_test, y_train, y_test

In [35]:
X_train, X_test, y_train, y_test = generate_train_test_data(jokes_as_tf_idf, data)

# Trying it on the LSTM

In [36]:
import pandas as pd
import numpy as np

import nltk
nltk.download('punkt')

from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout 
from keras.layers import LSTM, Embedding

from sklearn.model_selection import train_test_split

from nltk.tokenize import word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import precision_score, recall_score, f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [37]:
config = {
    "learning_rate": 0.001,
    "epochs": 3, 
    "batch_size": 70,
    "test_p": 0.2,
    "val_p": 0.1,
    "LSTM_layer": [50, 100],
    "Dropout_layer": [0.15, 0.2],
    "activation": 'softmax',
}

In [38]:
def LSTM_model(max_length, X_train, X_test, y_train, y_test, vocabulary_size):
  """
  Splits the data into train and validation sets.

  Constructs the LSTM model.
  """

  # print("X shape: ", X.shape)
  # print("y shape: ", y.shape)

  # split the data into train and validation sets and make them random
  # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config['test_p'], random_state=42)

  print("X_train:", len(X_train))
  print("X_test:", len(X_test))
  print("y_train:", len(y_train))
  print("y_test:", len(y_test))

  print(X_train.shape)
  print(y_train.shape)

  model = Sequential()
  model.add(Embedding(input_dim=vocabulary_size, output_dim=max_length, input_length=int(X_train.shape[1])))
  model.add(Dropout(config['Dropout_layer'][0]))
  model.add(LSTM(config['LSTM_layer'][1], activation=config['activation']))
  model.add(Dropout(config['Dropout_layer'][1]))
  model.add(Dense(units=1, activation=config['activation']))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  model.fit(X_train, y_train, epochs=config['epochs'], batch_size=config['batch_size'], verbose='auto', validation_split=config['val_p'])

  # Evaluate the model
  scores = model.evaluate(X_test, y_test)
  print("Accuracy: %.2f%%" % (scores[1] * 100))

  # Print Precision and Recall
  y_pred = model.predict(X_test)
  y_pred = np.round(y_pred)
  print("Precision: %.2f%%" % (precision_score(y_test, y_pred) * 100))
  print("Recall: %.2f%%" % (recall_score(y_test, y_pred) * 100))
  print("F1-Score: %.2f%%" % (f1_score(y_test, y_pred) * 100))

In [39]:
LSTM_model(max_joke_size, X_train, X_test, y_train, y_test, len(vocab_freq))

X_train: 2334
X_test: 584
y_train: 2334
y_test: 584
(2334, 134)
(2334,)
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 52.74%
Precision: 52.74%
Recall: 100.00%
F1-Score: 69.06%
