<a href="https://colab.research.google.com/github/paulcodrea/reddit_humor/blob/main/2b_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating a tf-idf vector to represent every joke in the corpus

# Loading data from the csv:

In [91]:
# tf-idf

import csv
import nltk
from nltk import word_tokenize, RegexpTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

with open('drive/MyDrive/Humour_Detection_Dataset/final_jokes(1283).csv', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

print(data[1])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['0', "My 5 year old just ran out of her room to tell me this joke she just thought up: what did the cow say after he was fed? Moooooooooore!\n\nI've never been this proud of anything in my life.", '1']


In [92]:
print(data[1][1])

My 5 year old just ran out of her room to tell me this joke she just thought up: what did the cow say after he was fed? Moooooooooore!

I've never been this proud of anything in my life.


# Helper functions

In [93]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

def removeStopWords( words):
        """
        Remove the stop-words from a list.
        """
        filtered_words = []
        for w in words:
            if w not in stopWords:
                filtered_words.append(w)
        
        return filtered_words
def process_document(document: list, lower = True, stem = False, remove_stop_words=True) -> list:
        """
        Pre-process (lower, remove non-alphabetic words, and stem) a document 
        and return a list of its terms.
        """
        list_jokes_tokenised = []
        list_tokens = []
        for line in document:
          joke = line[1]
          tokenizer = RegexpTokenizer(r'\w+')
          tokens = tokenizer.tokenize(joke)
          if(lower):
              tokens = [w.lower() for w in tokens]
          if(stem):
              tokens = [stemmer.stem(w) for w in tokens]
          if(remove_stop_words):
              tokens = removeStopWords(tokens)
          list_jokes_tokenised.append(tokens)
          list_tokens = list_tokens + tokens

        return list_tokens, list_jokes_tokenised

def term_frequency_corpus(tokens: list):
  """
  Returns the a dictionary where the keys are the vocabulary words and the values
  are the frequency of the word in the whole corpus.
  """
  vocab_freq = {}
  vocabulary = list(dict.fromkeys(tokens))
  for word in vocabulary:
    vocab_freq[word] = 0
  for token in tokens:
    vocab_freq[token] = vocab_freq[token] + 1
  return vocab_freq

def term_frequency(vocab: dict, jokes: list):
  """
  Returns a dictionary where the keys are the vocabulary words and the values are
  a list that represents the frequency of the word in each joke divided by the 
  length of the joke.
  """
  dict_term_freq = {}
  for word in vocab.keys():
    dict_term_freq[word] = []
  for word in vocab.keys():
    for joke in jokes:
      freq = 0
      for word_index in range(len(joke)):
        if word == joke[word_index]:
          freq = freq + 1
      dict_term_freq[word].append(freq / len(joke))
  return dict_term_freq



def document_frequency(vocabulary: dict, jokes: list):
  """
  Returns a dictionary where the keys are all words in the vocabulary and the 
  values the list of jokes the word appears in.
  """
  doc_freq = {}
  for word in vocabulary.keys():
    doc_freq[word] = []
    for joke_index in range(len(jokes)):
      if word in jokes[joke_index]:
        doc_freq[word].append(joke_index)
  return doc_freq



In [94]:

list_tokens, list_jokes_tokenized = process_document(data, lower = True, stem = False, remove_stop_words=False)
# get rid of the first row (titles)
list_tokens = list_tokens[1:]
list_jokes_tokenized = list_jokes_tokenized[1:]

print(list_tokens[0])
print(list_jokes_tokenized[0])
print(len(list_jokes_tokenized))


my
['my', '5', 'year', 'old', 'just', 'ran', 'out', 'of', 'her', 'room', 'to', 'tell', 'me', 'this', 'joke', 'she', 'just', 'thought', 'up', 'what', 'did', 'the', 'cow', 'say', 'after', 'he', 'was', 'fed', 'moooooooooore', 'i', 've', 'never', 'been', 'this', 'proud', 'of', 'anything', 'in', 'my', 'life']
1283


In [95]:
print(len(list_jokes_tokenized))

1283


# TF calculation

Create TF for each word in the vocabulary:

The number of times a word appears in a document divded by the total number of words in the document. Every document has its own term frequency.

In [96]:
# take 2
# create TF (term frequency)

# the frequency of each word from the vocabulary in the corpus 
vocab_freq = term_frequency_corpus(list_tokens)
print(vocab_freq["joke"])

# number of times a word appears in a doc / total number of words in the doc
tf = term_frequency(vocab_freq, list_jokes_tokenized)

  

58


In [97]:
print(len(vocab_freq))

4708


In [98]:
print(tf["joke"])

[0.025, 0.0, 0.0, 0.025, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.019230769230769232, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.038461538461538464, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07142857142857142, 0.0, 0.08333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

# IDF calculation

Calculating IDF for each word:

The log of the number of documents divided by the number of documents that contain the word w. The IDF is computed once for all documents.



In [99]:

import math

# a list of all the docs in which the vocab words occur -> to be used in idf
dict_document_freq = document_frequency(vocab_freq, list_jokes_tokenized)
print(dict_document_freq)

idf = {}
for word in dict_document_freq.keys():
  idf[word] = math.log(len(list_jokes_tokenized)/len(dict_document_freq[word]))




# TF-IDF calculation

TF-IDF is simply the TF multiplied by IDF.

TF is a list (since a word's tf is dependent on a joke) whereas the IDF is just a value (since it represents the word in the entire corpus). The output will be a list of tf-idf values for each word, with the size = size of the corpus (number of jokes)

In [100]:
dict_tf_idf = {}

for word in vocab_freq.keys():
  dict_tf_idf[word] = []
  for tf_index in range(len(tf[word])):
    dict_tf_idf[word].append(tf[word][tf_index] * idf[word])

print(dict_tf_idf["joke"])
print(len(dict_tf_idf["joke"]))

[0.07966661127658786, 0.0, 0.0, 0.07966661127658786, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.24512803469719344, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06128200867429836, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.12256401734859672, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2276188893616796, 0.0, 0.2655553709219595, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.531110741843919, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

# Representing each joke with tf-idf values

Representing each joke by and embedding vector 

In [101]:
jokes_as_tf_idf = []
print(len(list_jokes_tokenized))
max_joke_size = 0
for joke_index in range(len(list_jokes_tokenized)):
  joke = list_jokes_tokenized[joke_index]
  tf_idf_list = []
  for word in joke:
    tf_idf_list.append(dict_tf_idf[word][joke_index])
  if len(joke) > max_joke_size:
    max_joke_size = len(tf_idf_list)
  jokes_as_tf_idf.append(tf_idf_list)

print(max_joke_size)
print(list_jokes_tokenized[0])
print(jokes_as_tf_idf[0])

1283
347
['my', '5', 'year', 'old', 'just', 'ran', 'out', 'of', 'her', 'room', 'to', 'tell', 'me', 'this', 'joke', 'she', 'just', 'thought', 'up', 'what', 'did', 'the', 'cow', 'say', 'after', 'he', 'was', 'fed', 'moooooooooore', 'i', 've', 'never', 'been', 'this', 'proud', 'of', 'anything', 'in', 'my', 'life']
[0.08402464063420628, 0.12399329468198544, 0.11122265408783566, 0.10809357551398552, 0.15230412502211627, 0.14426655008739364, 0.07784762741952717, 0.08816644091316375, 0.09228051154539775, 0.13412992238468954, 0.03487262454588649, 0.10666461516798681, 0.06096143733301355, 0.13068307911728164, 0.07966661127658786, 0.08014281615085522, 0.15230412502211627, 0.11122265408783566, 0.07615206251105813, 0.03177130832913702, 0.05685386105073164, 0.021312689054841384, 0.10960919105939637, 0.07200725613998953, 0.09947256335669227, 0.051580154095221854, 0.04801286004464218, 0.16159522960139228, 0.1789239091153909, 0.02864222975528687, 0.09747149566485386, 0.09307422900326226, 0.105312934636

In [102]:
import numpy as np
np.shape(jokes_as_tf_idf)

  result = asarray(a).shape


(1283,)

# split train - test data

select random indices for train

In [103]:
import random

indices = list(range(1, len(jokes_as_tf_idf)))

number_of_training_samples = int(len(jokes_as_tf_idf) * 0.7)
training_index_list = random.sample(indices,number_of_training_samples)

testing_index_list = list(set(indices) ^ set(training_index_list))  # we r missing one joke (bc of the int())
random.shuffle(training_index_list)
random.shuffle(testing_index_list)

In [104]:
print(jokes_as_tf_idf[0])

[0.08402464063420628, 0.12399329468198544, 0.11122265408783566, 0.10809357551398552, 0.15230412502211627, 0.14426655008739364, 0.07784762741952717, 0.08816644091316375, 0.09228051154539775, 0.13412992238468954, 0.03487262454588649, 0.10666461516798681, 0.06096143733301355, 0.13068307911728164, 0.07966661127658786, 0.08014281615085522, 0.15230412502211627, 0.11122265408783566, 0.07615206251105813, 0.03177130832913702, 0.05685386105073164, 0.021312689054841384, 0.10960919105939637, 0.07200725613998953, 0.09947256335669227, 0.051580154095221854, 0.04801286004464218, 0.16159522960139228, 0.1789239091153909, 0.02864222975528687, 0.09747149566485386, 0.09307422900326226, 0.10531293463622991, 0.13068307911728164, 0.14426655008739364, 0.08816644091316375, 0.126937870573395, 0.0459709092692855, 0.08402464063420628, 0.1386879613045384]


attempt to do it in a numpy list

In [105]:
def generate_train_test_data(index_list, corpus):
  data_list = []
  label_list = []
  for index in index_list:
    data_list.append(np.array(jokes_as_tf_idf[index]))
    label_list.append(corpus[index][2])

  data = np.array(data_list)
  labels = np.array(label_list)
  return data, labels
 

In [106]:

X_train, y_train = generate_train_test_data(training_index_list, data)
X_test, y_test = generate_train_test_data(testing_index_list, data)

print(X_train[0])
print(y_train[0])
print(X_test[0])
print(y_test[0])

[0.10590436 0.14770511 0.11066756 0.20028051 0.12013433 0.53865077
 0.59641303 0.50486201 0.33512185 0.12013433 0.30760171 0.26714272]
1
[0.14120581 0.33432604 0.08008956 0.71820102 0.79521737 0.33432604
 0.37992075 0.57900514 0.79521737]
0


  


padding the arrays


In [107]:
from keras.preprocessing.sequence import pad_sequences

jokes_to_numerical = pad_sequences(X_train, maxlen=max_joke_size, padding='post')
print(jokes_to_numerical[0])
print(X_train[0])


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0.10590436 0.14770511 0.11066756 0.20028051 0.12013433 0.53865077
 0.59641303 0.50486201 0.33512185 0.12013433 0.30760171 0.26714272]


In [108]:
print(X_train[0])
for idx, x in np.ndenumerate(X_train):
  len_x = len(X_train[idx])
  X_train[idx] = np.pad(X_train[idx], (0,max_joke_size - len_x))
# print(a)

[0.10590436 0.14770511 0.11066756 0.20028051 0.12013433 0.53865077
 0.59641303 0.50486201 0.33512185 0.12013433 0.30760171 0.26714272]


In [109]:
print(X_train[0])

[0.10590436 0.14770511 0.11066756 0.20028051 0.12013433 0.53865077
 0.59641303 0.50486201 0.33512185 0.12013433 0.30760171 0.26714272
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.    

In [110]:
print(max_joke_size)
print(len(X_train[0]))

347
347
