**The Dad Joke Generator**

An information retrieval algorithm that I created as my final project for the Information Extraction course at university.

I worked with a dad joke dataset found on Kaggle: https://www.kaggle.com/datasets/oktayozturk010/reddit-dad-jokes & 

a short jokes dataset, found on Kaggle as well: https://www.kaggle.com/datasets/abhinavmoudgil95/short-jokes

In [1]:
import pandas as pd

In [2]:
#Loading the datasets

df_shortjokes = pd.read_csv('shortjokes.csv')

df_dadjokes = pd.read_csv('reddit_dadjokes.csv')

In [3]:
#Removing unwanted columns

df_dadjokes = df_dadjokes.drop(['url','score','date', 'author'], axis=1)

df_dadjokes.head()

Unnamed: 0,joke
0,"Doctor: ""So, you're telling me that you have a..."
1,A grizzly kept talking to me and annoyed me He...
2,I rubbed mayonnaise on my eyes Oh fuck oh shit...
3,What do you say to encourage an asteroid? Go l...
4,They always ask me why my mood is always negat...


In [4]:
#Importing modules for preprocessing pipeline

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patriciagrigor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/patriciagrigor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/patriciagrigor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
#A class for the preprocessing pipeline which can be reused & adapted for several documents

class PreprocessingPipeline:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stemming = PorterStemmer()
        self.punctuation = string.punctuation
    
    #Converting text into tokens
    def tokenize(self, text):
        return word_tokenize(text)
    
    #Converting the tokens to lowercase
    def case_fold(self, token):
        return token.lower()
    
    #Removing stop-words
    def remove_stop_words(self, token):
        if token is not None and token not in self.stop_words:
            return token
        return None

    #Removing unwanted characters
    def remove_unwanted_characters(self, token):
        if token is not None and not token.isalpha():
            return None
        return token
    
    #Lemmatizing tokens
    def lemmatize(self,token):
        lemmatized_token = self.lemmatizer.lemmatize(token)
        return lemmatized_token
   
    def token_stemmer(self,token):
        stemmed_token = self.stemming.stem(token)
        return stemmed_token

    #Preprocessing text by applying all steps from above
    def preprocess_text(self, text):
        tokens = self.tokenize(text)
        preprocessed_tokens = []
        for token in tokens:
            token = self.case_fold(token)
            token = self.remove_stop_words(token)
            token = self.remove_unwanted_characters(token)
            
            if token is not None:
                token = self.lemmatize(token)
                #token = self.token_stemmer(token)
                preprocessed_tokens.append(token)
        
        return preprocessed_tokens

In [7]:
short_jokes = df_shortjokes['Joke'].to_list()
dad_jokes = df_dadjokes['joke'].to_list()

all_jokes = short_jokes + dad_jokes

print(len(all_jokes))

447985


In [8]:
preprocessor = PreprocessingPipeline()

#Trying out the preprocessing pipeline & comparing it to the original text

preprocessed_jokes = [preprocessor.preprocess_text(joke) for joke in all_jokes]

for joke in preprocessed_jokes[:5]:
    print(joke)

['narrating', 'documentary', 'narrator', 'ca', 'hear', 'saying', 'cuz', 'talking']
['telling', 'daughter', 'garlic', 'good', 'good', 'immune', 'system', 'keep', 'pest', 'mosquito', 'vampire', 'men']
['going', 'really', 'rough', 'period', 'work', 'week', 'fault', 'swapping', 'tampax', 'sand', 'paper']
['could', 'dinner', 'anyone', 'dead', 'alive', 'would', 'choose', 'alive']
['two', 'guy', 'walk', 'bar', 'third', 'guy', 'duck']


In [9]:
from gensim.models import Word2Vec

In [10]:
model = Word2Vec(preprocessed_jokes, vector_size=300, window=5, min_count=1)

In [11]:
# Save the model
model.save("word2vec_jokes_model.bin")

In [12]:
# Load the model
model = Word2Vec.load("word2vec_jokes_model.bin")

In [13]:
word_embeddings = model.wv

In [14]:
import numpy as np

# Convert word embeddings to NumPy array
embedding_matrix = np.array([word_embeddings[word] for word in word_embeddings.index_to_key])


In [15]:
import faiss

# Initialize FAISS index
index = faiss.IndexFlatL2(embedding_matrix.shape[1])  # L2 distance metric

# Add embeddings to the index
index.add(embedding_matrix)

In [16]:
faiss.write_index(index, "word2vec_index.faiss")
word_embeddings.save("word2vec_word_embeddings.kv")

In [17]:
index_file = 'word2vec_index.faiss'
index = faiss.read_index(index_file)

In [18]:
from rank_bm25 import BM25Okapi

# Initialize BM25Okapi with the joke corpus
bm25 = BM25Okapi(preprocessed_jokes)

#Normal query

In [19]:
query = "Corona virus"  # Example query keyword

# Preprocess the query keyword
query_tokens = preprocessor.preprocess_text(query)

# Get BM25 scores for the jokes based on the query
bm25_scores = bm25.get_scores(query_tokens)

In [20]:
ranked_jokes_full = sorted(zip(all_jokes, bm25_scores), key=lambda x: x[1], reverse=True)

In [21]:
# Sort the jokes based on BM25 scores in descending order
ranked_jokes = sorted(zip(preprocessed_jokes, bm25_scores), key=lambda x: x[1], reverse=True)


In [22]:
print(ranked_jokes[0:3])

[(['single', 'person', 'like', 'corona', 'corona', 'corona', 'virus'], 21.66872211038533), (['corona', 'hold', 'virus'], 21.219569152196243), (['corona', 'hold', 'virus'], 21.219569152196243)]


In [23]:
print(ranked_jokes_full[0:3])

[('A single person be like: Corona vir-you + corona vir-me = corona virus.', 21.66872211038533), ('I’ll have a Corona... hold the virus!', 21.219569152196243), ('I’ll have a corona... Hold the virus!', 21.219569152196243)]


Expanded query (with most_similar)

In [38]:
query = "virus"  # Example query keyword

# Preprocess the query keyword
query_tokens = preprocessor.preprocess_text(query)

query_embeddings = [model.wv[token] for token in query_tokens]

similar_terms = []
for token in query_tokens:
    similar_tokens = model.wv.most_similar(token, topn=3)  # Adjust the number of similar terms as desired
    similar_terms.extend([sim_term[0] for sim_term in similar_tokens])

expanded_query = query_tokens + similar_terms


# Get BM25 scores for the jokes based on the query
bm25_scores = bm25.get_scores(expanded_query)

In [39]:
# Sort the jokes based on BM25 scores in descending order
expanded_ranked_jokes = sorted(zip(all_jokes, bm25_scores), key=lambda x: x[1], reverse=True)

In [40]:
print(expanded_ranked_jokes[0:5])

[('Apparently the corona virus has been around since the 1800s But it used to spread by COVID wagons', 22.307948314746458), ('A single person be like: Corona vir-you + corona vir-me = corona virus.', 21.66872211038533), ('The National Center for Disease Control just downgraded the coronavirus. They say it’s a corona lite virus now.', 21.53273564512857), ('What’s the difference between Covid 19 and Romeo &amp; Juliet? ..One’s a Corona virus and the other’s a Verona crisis.', 21.28338691183202), ('I’ll have a Corona... hold the virus!', 21.219569152196243)]


To visualize my information extraction system, I created an interface using JupyterDash.

In [25]:
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State

The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html


In [28]:
app = JupyterDash(__name__)

external_stylesheets = [
    "https://cdnjs.cloudflare.com/ajax/libs/normalize/7.0.0/normalize.min.css",
    "https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css",
    "https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&display=swap",
    "style.css"
]

app = JupyterDash(__name__, external_stylesheets=external_stylesheets)

app.layout = html.Div(
    className="container",
    style={
        "display": "flex",
        "justify-content": "center",
        "align-items": "center",
        "height": "100vh",
        "background-image": "url('assets/harold.jpg')",  # Specify the URL or path to your background image
        "background-size": "cover",
        "background-position": "center"
    },
    children=[
        html.Div(
            className="content",
            children=[
                html.H1(
                    "The Dad Joke Generator",
                    style={"font-family": "Comic Sans MS", "font-weight": "bold", "margin-bottom": "20px"}  # Set the font to Comic Sans
                ),
                dcc.Input(
                    id="query-input",
                    type="text",
                    placeholder="generate a dad joke containing this word",
                    style={"width": "400px", "margin-bottom": "10px"},
                    className="input-field"  # Add a custom CSS class
                ),
                html.Button("generate", id="search-button", n_clicks=0),
                html.Div(
                    id="result-output",
                    style={
                        'border': '1px solid black',
                        'padding': '10px',
                        'margin-top': '20px',
                        'background-color': 'white',
                        "font-family": "Comic Sans MS"
                    }
                )

            ]
        )
    ]
)

@app.callback(
    Output('result-output', 'children'),
    [Input('search-button', 'n_clicks')],
    [State('query-input', 'value')]
)
def perform_search(n_clicks, query):
    query_tokens = preprocessor.preprocess_text(query)

    query_embeddings = [model.wv[token] for token in query_tokens]

    similar_terms = []
    for token in query_tokens:
        similar_tokens = model.wv.most_similar(token, topn=3)  # Adjust the number of similar terms as desired
        similar_terms.extend([sim_term[0] for sim_term in similar_tokens])

    expanded_query = query_tokens + similar_terms

    bm25 = BM25Okapi(preprocessed_jokes)
    bm25_scores = bm25.get_scores(expanded_query)
    expanded_ranked_jokes = sorted(zip(all_jokes, bm25_scores), key=lambda x: x[1], reverse=True)

    results = []
    try:
        for rank, (joke, score) in enumerate(expanded_ranked_jokes[:5], 1):  # Print top 3 most relevant jokes
            result = html.P(f"Joke {rank}: {joke}")
            results.append(result)
    except IndexError:
        results.append("I'm out of inspiration. Someone find me a dad!")

    return results

if __name__ == '__main__':
    app.run_server(mode='inline')

Dash is running on http://127.0.0.1:8050/



[1;31m---------------------------------------------------------------------------[0m
[1;31mTypeError[0m                                 Traceback (most recent call last)
Cell [1;32mIn[7], line 41[0m, in [0;36mPreprocessingPipeline.preprocess_text[1;34m(
    self=<__main__.PreprocessingPipeline object>,
    text=None
)[0m
[0;32m     40[0m [38;5;28;01mdef[39;00m [38;5;21mpreprocess_text[39m([38;5;28mself[39m, text):
[1;32m---> 41[0m     tokens [38;5;241m=[39m [38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43mtokenize[49m[43m([49m[43mtext[49m[43m)[49m
        text [1;34m= None[0m[1;34m
        [0mself [1;34m= <__main__.PreprocessingPipeline object at 0x178f6b130>[0m
[0;32m     42[0m     preprocessed_tokens [38;5;241m=[39m []
[0;32m     43[0m     [38;5;28;01mfor[39;00m token [38;5;129;01min[39;00m tokens:

Cell [1;32mIn[7], line 12[0m, in [0;36mPreprocessingPipeline.tokenize[1;34m(
    self=<__main__.PreprocessingPipeline object>,
    text