In [1]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the Q&A jokes dataset (columns: ID, Question, Answer) into a DataFrame
jokes_csv_path = "/kaggle/input/qa-jokes/jokes.csv"
jokes_df = pd.read_csv(jokes_csv_path)

In [3]:
# Inspect the schema: per the output below, Answer has 38266 non-null values out of 38269 rows
jokes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38269 entries, 0 to 38268
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        38269 non-null  int64 
 1   Question  38269 non-null  object
 2   Answer    38266 non-null  object
dtypes: int64(1), object(2)
memory usage: 897.1+ KB


In [4]:
# Build a question -> answer lookup from the DataFrame.
# Rows whose Answer is missing are dropped first (jokes_df.info() reports 3 of them),
# so a lookup can never hand back NaN as a punchline.
# If a question occurs more than once, the last answered occurrence wins (dict semantics).
answered_jokes = jokes_df.dropna(subset=["Answer"])
joke_corpus = dict(zip(answered_jokes["Question"], answered_jokes["Answer"]))

In [5]:
# Download the NLTK resources that preprocess() relies on: the tokenizer models
# used by nltk.word_tokenize and the English stopword list.
# NOTE(review): NLTK >= 3.9 loads the tokenizer from "punkt_tab" rather than "punkt";
# both are requested so the notebook runs on either. On older releases the
# "punkt_tab" request is expected to just report an unknown package — confirm.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once.

    The result is memoized on the function object so that repeated calls to
    preprocess() do not ask NLTK for the word list again.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


# Define a function to preprocess text
def preprocess(text):
    """Normalize a piece of text for TF-IDF vectorization.

    Steps: replace every non-letter character with a space, lowercase,
    tokenize with ``nltk.word_tokenize``, drop English stopwords, and join
    the remaining tokens with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty if no non-stopword tokens remain.
    """
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords. The stopword set is fetched once, outside the
    # comprehension: previously stopwords.words("english") was called for
    # every single token and each membership test scanned a list.
    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a sentence
    return " ".join(tokens)

# Clean every question once; iterating the dict yields its keys in insertion order,
# so row i of the matrix below corresponds to the i-th key of joke_corpus
processed_corpus = list(map(preprocess, joke_corpus))

# Learn the TF-IDF vocabulary from the cleaned questions and vectorize them in one pass
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus)

In [8]:
# Function to get the most similar joke and its punchline
def get_most_similar_joke(query, tfidf_matrix):
    """Find the stored joke question closest to ``query`` and its answer.

    The query is cleaned with ``preprocess``, vectorized with the module-level
    ``tfidf_vectorizer``, and compared against every row of ``tfidf_matrix``
    by cosine similarity. Row ``i`` of the matrix is taken to correspond to
    the ``i``-th key of the module-level ``joke_corpus``.

    Args:
        query: The user's free-text question.
        tfidf_matrix: TF-IDF matrix of the preprocessed corpus questions.

    Returns:
        A ``(question, response)`` tuple. If the query has nothing in common
        with any stored question (all similarities are zero, e.g. an empty or
        out-of-vocabulary query) or the matrix is empty, ``question`` is
        ``None`` and ``response`` is the fallback message. If the matched
        question has no usable (string) answer, the matched question is
        returned with the fallback message.
    """
    fallback = "I don't have a joke for that question."

    query = preprocess(query)
    query_vec = tfidf_vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix)

    # With an all-zero similarity row argmax() would silently pick index 0,
    # i.e. always the first joke; report "no match" instead.
    if cosine_similarities.size == 0 or cosine_similarities.max() <= 0:
        return None, fallback

    # The similarity array has a single row, so the flat argmax is the column
    # (= corpus row) index.
    most_similar_joke_question_index = cosine_similarities.argmax()
    most_similar_question = list(joke_corpus)[most_similar_joke_question_index]

    response = joke_corpus.get(most_similar_question, fallback)
    # A missing Answer cell is loaded by pandas as NaN (a float), not a string.
    if not isinstance(response, str):
        response = fallback
    return most_similar_question, response

In [10]:
# Example usage: look up a joke for a fixed sample query and show only the punchline
user_question = "Tell me a joke about chickens."
most_similar_question, response = get_most_similar_joke(
    query=user_question,
    tfidf_matrix=tfidf_matrix,
)
print(response)

What do chickens say? Cock-a-doodle-doo What does a chicken in the bathroom say? Doodle-doodle-cock What does a gay chicken say? Any-cock'll-do


In [14]:
# Prompt the user for a query
user_question = input("Enter your joke-related question: ")

# Retrieve the closest matching question together with its punchline
most_similar_question, response = get_most_similar_joke(
    query=user_question,
    tfidf_matrix=tfidf_matrix,
)

# Show the matched question followed by its answer, separated by a space
print(most_similar_question, response, sep=" ")

Enter your joke-related question:  Tell me a joke about baby


Should I have a baby after 25? No, 25 is enough
