In [82]:
import kagglehub

# Load and preprocess the dataset
path = kagglehub.dataset_download("abhinavmoudgil95/short-jokes")

# Print the path where the dataset is saved
print("Path to dataset files:", path)


Path to dataset files: /kaggle/input/short-jokes


In [83]:
import pandas as pd

# Load and preprocess the dataset
jokes_data = pd.read_csv(path + "/shortjokes.csv")

# Display the first few rows of the dataset
print(jokes_data.head())


   ID                                               Joke
0   1  [me narrating a documentary about narrators] "...
1   2  Telling my daughter garlic is good for you. Go...
2   3  I've been going through a really rough period ...
3   4  If I could have dinner with anyone, dead or al...
4   5     Two guys walk into a bar. The third guy ducks.


In [84]:
# Check the structure of the dataset
print(jokes_data.info())

# Display the first few rows
print(jokes_data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231657 entries, 0 to 231656
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ID      231657 non-null  int64 
 1   Joke    231657 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.5+ MB
None
   ID                                               Joke
0   1  [me narrating a documentary about narrators] "...
1   2  Telling my daughter garlic is good for you. Go...
2   3  I've been going through a really rough period ...
3   4  If I could have dinner with anyone, dead or al...
4   5     Two guys walk into a bar. The third guy ducks.


In [85]:
jokes_data = jokes_data.sample(n=5000, random_state=42)

In [86]:
jokes_data.loc[4510]

ID                                                   4511
Joke    What do all battered women have in common? The...
Name: 4510, dtype: object

In [87]:
jokes_data['Joke'] = jokes_data['Joke'].str.strip()


In [88]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


In [89]:
# Tokenize and create sequences

tokenizer = Tokenizer()

In [90]:
tokenizer.fit_on_texts(jokes_data['Joke'])

In [91]:
len(token.word_index)

10485

In [92]:
input_sequences=[]
for sentence in jokes_data['Joke']:
    tokenized_sentence = token.texts_to_sequences([sentence])[0]
    for i in range(1, len(tokenized_sentence)):
        input_sequences.append(tokenized_sentence[:i+1])


In [93]:
max_len = max([len(x) for x in input_sequences])
max_len

44

In [94]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [95]:
# Prepare data for training

X = padded_input_sequences[:,:-1]

In [96]:
y = padded_input_sequences[:,-1]

In [97]:
X.shape

(83483, 43)

In [98]:
y.shape

(83483,)

In [99]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=10486)

In [100]:
y.shape

(83483, 10486)

In [101]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [102]:
# Build and train the model

model = Sequential()
model.add(Embedding(10486, 100, input_length=43))
model.add(LSTM(150))
model.add(Dense(10486, activation='softmax'))



In [103]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [104]:
model.summary()

In [105]:
history = model.fit(
    X, y,
    epochs=100,
    batch_size=64,
    #validation_split=0.2,  # Automatically split 20% of data for validation
    #callbacks=[early_stopping],
    verbose=1
)


Epoch 1/100
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 11ms/step - accuracy: 0.0411 - loss: 7.3326
Epoch 2/100
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.0788 - loss: 6.4679
Epoch 3/100
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.1158 - loss: 5.9483
Epoch 4/100
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.1410 - loss: 5.5362
Epoch 5/100
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.1648 - loss: 5.1509
Epoch 6/100
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.1835 - loss: 4.8109
Epoch 7/100
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.2060 - loss: 4.4826
Epoch 8/100
[1m1305/1305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.2355 - loss: 4.1551


In [106]:
# Text prediction example

import numpy as np
import time
text = "women"

for i in range(30):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  padded_token_text = pad_sequences([token_text], maxlen=44, padding='pre')
  # predict
  pos = np.argmax(model.predict(padded_token_text))

  for word,index in tokenizer.word_index.items():
    if index == pos:
      text = text + " " + word
      print(text)
      time.sleep(2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
women are
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
women are like
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
women are like stars
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
women are like stars at
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
women are like stars at first
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
women are like stars at first they
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
women are like stars at first they are
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
women are like stars at first they are small
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
women are like stars at first they are small and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
women are l