<a href="https://colab.research.google.com/github/marabaso/ie423/blob/main/task8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initialize

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Google Drive that holds the joke dataset
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/data/joke'

# Joke texts, keyed by JokeId
df_joke = pd.read_csv(f'{DATA_DIR}/JokeText.csv')

# User ratings: one row per joke, one column per user, split over two files
df_rate1 = pd.read_csv(f'{DATA_DIR}/UserRatings1.csv')
df_rate2 = pd.read_csv(f'{DATA_DIR}/UserRatings2.csv')

## Explore and prepare data

In [None]:
df_joke.head()

Unnamed: 0,JokeId,JokeText
0,0,"A man visits the doctor. The doctor says ""I ha..."
1,1,This couple had an excellent relationship goin...
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...
3,3,Q. What's the difference between a man and a t...
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...


In [None]:
df_joke.isna().sum()

JokeId      0
JokeText    0
dtype: int64

In [None]:
df_rate2.head()

Unnamed: 0,JokeId,User36711,User36712,User36713,User36714,User36715,User36716,User36717,User36718,User36719,...,User73412,User73413,User73414,User73415,User73416,User73417,User73418,User73419,User73420,User73421
0,0,,,,3.93,,,,,,...,,,,,,,,,,
1,1,,,,,,,,,4.81,...,,,,,,,,,,
2,2,,,,,,,,,,...,,,,,,,,,,
3,3,,,,,,,,,,...,,,,,,,,,,
4,4,5.68,1.07,8.11,-2.33,-5.83,8.2,-5.83,1.94,0.1,...,3.64,4.32,6.99,-9.66,-8.4,-0.63,9.51,-7.67,-1.6,8.3


In [None]:
df_joke.shape, df_rate1.shape, df_rate2.shape

((100, 2), (100, 36711), (100, 36712))

In [None]:
# Put the two rating files side by side, aligned on the joke id
merged_rates = df_rate1.merge(df_rate2, how='outer', on='JokeId')

# Attach the joke text to the combined rating columns
final_df = df_joke.merge(merged_rates, how='outer', on='JokeId')

In [None]:
final_df.head()

Unnamed: 0,JokeId,JokeText,User1,User2,User3,User4,User5,User6,User7,User8,...,User73412,User73413,User73414,User73415,User73416,User73417,User73418,User73419,User73420,User73421
0,0,"A man visits the doctor. The doctor says ""I ha...",5.1,-8.79,-3.5,7.14,-8.79,9.22,-4.03,3.11,...,,,,,,,,,,
1,1,This couple had an excellent relationship goin...,4.9,-0.87,-2.91,-3.88,-0.58,9.37,-1.55,0.92,...,,,,,,,,,,
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...,1.75,1.99,-2.18,-3.06,-0.58,-3.93,-3.64,7.52,...,,,,,,,,,,
3,3,Q. What's the difference between a man and a t...,-4.17,-4.61,-0.1,0.05,8.98,9.27,-6.99,0.49,...,,,,,,,,,,
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...,5.15,5.39,7.52,6.26,7.67,3.45,5.44,-0.58,...,3.64,4.32,6.99,-9.66,-8.4,-0.63,9.51,-7.67,-1.6,8.3


In [None]:
final_df.isna().sum()

JokeId        0
JokeText      0
User1         0
User2         0
User3         0
             ..
User73417    84
User73418    85
User73419    85
User73420    85
User73421    85
Length: 73423, dtype: int64

In [None]:
# Number of missing entries in every column
missing_counts = final_df.isna().sum()

# Labels of the columns that are completely filled
is_complete = (missing_counts == 0).to_numpy()
columns_no_missing = missing_counts.index[is_complete]

# How many completely filled columns there are
num_columns_no_missing = columns_no_missing.size

print(f"Number of columns with zero missing values: {num_columns_no_missing}")


Number of columns with zero missing values: 14118


Since we have lots of missing data in our dataframe, we need to handle it. We will try different approaches when building recommendations.

## Build Recommendations

### 1. Content Based Filtering

The idea here is to determine how similar the jokes are based on the terms used in their texts, while ignoring commonly used words. We then recommend other jokes with similar texts. To do this, **TF-IDF vectorization** is used.

#### Prepare data

##### Filling with 0

In [None]:
# Variant 1: replace every missing rating with 0.
# NOTE(review): 0 is itself a value on the -10..10 rating scale declared for
# the Surprise Reader later in this notebook, so imputed and genuine zeros
# cannot be told apart afterwards.
df_f0 = final_df.fillna(value=0)
df_f0.head()

Unnamed: 0,JokeId,JokeText,User1,User2,User3,User4,User5,User6,User7,User8,...,User73412,User73413,User73414,User73415,User73416,User73417,User73418,User73419,User73420,User73421
0,0,"A man visits the doctor. The doctor says ""I ha...",5.1,-8.79,-3.5,7.14,-8.79,9.22,-4.03,3.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,This couple had an excellent relationship goin...,4.9,-0.87,-2.91,-3.88,-0.58,9.37,-1.55,0.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...,1.75,1.99,-2.18,-3.06,-0.58,-3.93,-3.64,7.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Q. What's the difference between a man and a t...,-4.17,-4.61,-0.1,0.05,8.98,9.27,-6.99,0.49,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...,5.15,5.39,7.52,6.26,7.67,3.45,5.44,-0.58,...,3.64,4.32,6.99,-9.66,-8.4,-0.63,9.51,-7.67,-1.6,8.3


##### Dropping columns that are less than 75 % filled

In [None]:
x = final_df.isna().sum()  # missing entries per column
y = x.gt(0)                # True where a column has at least one gap
y.sum() / x.size           # share of columns with any missing value

0.807716927938112

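Since most of the columns (about 81 %) include at least one missing value, dropping every such column would discard most of the users. Instead, we keep the columns that are at least 75 % filled and fill their remaining gaps with 0.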

In [None]:
# Minimum number of non-missing entries a column needs to be kept (75 % of the rows)
t = int(len(final_df) * 0.75)

# Drop the sparse columns, then fill the remaining gaps with 0.
# Chaining returns a fresh DataFrame; filling the dropna() result in place
# is what raised the SettingWithCopyWarning.
df_dropped = final_df.dropna(axis=1, thresh=t).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dropped.fillna(0,inplace=True)


In [None]:
df_dropped.head()

Unnamed: 0,JokeId,JokeText,User1,User2,User3,User4,User5,User6,User7,User8,...,User18794,User18795,User18796,User18797,User18798,User18799,User18800,User18801,User18802,User18803
0,0,"A man visits the doctor. The doctor says ""I ha...",5.1,-8.79,-3.5,7.14,-8.79,9.22,-4.03,3.11,...,-5.29,5.73,1.99,3.01,-0.63,-0.78,8.06,2.82,0.19,-9.47
1,1,This couple had an excellent relationship goin...,4.9,-0.87,-2.91,-3.88,-0.58,9.37,-1.55,0.92,...,-5.97,2.09,-9.51,8.69,-3.45,-9.71,-7.62,0.24,-8.54,-9.47
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...,1.75,1.99,-2.18,-3.06,-0.58,-3.93,-3.64,7.52,...,0.68,3.25,-2.18,3.4,-1.12,0.0,3.93,0.0,-0.15,-9.37
3,3,Q. What's the difference between a man and a t...,-4.17,-4.61,-0.1,0.05,8.98,9.27,-6.99,0.49,...,-5.53,-3.69,-9.17,8.2,-1.26,-4.47,3.45,0.0,-6.46,-9.22
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...,5.15,5.39,7.52,6.26,7.67,3.45,5.44,-0.58,...,0.87,4.03,4.03,7.52,2.72,2.33,-7.96,-2.09,8.79,-1.36


#### Build Model

##### 1

In [None]:
# Build a TF-IDF matrix of the terms (unigrams and bigrams, English stop
# words removed) that appear in each joke

from sklearn.feature_extraction.text import TfidfVectorizer

model_tv = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)
tfidf_matrix = model_tv.fit_transform(df_f0['JokeText'])
tfidf_matrix.shape

(100, 3774)

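The similarity between any two jokes $x$ and $y$ is defined as the **cosine similarity** of their TF-IDF vectors:

$$\operatorname{cosine}(x, y) = \frac{x \cdot y^{\intercal}}{\lVert x \rVert \, \lVert y \rVert}$$

Since `TfidfVectorizer` L2-normalizes every row by default, each vector has unit length, so the dot product alone already gives the cosine similarity score.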

In [None]:
# Pairwise cosine similarity between all jokes, based on their shared TF-IDF terms

from sklearn.metrics.pairwise import cosine_similarity

# With a single argument the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim.shape

(100, 100)

###### Predict

In [None]:
# Lookup tables used by the recommendation function:
#   jokes   - joke texts, in row order
#   indices - maps a joke text back to its row label in df_f0

jokes = df_f0['JokeText']
indices = pd.Series(jokes.index, index=jokes)

def get_similar_jokes(joke, top_n=30):
    """Return the jokes most similar to ``joke``, best match first.

    Looks the joke text up in the module-level ``indices`` Series, ranks the
    corresponding row of the module-level ``cosine_sim`` matrix, and returns
    the matching entries of the module-level ``jokes`` Series.

    Parameters
    ----------
    joke : str
        Exact joke text; must be a label of ``indices``.
    top_n : int, default 30
        Maximum number of similar jokes to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` joke texts in descending order of similarity. The
        queried joke itself is never part of the result.

    Raises
    ------
    KeyError
        If ``joke`` is not a label of ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    # A duplicated joke text makes the lookup return several rows; use the first.
    idx = int(np.atleast_1d(indices[joke])[0])
    scores = np.asarray(cosine_sim[idx])
    # Stable sort on the negated scores: descending similarity, ties keep row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query by position rather than assuming it is ranked first;
    # a tie with another joke could otherwise return the query itself.
    ranked = ranked[ranked != idx][:top_n]
    return jokes.iloc[ranked]

In [None]:
# Pick a random joke by position (bounded by the actual number of rows
# instead of a hard-coded 100) and print its ten closest matches
a_joke = df_f0['JokeText'].iloc[np.random.randint(0, len(df_f0))]
print('The joke:',a_joke)
print('Similar jokes')
similar_jokes = get_similar_jokes(a_joke).head(10)
for count, text in enumerate(similar_jokes, start=1):
    print(count,')')
    print(text)

The joke: Q: What's the difference between a Lawyer and a Plumber? 
A: A Plumber works to unclog the system.

Similar jokes
1 )
A lawyer opened the door of his BMW, when suddenly a car came along
and hit the door, ripping it off completely.  When the police arrived
at the scene, the lawyer was complaining bitterly about the damage to
his precious BMW.  
"Officer, look what they've done to my Beeeeemer!!!", he whined.  
"You lawyers are so materialistic, you make
me sick!!!"  retorted the officer.  "You're so worried about your
stupid BMW, that you didn't even notice that your left arm was ripped
off!!!"  
"Oh my gaaaad...", replied the lawyer, finally noticing the
bloody left shoulder where his arm once was.  "Where's my
Rolex???!!!!"

2 )
Q: What's the difference between the government  and  the Mafia?

A: One of them is organized.

3 )
Q: What is the difference between George  Washington, Richard Nixon,
and Bill Clinton?

A: Washington couldn't tell a lie, Nixon couldn't   tell the t

##### 2

In [None]:
# Build a TF-IDF matrix of the terms (unigrams and bigrams, English stop
# words removed) that appear in each joke.
# NOTE(review): df_dropped differs from df_f0 only in which user columns it
# keeps, so the JokeText column - and therefore this matrix - looks identical
# to the one built from df_f0; confirm this second run is needed.

from sklearn.feature_extraction.text import TfidfVectorizer

model_tv = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)
tfidf_matrix = model_tv.fit_transform(df_dropped['JokeText'])
tfidf_matrix.shape

(100, 3774)

In [None]:
# Pairwise cosine similarity between all jokes, based on their shared TF-IDF terms

from sklearn.metrics.pairwise import cosine_similarity

# With a single argument the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim.shape

(100, 100)

###### Predict

In [None]:
# Lookup tables used by the recommendation function:
#   jokes   - joke texts, in row order
#   indices - maps a joke text back to its row label in df_dropped

jokes = df_dropped['JokeText']
indices = pd.Series(jokes.index, index=jokes)

def get_similar_jokes(joke, top_n=30):
    """Return the jokes most similar to ``joke``, best match first.

    Looks the joke text up in the module-level ``indices`` Series, ranks the
    corresponding row of the module-level ``cosine_sim`` matrix, and returns
    the matching entries of the module-level ``jokes`` Series.

    Parameters
    ----------
    joke : str
        Exact joke text; must be a label of ``indices``.
    top_n : int, default 30
        Maximum number of similar jokes to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` joke texts in descending order of similarity. The
        queried joke itself is never part of the result.

    Raises
    ------
    KeyError
        If ``joke`` is not a label of ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    # A duplicated joke text makes the lookup return several rows; use the first.
    idx = int(np.atleast_1d(indices[joke])[0])
    scores = np.asarray(cosine_sim[idx])
    # Stable sort on the negated scores: descending similarity, ties keep row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query by position rather than assuming it is ranked first;
    # a tie with another joke could otherwise return the query itself.
    ranked = ranked[ranked != idx][:top_n]
    return jokes.iloc[ranked]

In [None]:
# Pick a random joke by position (bounded by the actual number of rows
# instead of a hard-coded 100) and print its ten closest matches
a_joke = df_dropped['JokeText'].iloc[np.random.randint(0, len(df_dropped))]
print('The joke:',a_joke)
print('Similar jokes')
similar_jokes = get_similar_jokes(a_joke).head(10)
for count, text in enumerate(similar_jokes, start=1):
    print(count,')')
    print(text)

The joke: A Panda bear walks into a bar.  Sits down at a table and orders a beer 
and a double cheeseburger.  After he is finished eating, he pulls out a gun
and rips the place with gunfire.  Patrons scatter and dive under chairs and
tables as the bear runs out the door.  After ensuring that no one is hurt, 
the bartender races out the door, and calls after the bear "What the hell did
you do that for?"  The bear calls back, "I'm a Panda bear.  Look it up in the
dictionary."  

The bartender returns, pulls out his dictionary.

panda : \Pan"da\, n. (Zo["o]l.)
A small Asiatic mammal (Ailurus fulgens) having fine soft fur.
It is related to the bears, and inhabits the mountains of Northern India.
Eats shoots and leaves.

Similar jokes
1 )
A horse walks into a bar. Bartender says:
"So, why the long face?"

2 )
A neutron walks into a bar and orders a drink.
"How much do I owe you?" the neutron asks.

The bartender replies, "for you, no charge."

3 )
A guy walks into a bar, orders a beer and s

### 2. Collaborative Filtering

#### Prepare data

#### Build Model

In [None]:
# Reshape to long format: one row per (joke, user) pair.
# Melt the un-imputed frame and drop the pairs without a rating, so that only
# observed ratings reach the recommender. Melting the zero-filled frame would
# present every missing entry as a real rating of 0, and filtering on
# `rating != 0` afterwards would also discard genuine ratings of exactly 0.
df_ratings = (
    final_df
    .melt(id_vars=['JokeId', 'JokeText'], var_name='userId', value_name='rating')
    .dropna(subset=['rating'])
    .reset_index(drop=True)
)

In [None]:
df_ratings.head()

Unnamed: 0,JokeId,JokeText,userId,rating
0,0,"A man visits the doctor. The doctor says ""I ha...",User1,5.1
1,1,This couple had an excellent relationship goin...,User1,4.9
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...,User1,1.75
3,3,Q. What's the difference between a man and a t...,User1,-4.17
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...,User1,5.15


In [None]:
# Prepare data into Surprise library format

!pip3 install scikit-surprise #or !conda install -c conda-forge scikit-surprise
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(-10,10))
X = Dataset.load_from_df(df_ratings[['userId', 'JokeId', 'rating']], reader)
X_train, X_test = train_test_split(X, test_size=.25)



In [None]:
# Define SVD model

from surprise import SVD

# Fixed seed so the random factor initialisation, and therefore the fitted
# model and its scores, are reproducible across runs
model_svd = SVD(random_state=42)

In [None]:
# Fit the SVD model on the training split, then predict the held-out ratings

test_pred = model_svd.fit(X_train).test(X_test)

In [None]:
# Evaluate SVD accuracy on the held-out predictions

from surprise import accuracy

# Prints the RMSE and also returns it as the cell value
accuracy.rmse(test_pred, verbose=True)

RMSE: 3.6955


3.695486022015179

In [None]:
# Tune hyperparameters with a 3-fold grid search over the SVD settings

from surprise.model_selection import GridSearchCV

param_grid = {
    'n_epochs': [5, 10, 15],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(X)

# Best RMSE score found by the search
print(gs.best_score['rmse'])

# Parameter combination that produced that RMSE
print(gs.best_params['rmse'])

3.528692795725435
{'n_epochs': 15, 'lr_all': 0.002, 'reg_all': 0.4}


In [None]:
# Cross-validate the SVD model with 5 folds, reporting RMSE and MAE
# NOTE(review): model_svd was created without the hyperparameters found by
# the grid search, so this evaluates the untuned model - confirm that is intended.

from surprise.model_selection import cross_validate

cross_validate(
    model_svd,
    X,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.7504  3.7495  3.7499  3.7524  3.7416  3.7487  0.0037  
MAE (testset)     2.6588  2.6560  2.6563  2.6585  2.6519  2.6563  0.0025  
Fit time          188.47  158.74  155.93  159.05  156.76  163.79  12.40   
Test time         21.39   30.18   21.17   22.03   27.88   24.53   3.75    


{'test_rmse': array([3.75036554, 3.74949885, 3.74985742, 3.75242468, 3.74159969]),
 'test_mae': array([2.6587822 , 2.65599755, 2.65630394, 2.65851709, 2.65190856]),
 'fit_time': (188.46678376197815,
  158.73842573165894,
  155.9326331615448,
  159.04670238494873,
  156.75541591644287),
 'test_time': (21.39285659790039,
  30.175179719924927,
  21.17397427558899,
  22.027415990829468,
  27.881200313568115)}