In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
# Mount Google Drive (Colab only) so the CSV files under /content/drive can be read.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the three competition files (ratings to fit on, joke texts, rows to
# predict) in one pass; the tuple order matches the unpacking order.
train, jokes, test = map(
    pd.read_csv,
    (
        '/content/drive/My Drive/Jokes_Rating/train.csv',
        '/content/drive/My Drive/Jokes_Rating/jokes.csv',
        '/content/drive/My Drive/Jokes_Rating/test.csv',
    ),
)

In [None]:
# Preview the first rows of the ratings table.
train.head()

Unnamed: 0,id,user_id,joke_id,Rating
0,31030_110,31030,110,2.75
1,16144_109,16144,109,5.094
2,23098_6,23098,6,-6.438
3,14273_86,14273,86,4.406
4,18419_134,18419,134,9.375


In [None]:
# Preview the first rows of the joke_id -> joke_text lookup table.
jokes.head()

Unnamed: 0,joke_id,joke_text
0,1,Q. What's O. J. Simpson's web address? A. Slas...
1,2,How many feminists does it take to screw in a ...
2,3,Q. Did you hear about the dyslexic devil worsh...
3,4,They asked the Japanese visitor if they have e...
4,5,Q: What did the blind person say when given so...


In [None]:
# Attach the text of each rated joke by looking it up in `jokes` on joke_id.
# - The join key is spelled out instead of relying on "all shared columns".
# - validate='many_to_one' makes pandas raise if `jokes` ever contains a
#   duplicated joke_id, which would otherwise add rows and silently shift
#   every text that follows.
# - A left merge keeps the left row order, so the result is assigned
#   positionally and does not depend on `train` having a default 0..n-1 index.
train['joke_text'] = (
    train[['joke_id']]
    .merge(
        jokes[['joke_id', 'joke_text']],
        on='joke_id',
        how='left',
        validate='many_to_one',
    )['joke_text']
    .to_numpy()
)

In [None]:
# Confirm that the joke_text column was attached to the ratings.
train.head()

Unnamed: 0,id,user_id,joke_id,Rating,joke_text
0,31030_110,31030,110,2.75,"Judy was having trouble with her computer, so ..."
1,16144_109,16144,109,5.094,One day the first grade teacher was reading th...
2,23098_6,23098,6,-6.438,Q. What is orange and sounds like a parrot? A....
3,14273_86,14273,86,4.406,Two attorneys went into a diner and ordered tw...
4,18419_134,18419,134,9.375,A man is driving in the country one evening wh...


# Collaborative Filtering using Keras
This is a simple implementation of low-rank matrix factorization.

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import copy

# Preprocessing

In [None]:
# Work on an independent copy so the encoded columns added below never leak
# back into `train`.
df = train.copy()

# Map raw ids to contiguous 0-based indices (the row numbers of the embedding
# tables) and keep the inverse mappings so indices can be decoded again.
user_ids = df["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
jokes_ids = df["joke_id"].unique().tolist()
jokes2jokes_encoded = {x: i for i, x in enumerate(jokes_ids)}
joke_encoded2joke = {i: x for i, x in enumerate(jokes_ids)}
df["user"] = df["user_id"].map(user2user_encoded)
df["joke"] = df["joke_id"].map(jokes2jokes_encoded)

num_users = len(user2user_encoded)
num_jokes = len(joke_encoded2joke)
df["Rating"] = df["Rating"].astype(np.float32)
# min and max ratings will be used to normalize the ratings later.
# Series.min()/max() are vectorized and skip NaN, unlike the Python builtins,
# whose result on NaN-containing data depends on element order.
min_rating = df["Rating"].min()
max_rating = df["Rating"].max()

# NOTE(review): the label below says "Movies" but the value is the number of
# jokes; the text is left untouched so the recorded cell output stays valid.
print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_jokes, min_rating, max_rating
    )
)

Number of users: 40863, Number of Movies: 139, Min rating: -10.0, Max rating: 10.0


## Prepare for training

In [None]:
# Shuffle once with a fixed seed so any train/validation split is reproducible.
df = df.sample(frac=1, random_state=42)
x = df[["user", "joke"]].values
# Normalize the targets between 0 and 1. Makes it easy to train.
# Vectorized on the whole column (no per-row lambda, no shadowing of `x`).
y = ((df["Rating"] - min_rating) / (max_rating - min_rating)).values
# Number of leading rows used for training. The factor 1 means "no validation";
# lower it (e.g. 0.9) to hold out the remaining rows as a validation set.
train_indices = int(1 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

# Model

In [None]:
EMBEDDING_SIZE = 50  # Default number of latent factors learned per user and per joke.


class RecommenderNet(keras.Model):
    """Matrix-factorization recommender with per-user and per-joke biases.

    The prediction for one (user, joke) pair is
    ``sigmoid(dot(user_vector, joke_vector) + user_bias + joke_bias)``,
    i.e. a value in (0, 1) that is meant to be compared against ratings
    normalized to the same range.

    Args:
        num_users: Number of rows of the user embedding and user bias tables.
        num_jokes: Number of rows of the joke embedding and joke bias tables.
        embedding_size: Length of each user / joke latent vector.
        **kwargs: Forwarded unchanged to ``keras.Model``.
    """

    def __init__(self, num_users, num_jokes, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_jokes = num_jokes
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.joke_embedding = layers.Embedding(
            num_jokes,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        # Per-joke bias. The attribute keeps its historical name `movie_bias`
        # so existing references to it keep working.
        self.movie_bias = layers.Embedding(num_jokes, 1)

    def call(self, inputs):
        """Score a batch of (user, joke) index pairs.

        Args:
            inputs: 2-D integer tensor; column 0 holds encoded user indices
                and column 1 holds encoded joke indices.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in (0, 1).
        """
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        joke_vector = self.joke_embedding(inputs[:, 1])
        joke_bias = self.movie_bias(inputs[:, 1])
        # Per-sample dot product, shape (batch, 1).
        # tf.tensordot(user_vector, joke_vector, 2) would contract BOTH the
        # batch and the embedding axes and return one scalar for the whole
        # batch, so every sample would share the same interaction term and the
        # embeddings could not model individual user/joke affinities.
        dot_user_joke = tf.reduce_sum(
            user_vector * joke_vector, axis=1, keepdims=True
        )
        # Add all the components (including bias)
        x = dot_user_joke + user_bias + joke_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)

In [None]:
# Baseline run: binary cross-entropy on the targets scaled to [0, 1]
# (treated as soft labels), Adam with the default-sized step.
model = RecommenderNet(num_users, num_jokes, EMBEDDING_SIZE)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
)
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Encode the test ids with the mappings learned from the training data.
test["user"] = test["user_id"].map(user2user_encoded)
test["joke"] = test["joke_id"].map(jokes2jokes_encoded)

# Ids that never appeared in training map to NaN; feeding those to the
# embedding lookup is invalid, so fail early with an explicit message.
unseen_rows = test[["user", "joke"]].isna().any(axis=1)
if unseen_rows.any():
    raise ValueError(
        "{} test rows reference a user_id or joke_id that is absent from the "
        "training data and cannot be scored by the embedding model".format(
            int(unseen_rows.sum())
        )
    )

# Predictions are still on the normalized 0..1 scale at this point.
ratings = model.predict(test[["user", "joke"]].values).flatten()
test['Rating'] = ratings

In [None]:
# Map the normalized predictions back to the original rating range.
# Computed from `ratings` (the raw model output) rather than from
# test['Rating'], so re-running this cell does not rescale the column twice.
test['Rating'] = ratings * (max_rating - min_rating) + min_rating  # Unscale
test.head()

Unnamed: 0,id,user_id,joke_id,user,joke,Rating
0,6194_11,6194,11,10718,76,1.480949
1,19356_3,19356,3,16900,32,-2.180818
2,23426_79,23426,79,26083,62,3.986017
3,40030_3,40030,3,21033,32,-4.677805
4,19806_115,19806,115,8108,28,5.69249


In [None]:
# Write the submission file: only the id and the predicted rating, no index column.
test.to_csv(
    '/content/drive/My Drive/Jokes_Rating/sub_keras.csv',
    columns=['id', 'Rating'],
    index=False,
)

Well we got a score of 4.270991 (rank = 37 / 250). The best score is 3.99280

In [None]:
## Experiment: MSE loss, more epochs, a bigger batch size and a smaller learning rate
model_ = RecommenderNet(num_users, num_jokes, EMBEDDING_SIZE)
model_.compile(
    loss='mse',
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
)
history = model_.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=25,
    verbose=1,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
# Score the test pairs with the MSE-trained model (outputs are on the 0..1 scale).
ratings = model_.predict(test[["user", "joke"]].to_numpy()).reshape(-1)
# Map back to the original rating range in a single step.
test['Rating'] = ratings * (max_rating - min_rating) + min_rating  # Unscale
# Save id + predicted rating as the second submission.
test.to_csv(
    '/content/drive/My Drive/Jokes_Rating/sub_keras_v1.csv',
    columns=['id', 'Rating'],
    index=False,
)

In [None]:
# Inspect the unscaled predictions that were just written to disk.
test.head()

Unnamed: 0,id,user_id,joke_id,user,joke,Rating
0,6194_11,6194,11,10718,76,1.403801
1,19356_3,19356,3,16900,32,-2.701854
2,23426_79,23426,79,26083,62,2.931864
3,40030_3,40030,3,21033,32,-3.341493
4,19806_115,19806,115,8108,28,4.677395


We got a score of 4.3175, which is worse (a higher error) than before; maybe the model 'overfit' and tried to capture something too complex.

I can play with the embedding size and see if I can improve the result.

In [None]:
## Experiment: larger embedding size (100), MSE loss, small learning rate
model_100 = RecommenderNet(num_users, num_jokes, 100)
model_100.compile(
    loss='mse',
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
)
history = model_100.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=6,
    verbose=1,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# Score the test pairs with the 100-dimensional model (outputs are on the 0..1 scale).
ratings = model_100.predict(test[["user", "joke"]].to_numpy()).reshape(-1)
# Map back to the original rating range in a single step.
test['Rating'] = ratings * (max_rating - min_rating) + min_rating  # Unscale
# Save id + predicted rating as the third submission.
test.to_csv(
    '/content/drive/My Drive/Jokes_Rating/sub_keras_v2.csv',
    columns=['id', 'Rating'],
    index=False,
)

Worst score so far (highest error): 4.62 :/

I thought that capturing more features for each user and each joke would help decrease the error...

@TODO:

Look at the last technique : https://www.kaggle.com/rajmehra03/cf-based-recsys-by-low-rank-matrix-factorization

Look at this implementation : https://github.com/tonytonev/JokeRecommender

# Collaborative Filtering using Bert and HumourDistilbert

In [None]:
!pip install transformers

## DistilBERT Base Uncased

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer

In [None]:
# Inspect the encoded training frame (it carries the extra `user` / `joke` index columns).
# NOTE(review): `df` is shuffled in the "Prepare for training" cell, so on a fresh
# top-to-bottom run these rows will differ from the unshuffled rows 0..4 recorded below.
df.head()

Unnamed: 0,id,user_id,joke_id,Rating,joke_text,user,joke
0,31030_110,31030,110,2.75,"Judy was having trouble with her computer, so ...",0,0
1,16144_109,16144,109,5.094,One day the first grade teacher was reading th...,1,1
2,23098_6,23098,6,-6.438,Q. What is orange and sounds like a parrot? A....,2,2
3,14273_86,14273,86,4.406,Two attorneys went into a diner and ordered tw...,3,3
4,18419_134,18419,134,9.375,A man is driving in the country one evening wh...,4,4


In [None]:
# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
# One entry per distinct joke text, in order of first appearance in `df`.
# NOTE(review): this rebinds `jokes` from the jokes DataFrame to a plain list
# of strings; the name is kept because the embedding loop below relies on it.
jokes = df.joke_text.unique().tolist()
# Pad / truncate every joke to exactly 250 tokens so all encodings share one
# length. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True` flag.
tokens_jokes = tokenizer(
    jokes,
    max_length=250,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…






In [None]:
# Load the pretrained DistilBERT encoder from the same checkpoint name as the tokenizer.
# NOTE(review): this rebinds `model`, which earlier in the notebook referred to the
# Keras RecommenderNet; cells run after this one can no longer reach that model by name.
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
import torch

# Use the GPU when one is present; fall back to CPU instead of failing on .cuda().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: make sure dropout is disabled

# One model output per joke, in the same order as `jokes`.
embedding = []
# Without no_grad() every stored output keeps its whole autograd graph (all
# intermediate activations) alive, which quickly exhausts device memory.
with torch.no_grad():
    for i in range(len(jokes)):
        # Wrap in a list to add the batch dimension: shape (1, sequence_length).
        input_ids = torch.tensor([tokens_jokes['input_ids'][i]], device=device)
        attention_mask = torch.tensor([tokens_jokes['attention_mask'][i]], device=device)
        a = model(input_ids, attention_mask)
        embedding.append(a)

In [None]:
# Sanity check: shape of the last hidden state for the first joke.
# The inputs are created on the model's own device; plain CPU tensors would
# raise a device-mismatch error once the model has been moved to the GPU.
with torch.no_grad():
    first_hidden_state = model(
        torch.tensor([tokens_jokes['input_ids'][0]], device=model.device),
        torch.tensor([tokens_jokes['attention_mask'][0]], device=model.device),
    )[0]
first_hidden_state.shape

torch.Size([1, 250, 768])