In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import warnings
import tensorflow as tf
import keras
from keras.layers import Input, Embedding, Flatten, Dot, Dense
from keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from keras.regularizers import l2
#import keras.utils; reload(utils)
#from keras.utils import *
#from __future__ import division, print_function


warnings.filterwarnings("ignore")

In [2]:
t_start = time.clock()
df = pd.read_csv('../input/edsa-recommender-system-predict/train.csv')
test = pd.read_csv('../input/edsa-recommender-system-predict/test.csv')
t_end = time.clock()

print("Start Time: {}".format(t_start))
print("End Time: {}".format(t_end))

Start Time: 6.463781
End Time: 12.806703


In [3]:
print("train")
display(df.head(2))
print("test")
display(test.head(2))

train


Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739


test


Unnamed: 0,userId,movieId
0,1,2011
1,1,4144


In [4]:
#
num_users = df.userId.unique()
num_movies = df.movieId.unique()

In [5]:
userid2idx = {o:i for i,o in enumerate(num_users)}
movieid2idx = {o:i for i,o in enumerate(num_movies)}

In [6]:
df["userId"] = df["userId"].apply(lambda x: userid2idx[x])
df["movieId"] = df["movieId"].apply(lambda x: movieid2idx[x])

In [7]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,1518349992
1,1,1,4.5,1206238739
2,2,2,5.0,1076215539
3,3,3,2.0,1423042565
4,4,4,3.0,833375837


In [8]:
#min max 
user_min, user_max, movie_min, movie_max = (df.userId.min(),df.userId.max(), df.movieId.min(), df.movieId.max())
user_min, user_max, movie_min, movie_max

(0, 162540, 0, 48212)

**Data Encoding**

In [9]:
# 
np.random.seed(42)
rands = np.random.rand(len(df))
msk = rands < 0.8
train = df[msk].copy()
val = df[~msk].copy()

In [10]:
g = df.groupby("userId")["rating"].count()
topusers = g.sort_values(ascending=False)[:15]

In [11]:
g = df.groupby("movieId")["rating"].count()
topmovies = g.sort_values(ascending=False)[:15]

In [12]:
top_r = df.join(topusers, rsuffix='_r', how='inner', on='userId')
top_r = top_r.join(topmovies, rsuffix='_r', how='inner', on='movieId')
pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc = np.sum)

movieId,28,59,66,70,81,99,118,180,188,237,273,397,437,615,778
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
516,4.5,4.5,,,5.0,,5.0,,4.0,4.0,,5.0,,,5.0
949,,,5.0,5.0,5.0,,5.0,,5.0,5.0,5.0,,,5.0,5.0
1114,,5.0,,5.0,5.0,5.0,5.0,,,1.0,,,,,
1129,,4.5,4.0,4.0,,4.0,,,,,,,,4.0,4.5
1164,,,5.0,4.0,5.0,,,2.0,1.0,,2.5,,,,
1329,,,4.0,,4.5,,,,3.0,,3.0,,4.0,,
1465,,,4.0,,,,,3.5,3.5,3.5,,,4.0,,
1573,,5.0,,,,,,3.5,,4.0,,,,5.0,
1641,,5.0,,5.0,5.0,,,2.0,,,,,4.0,,5.0
2596,,,,,4.0,,3.0,4.0,,,,,4.0,4.5,


# keras trial

In [13]:
user_ids = df["userId"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
movie_ids = df["movieId"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
#df["user"] = df["userId"].map(user2user_encoded)
#df["movie"] = df["movieId"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
df["rating"] = df["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(df["rating"])
max_rating = max(df["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 162541, Number of Movies: 48213, Min rating: 0.5, Max rating: 5.0


In [14]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,1518349992
1,1,1,4.5,1206238739
2,2,2,5.0,1076215539
3,3,3,2.0,1423042565
4,4,4,3.0,833375837


Prepare training and validation data

In [17]:
df = df.sample(frac=1, random_state=42)
x = df[["userId", "movieId"]].values
# Normalize the targets between 0 and 1. Makes it easy to train.
y = df["rating"]
#.apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
train_indices = int(0.8 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)

In [45]:
#source code from keras
EMBEDDING_SIZE = 50


class RecommenderNet(keras.Model):
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            #embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.movie_embedding = layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.movie_bias = layers.Embedding(num_movies, 1)

    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])
        dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
        # Add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)


model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(lr=0.001)
)

In [46]:
from keras.layers import Dense, Dropout

In [47]:
history = model.fit(
    x=x_train[:20000],
    y=y_train[:20000],
    batch_size=50,
    epochs=1,
    verbose=1,
    validation_data=(x_val, y_val),
)



In [None]:
ratings_pred = model.predict(x_val)

In [None]:
ratings_pred[:10]

# Now I have a problem with denormalising the values

In [22]:
r_mean = df['rating'].mean()
r_std = df['rating'].std()

In [23]:
tes = list(ratings_pred)
denorm1 = [((i * r_std) + r_mean) for i in tes]

In [24]:
max(denorm1)

array([4.1282663], dtype=float32)

In [25]:
denorm2 =[(i*(max_rating - min_rating)) for i in tes]

In [28]:
min(denorm2)

array([2.072858], dtype=float32)