# Embedding Lesson

## Importing stuff

In [1]:
%matplotlib inline

from utils import *

# Root directory of the dataset; the trailing slash is kept because file names
# are appended to it by plain string concatenation.
path = "../data/ml-latest-small/"
# Sub-directory for model artefacts (created below if it does not exist).
model_path = path + 'models/'
from keras.layers import dot

# Create the models directory if needed. makedirs(exist_ok=True) replaces the
# exists()+mkdir() pair: it has no check-then-create race and also creates any
# missing parent directories.
os.makedirs(model_path, exist_ok=True)
# Mini-batch size for training.
batch_size = 64

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Reading Movies Data and PreProcessing

In [2]:
# Load the rating records and preview the first rows.
ratings_csv = path + 'ratings.csv'
ratings = pd.read_csv(ratings_csv)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
# Distinct rating values present in the data.
ratings['rating'].unique()

array([2.5, 3. , 2. , 4. , 3.5, 1. , 5. , 4.5, 1.5, 0.5])

In [4]:
# Build a {movieId: title} lookup from the movies file.
movies_df = pd.read_csv(path + 'movies.csv')
movie_names = movies_df.set_index('movieId')['title'].to_dict()

In [5]:
# Distinct raw user and movie ids found in the ratings table.
users = ratings['userId'].unique()
movies = ratings['movieId'].unique()

In [6]:
# Map every raw id to its position in users / movies, giving contiguous
# zero-based indices.
user2idx = dict(zip(users, range(len(users))))
movieid2idx = dict(zip(movies, range(len(movies))))

In [7]:
# Replace raw ids with their contiguous indices (a missing id raises KeyError).
# The columns are overwritten in place, so re-running this cell would try to
# re-map values that are already indices.
ratings['movieId'] = ratings['movieId'].map(movieid2idx.__getitem__)
ratings['userId'] = ratings['userId'].map(user2idx.__getitem__)

In [8]:
# Preview the table after the id columns were replaced by contiguous indices.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,2.5,1260759144
1,0,1,3.0,1260759179
2,0,2,3.0,1260759182
3,0,3,2.0,1260759185
4,0,4,4.0,1260759205


In [9]:
# Range of the user and movie index columns.
user_min, user_max = ratings.userId.min(), ratings.userId.max()
movie_min, movie_max = ratings.movieId.min(), ratings.movieId.max()
(user_min, user_max, movie_min, movie_max)

(0, 670, 0, 9065)

In [10]:
# Number of distinct users and movies; used as the embedding table sizes.
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()

(n_users, n_movies)

(671, 9066)

In [11]:
# Number of latent factors, i.e. the width of each embedding vector.
n_factors = 50

# Seed NumPy's global RNG so the random train/validation split is repeatable.
# The function has to be *called*: assigning to np.random.seed replaces it with
# an int and leaves the generator unseeded.
np.random.seed(42)

In [12]:
# Random row split: roughly 80% of rows (mask True) go to training, the rest
# to validation.
msk = np.random.rand(ratings.shape[0]) < 0.8
trn = ratings.loc[msk]
val = ratings.loc[~msk]

In [13]:
# NOTE(review): DataFrame.size counts cells (rows x columns), not rows; use
# len(val) to get the number of validation samples.
val.size

80512

## Analyzing the top 15 users and movies

In [None]:
# Count ratings per user and keep the 15 users with the most ratings.
g = ratings.groupby('userId')['rating'].count()
topUsers = g.sort_values(ascending=False).head(15)

In [None]:
# Count ratings per movie and keep the 15 movies with the most ratings.
g = ratings.groupby('movieId')['rating'].count()
topMovies = g.sort_values(ascending=False).head(15)

In [None]:
# Inner join keeps only ratings made by the top users; the joined per-user
# count column gets the '_r' suffix.
top_r = ratings.join(topUsers, on='userId', how='inner', rsuffix='_r')

In [None]:
# Narrow further to ratings of the top movies.
top_r = top_r.join(topMovies, on='movieId', how='inner', rsuffix='_r')

In [None]:
# User x movie grid of ratings for the top users and top movies.
pd.crosstab(index=top_r.userId, columns=top_r.movieId,
            values=top_r.rating, aggfunc=np.sum)

## Dot Product

In [None]:
# User branch: one integer index in, one n_factors-wide embedding out, with an
# L2 penalty on the embedding weights.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(input_dim=n_users, output_dim=n_factors, input_length=1,
              embeddings_regularizer=l2(1e-4))(user_in)

# Movie branch, built the same way.
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(input_dim=n_movies, output_dim=n_factors, input_length=1,
              embeddings_regularizer=l2(1e-4))(movie_in)

In [None]:
# Dot product of the user and movie embeddings. Both tensors have shape
# (batch, 1, n_factors), so the contraction has to run over the factor axis
# (axis 2). axes=1 would contract the length-1 axis instead and produce an
# n_factors x n_factors outer product rather than a single score per pair.
x = dot([u, m], axes=2)

In [None]:
# Flatten everything except the batch dimension.
x = Flatten()(x)

In [None]:
# Two-input model: (user index, movie index) -> flattened dot-product output.
model = Model(inputs=[user_in, movie_in], outputs=x)

## Neural Network

In [37]:
from keras.layers import add,concatenate,Input

In [45]:
# User branch: integer index -> n_factors-wide embedding with an L2 penalty.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(input_dim=n_users, output_dim=n_factors, input_length=1,
              embeddings_regularizer=l2(1e-4))(user_in)

In [46]:
# Movie branch: integer index -> n_factors-wide embedding with an L2 penalty.
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(input_dim=n_movies, output_dim=n_factors, input_length=1,
              embeddings_regularizer=l2(1e-4))(movie_in)

In [47]:
# Inspect the user-embedding output tensor.
u

<tf.Tensor 'embedding_5/Gather:0' shape=(?, 1, 50) dtype=float32>

In [48]:
# Inspect the movie-embedding output tensor.
m

<tf.Tensor 'embedding_6/Gather:0' shape=(?, 1, 50) dtype=float32>

In [49]:
# Join the user and movie embeddings along the last axis.
x = concatenate([u, m], axis=-1)

In [50]:
# Inspect the concatenated tensor.
x

<tf.Tensor 'concatenate_4/concat:0' shape=(?, 1, 100) dtype=float32>

In [51]:
# Drop the length-1 middle axis so the dense layers receive a flat vector.
x = Flatten()(x)

In [53]:
# Dropout on the concatenated embeddings.
x = Dropout(rate=0.3)(x)

In [55]:
# Hidden layer: 70 ReLU units.
x = Dense(units=70, activation='relu')(x)

In [56]:
# Heavy dropout after the hidden layer.
x = Dropout(rate=0.75)(x)

In [57]:
# Single linear output unit: the predicted rating.
x = Dense(units=1)(x)

In [58]:
# Two-input model: (user index, movie index) -> predicted rating.
model = Model(inputs=[user_in, movie_in], outputs=x)

In [59]:
# Mean-squared-error regression on the rating, optimised with Adam.
model.compile(loss='mse', optimizer=Adam(lr=0.001))

In [84]:
# Train on (user index, movie index) -> rating and validate on the held-out
# split. The notebook-level batch_size is used instead of repeating the
# literal 64, so the setting lives in one place.
model.fit([trn.userId, trn.movieId], trn.rating,
          batch_size=batch_size, epochs=8,
          validation_data=([val.userId, val.movieId], val.rating))

Train on 79876 samples, validate on 20128 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x1271c29b0>

In [65]:
# Sanity check: the model inputs are passed as a plain Python list.
type([trn.userId, trn.movieId])

list

In [83]:
# Predicted rating for user index 1 and movie index 2 (remapped indices, not
# the raw ids from the CSV).
model.predict([np.array([1]), np.array([2])])

array([[3.4216]], dtype=float32)