# Embedding Lesson

## Imports and configuration

In [2]:
%matplotlib inline

from utils import *

# Dataset root (relative to the notebook) and the models folder beneath it.
path = "../../data/ml-latest-small/"
model_path = path + 'models/'
from keras.layers import dot

# Create the model directory, including any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race of a separate exists()-then-mkdir() check.
os.makedirs(model_path, exist_ok=True)

# Batch size setting.
batch_size = 64

## Reading Movies Data and Preprocessing

In [3]:
# Read the ratings table and preview its first rows.
ratings = pd.read_csv(path + 'ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Distinct rating values present in the data.
ratings['rating'].unique()

array([2.5, 3. , 2. , 4. , 3.5, 1. , 5. , 4.5, 1.5, 0.5])

In [5]:
# Build a {movieId: title} lookup from the movies table.
movie_names = (
    pd.read_csv(path + 'movies.csv')
    .set_index('movieId')['title']
    .to_dict()
)

In [7]:
# Distinct raw user and movie ids found in the ratings table.
users = ratings['userId'].unique()
movies = ratings['movieId'].unique()

In [8]:
# Map every raw id to its 0-based position in `users` / `movies`.
user2idx = dict(zip(users, range(len(users))))
movieid2idx = dict(zip(movies, range(len(movies))))

In [9]:
# Replace the raw ids with their contiguous indices.
# NOTE: this overwrites the columns it reads from, so run it only once per
# load of `ratings`; a second run would look up already-encoded indices as if
# they were raw ids.
ratings['movieId'] = ratings['movieId'].apply(lambda raw_id: movieid2idx[raw_id])
ratings['userId'] = ratings['userId'].apply(lambda raw_id: user2idx[raw_id])

In [10]:
# Preview the frame again: userId / movieId now hold the contiguous indices.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,2.5,1260759144
1,0,1,3.0,1260759179
2,0,2,3.0,1260759182
3,0,3,2.0,1260759185
4,0,4,4.0,1260759205


In [11]:
# Smallest and largest encoded index for users and for movies.
user_min, user_max = ratings.userId.min(), ratings.userId.max()
movie_min, movie_max = ratings.movieId.min(), ratings.movieId.max()
user_min, user_max, movie_min, movie_max

(0, 670, 0, 9065)

In [12]:
# Number of distinct users and movies; these are used as the embedding table
# sizes further down.
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()

n_users, n_movies

(671, 9066)

In [13]:
# Number of latent factors (the width of each embedding vector below).
n_factors = 50

# Seed NumPy's global RNG so the random train/validation mask is reproducible.
# This has to be a call: assigning `np.random.seed = 42` would replace the
# seeding function with an int and seed nothing.
np.random.seed(42)

In [14]:
# Random row split: rows whose uniform draw falls below 0.8 go to training,
# the remaining rows to validation.
msk = np.random.rand(len(ratings)) < 0.8
trn, val = ratings[msk], ratings[~msk]

In [15]:
# NOTE: DataFrame.size is the total element count (rows x columns), not the
# number of rows; use len(val) or val.shape[0] for the row count.
val.size

79700

## Analyzing the top 15 users and movies

In [None]:
# Count ratings per user, then keep the 15 users with the most ratings.
g = ratings.groupby('userId')['rating'].count()
topUsers = g.sort_values(ascending=False)[:15]

In [None]:
# Count ratings per movie, then keep the 15 movies with the most ratings.
g = ratings.groupby('movieId')['rating'].count()
topMovies = g.sort_values(ascending=False)[:15]

In [None]:
# Inner join on userId keeps only ratings made by the top users; the joined
# count column gets the '_r' suffix because `rating` already exists.
top_r = ratings.join(
    topUsers,
    on='userId',
    how='inner',
    rsuffix='_r',
)

In [None]:
# Narrow further to the top movies with a second inner join, on movieId.
top_r = top_r.join(
    topMovies,
    on='movieId',
    how='inner',
    rsuffix='_r',
)

In [None]:
# User x movie table of ratings for the top users and top movies.
pd.crosstab(
    index=top_r.userId,
    columns=top_r.movieId,
    values=top_r.rating,
    aggfunc=np.sum,
)

## Dot Product

In [None]:
# User branch: a single integer index looked up in an (n_users x n_factors)
# embedding table with L2 regularisation on the embedding weights.
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(
    n_users,
    n_factors,
    input_length=1,
    embeddings_regularizer=l2(1e-4),
)(user_in)

# Movie branch: same construction over the movie indices.
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(
    n_movies,
    n_factors,
    input_length=1,
    embeddings_regularizer=l2(1e-4),
)(movie_in)

In [None]:
# u and m are (batch, 1, n_factors). Contract over the factor axis (axis 2) to
# get one score per (user, movie) pair; axes=1 would contract the length-1
# axis instead and produce an n_factors x n_factors matrix per sample.
x = dot([u, m], axes=2)

In [None]:
# Collapse the non-batch dimensions of the dot output into one vector per sample.
x=Flatten()(x)

In [None]:
# Two-input model: (user index, movie index) -> flattened dot-product output.
model = Model(inputs=[user_in, movie_in], outputs=x)

## Neural network

In [21]:
from keras.layers import add,concatenate,Input

In [16]:
# User branch: integer index -> n_factors-wide embedding (L2-regularised).
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(
    n_users,
    n_factors,
    input_length=1,
    embeddings_regularizer=l2(1e-4),
)(user_in)

In [17]:
# Movie branch: integer index -> n_factors-wide embedding (L2-regularised).
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(
    n_movies,
    n_factors,
    input_length=1,
    embeddings_regularizer=l2(1e-4),
)(movie_in)

In [18]:
# Inspect the user embedding tensor (shape / dtype).
u

<tf.Tensor 'embedding_1/embedding_lookup/Identity:0' shape=(?, 1, 50) dtype=float32>

In [19]:
# Inspect the movie embedding tensor (shape / dtype).
m

<tf.Tensor 'embedding_2/embedding_lookup/Identity:0' shape=(?, 1, 50) dtype=float32>

In [22]:
# Join the user and movie embeddings along the last (feature) axis.
x = concatenate([u, m], axis=-1)

In [23]:
# Inspect the concatenated tensor.
x

<tf.Tensor 'concatenate_1/concat:0' shape=(?, 1, 100) dtype=float32>

In [24]:
# Drop the length-1 middle axis so each sample is a flat feature vector.
x=Flatten()(x)

In [25]:
# Dropout on the concatenated embeddings.
x = Dropout(rate=0.3)(x)

In [26]:
# Hidden layer: 70 ReLU units.
x = Dense(units=70, activation='relu')(x)

In [27]:
# Dropout on the hidden layer's activations.
x = Dropout(rate=0.75)(x)

In [28]:
# Output layer: a single linear unit (no activation) producing the rating.
x = Dense(units=1)(x)

In [29]:
# Two-input model: (user index, movie index) -> predicted rating.
model = Model(inputs=[user_in, movie_in], outputs=x)

In [30]:
# Train with mean squared error using Adam at learning rate 0.001.
model.compile(
    loss='mse',
    optimizer=Adam(lr=0.001),
)

In [31]:
# Fit on the training split and report the loss on the validation split after
# each epoch. The batch size comes from the `batch_size` constant set in the
# configuration cell rather than a repeated literal.
model.fit(
    [trn.userId, trn.movieId],
    trn.rating,
    batch_size=batch_size,
    epochs=8,
    validation_data=([val.userId, val.movieId], val.rating),
)

Train on 80079 samples, validate on 19925 samples
Epoch 1/8
Epoch 2/8
15808/80079 [====>.........................] - ETA: 6s - loss: 1.5385

KeyboardInterrupt: 

In [65]:
# Sanity check: the model inputs are passed as a plain Python list of the two id columns.
type([trn.userId,trn.movieId])

list

In [83]:
# Predict the rating for one (user, movie) pair. The values are the contiguous
# indices the model was trained on, not the raw ids from the CSV files.
model.predict([
    np.array([1]),  # user index
    np.array([2]),  # movie index
])

array([[3.4216]], dtype=float32)

In [33]:
# Shape of the training target vector (one rating per training row).
trn['rating'].shape

(80079,)