In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
import difflib
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import mean_squared_error,mean_absolute_error

In [2]:
Dataset_path = "./DataSets/"
Movies_metadata = pd.read_csv(Dataset_path+"MoviesInfo.csv")
ratings = pd.read_csv(Dataset_path+"ml-latest/ratings.csv")

### just keep rating year between 1995, 2001

In [3]:
from datetime import datetime

# strftime('%Y-%m-%d %H:%M:%S')
ratings['year_rated'] = ratings['timestamp'].apply(lambda x: int( datetime.fromtimestamp( x ).strftime('%Y') ) )

ratings = ratings[ ratings['year_rated'].between(1995, 2001)].copy()

ratings.reset_index(drop=True, inplace=True)

In [4]:
print( f"number of Ratings : { ratings.shape[0] }")
print( f"number of movies : { ratings.groupby('movieId').count().shape[0] }")
print( f"number of users : { ratings.groupby('userId').count().shape[0] }")
print( f"range of rating : ( { np.min( ratings.groupby('rating').count().index )}, {np.max( ratings.groupby('rating').count().index )})  ")

number of Ratings : 7329482
number of movies : 4937
number of users : 103827
range of rating : ( 1.0, 5.0)  


In [5]:
MyCol = ['movieId','year','duration','metascore','reviews_from_users','reviews_from_critics'
         ,'country','language','director','production_company','mean_vote']
FinalDataset = pd.merge(ratings, Movies_metadata[ MyCol ], how='inner')

In [6]:
FinalDataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7206877 entries, 0 to 7206876
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   userId                int64  
 1   movieId               int64  
 2   rating                float64
 3   timestamp             int64  
 4   year_rated            int64  
 5   year                  int64  
 6   duration              int64  
 7   metascore             float64
 8   reviews_from_users    float64
 9   reviews_from_critics  float64
 10  country               object 
 11  language              object 
 12  director              object 
 13  production_company    object 
 14  mean_vote             float64
dtypes: float64(5), int64(6), object(4)
memory usage: 879.7+ MB


In [7]:
myobj = ['country','language','director','production_company']
obj_FinalDataset= FinalDataset.select_dtypes(include=['object']).copy()

for i in myobj:
    obj_FinalDataset[i] = obj_FinalDataset[i].astype('category')
    FinalDataset[i] = obj_FinalDataset[i].cat.codes
    
FinalDataset = FinalDataset.fillna(100000)
FinalDataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7206877 entries, 0 to 7206876
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   userId                int64  
 1   movieId               int64  
 2   rating                float64
 3   timestamp             int64  
 4   year_rated            int64  
 5   year                  int64  
 6   duration              int64  
 7   metascore             float64
 8   reviews_from_users    float64
 9   reviews_from_critics  float64
 10  country               int16  
 11  language              int16  
 12  director              int16  
 13  production_company    int16  
 14  mean_vote             float64
dtypes: float64(5), int16(4), int64(6)
memory usage: 714.8 MB


In [8]:
data =  FinalDataset[['userId' ,'movieId','year','duration','metascore','reviews_from_users','reviews_from_critics'
                              ,'country','language','director','production_company','mean_vote','rating']]

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled = scaler.fit_transform(data.drop(['movieId','userId','rating'],axis=1))
data_norm = pd.DataFrame(scaled,columns=(data.drop(['movieId','userId','rating'],axis=1).columns))
data_norm= data_norm.join(data['userId'])
data_norm= data_norm.join(data['movieId'])
data_norm= data_norm.join(data['rating'])

                               
from scipy import stats
z_scores = stats.zscore(data_norm.drop(['movieId','userId'],axis=1))
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
newdata = data_norm[filtered_entries]

newdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6529017 entries, 0 to 7206874
Data columns (total 13 columns):
 #   Column                Dtype  
---  ------                -----  
 0   year                  float64
 1   duration              float64
 2   metascore             float64
 3   reviews_from_users    float64
 4   reviews_from_critics  float64
 5   country               float64
 6   language              float64
 7   director              float64
 8   production_company    float64
 9   mean_vote             float64
 10  userId                int64  
 11  movieId               int64  
 12  rating                float64
dtypes: float64(11), int64(2)
memory usage: 697.4 MB


In [13]:
#from sklearn.decomposition import PCA
#pca01 = PCA(n_components = 5)
#principlecomp_train = pca01.fit_transform(newdata.drop(['movieId','userId','rating'],axis=1))
#pca01.explained_variance_ratio_

array([0.22314369, 0.15953152, 0.1481663 , 0.12817592, 0.11141828])

### Keras

In [10]:
model01 = Sequential()
model01.add(Dense(300, activation='relu', input_dim=10))
model01.add(Dense(150, activation='relu'))
model01.add(Dense(30, activation='relu'))
model01.add(Dense(1, activation='linear'))

# Compile the model
model01.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mean_squared_error'])

### Fit Model

In [11]:
def fit_model(model,data):
    target = data['rating'] 
    predictors = data.drop(['userId','movieId','rating'],axis=1)
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.30, random_state=40)
    model.fit(X_train, y_train, epochs=20, batch_size=10)
    y_pred= model.predict(X_test)
    print('MSE:',mean_squared_error(y_test, y_pred))
    print('MAE:',mean_absolute_error(y_test, y_pred))
    print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred)))

In [12]:
fit_model(model01,newdata[newdata['userId'] == 19])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE: 0.6023024297543214
MAE: 0.5516458650430044
RMSE: 0.7760814581951571


In [13]:
#newdata[newdata['userId'] == 19]

In [14]:
def get_Movie_info(Movie_id):
    
    """
    Returns some basic information about a Movie given the Movie id and the metadata dataframe.
    """
    
    Movie_info = Movies_metadata[Movies_metadata['movieId'] == int(Movie_id)][['movieId', 'genres', 
                                                            'title', 'original_title', 'year', 'duration','mean_vote']]
    return Movie_info


### Test Generate_Recommendation

In [15]:
def test_generate_recommendation(model,data):
    frames = pd.DataFrame()
    movie_all  = data.movieId.unique().tolist()
    for i in movie_all:
        mask = data[data.movieId.isin([i])]
        rat = mask['rating'].tolist()
        mask = mask.drop(['userId','movieId','rating'],axis=1)
        pred = model.predict(mask)
        tmp = get_Movie_info(i)
        tmp['pred_rating'] = pred
        tmp['rating'] = rat
        frames = frames.append(tmp, ignore_index=True)

    return frames

In [16]:
test_generate_recommendation(model01,newdata[newdata['userId'] == 19])

Unnamed: 0,movieId,genres,title,original_title,year,duration,mean_vote,pred_rating,rating
0,640,Drama|Thriller,Diabolique,Diabolique,1996,107,5.5,3.052536,3.0
1,1321,Comedy|Horror|Thriller,Un lupo mannaro americano a Londra,An American Werewolf in London,1981,97,7.6,2.882828,3.0
2,10,Action|Adventure|Thriller,GoldenEye,GoldenEye,1995,130,7.3,3.937888,4.0
3,21,Comedy|Crime|Thriller,Get Shorty,Get Shorty,1995,105,7.0,3.231258,4.0
4,32,Mystery|Sci-Fi|Thriller,L'esercito delle 12 scimmie,Twelve Monkeys,1995,129,8.0,3.985126,3.0
...,...,...,...,...,...,...,...,...,...
232,1359,Children|Comedy,Una promessa è una promessa,Jingle All the Way,1996,89,5.8,3.604990,4.0
233,1370,Action|Adventure|Thriller,58 minuti per morire - Die Harder,Die Hard 2,1990,124,7.2,3.978699,4.0
234,1377,Action|Crime,Batman - Il ritorno,Batman Returns,1992,126,7.1,4.155005,4.0
235,1382,Action|Drama,Programmato per uccidere,Marked for Death,1990,93,6.1,2.515264,3.0


### Generate_Recommendation

In [17]:
def generate_recommendation(model,data,user_id):
    
    if user_id not in data.userId.to_list():
        print('user does not exist!')
    
    movie_seen = data[data.userId==user_id].movieId.unique().tolist()
    movie_all  = data.movieId.unique().tolist()
    
    print(f"user({user_id}) see {len(movie_seen)} movie from {len(movie_all)} movie")
    
    Movies_ID = list(set(movie_all)-set(movie_seen))
    random.shuffle(Movies_ID)
        
    data = data.drop(['userId','rating'],axis=1)
    data = data.drop_duplicates(subset=['movieId'])
    frames = pd.DataFrame()
    
    for i in movie_all:
        mask = data[data.movieId.isin([i])]
        mask = mask.drop(['movieId'],axis=1)
        pred = model.predict(mask)
        tmp = get_Movie_info(i)
        tmp['pred_rating'] = pred
        frames = frames.append(tmp, ignore_index=True)

    return frames

In [18]:
generate_recommendation(model01,newdata,19)

user(19) see 237 movie from 3719 movie


Unnamed: 0,movieId,genres,title,original_title,year,duration,mean_vote,pred_rating
0,640,Drama|Thriller,Diabolique,Diabolique,1996,107,5.5,3.052536
1,1321,Comedy|Horror|Thriller,Un lupo mannaro americano a Londra,An American Werewolf in London,1981,97,7.6,2.882828
2,1645,Drama|Mystery|Thriller,L'avvocato del diavolo,The Devil's Advocate,1997,144,7.6,4.621778
3,1825,Comedy|Drama,The Players Club,The Players Club,1998,104,6.2,3.071371
4,1985,Horror,Halloween 4 - Il ritorno di Michael Myers,Halloween 4: The Return of Michael Myers,1988,88,6.2,2.308758
...,...,...,...,...,...,...,...,...
3714,4449,Drama,Adanggaman,Adanggaman,2000,90,7.0,4.858596
3715,4962,Adventure|Western,Texas Rangers,Texas Rangers,2001,90,5.3,3.915496
3716,4764,Romance|Thriller,Kill Me Later,Kill Me Later,2001,89,6.6,3.843874
3717,5041,Animation|Fantasy,Fire and Ice - Fuoco e ghiaccio,Fire and Ice,1983,81,6.8,2.299033
