In [1]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB :
    from google.colab import drive
    drive.mount('/content/drive')
    
if IN_COLAB :
  Dataset_path = "/content/drive/MyDrive/Colab Notebooks/DataSets/"
else :
  Dataset_path = "./DataSets/"

ml_path = Dataset_path + "ml-latest/"

In [18]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import seaborn as sns

import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import mean_squared_error,mean_absolute_error
import random

In [3]:
Movies_metadata = pd.read_csv(Dataset_path+"Movies_metadata.csv")
ratings_2001 = pd.read_csv(Dataset_path+"ratings_2001.csv")

In [4]:
data =  Movies_metadata.drop(['genre', 'production_company', 'actors', 'avg_vote',
       'description', 'language', 'country', 'total_votes', 'title',
       'imdb_title_id', 'writer','original_title' , 'genres','votes', 'date_published' ],axis=1)

In [5]:
FinalDataset = pd.merge(ratings_2001, data, how='inner')
FinalDataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4328182 entries, 0 to 4328181
Data columns (total 49 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   userId                 int64  
 1   movieId                int64  
 2   rating                 float64
 3   timestamp              int64  
 4   budget                 float64
 5   usa_gross_income       float64
 6   worlwide_gross_income  float64
 7   metascore              float64
 8   reviews_from_users     float64
 9   reviews_from_critics   float64
 10  director_r             float64
 11  mean_vote              float64
 12  duration               float64
 13  year                   float64
 14  EU                     int64  
 15  AS                     int64  
 16  NA                     int64  
 17  AF                     int64  
 18  AN                     int64  
 19  SA                     int64  
 20  OC                     int64  
 21  idk                    int64  
 22  English           

In [6]:
newdata =  FinalDataset.drop(['timestamp'],axis=1)

In [8]:
#from sklearn.decomposition import PCA
#pca01 = PCA(n_components = 5)
#principlecomp_train = pca01.fit_transform(newdata.drop(['movieId','userId','rating'],axis=1))
#pca01.explained_variance_ratio_

### Keras

In [9]:
model01 = Sequential()
model01.add(Dense(300, activation='relu', input_dim=45))
model01.add(Dense(150, activation='relu'))
model01.add(Dense(30, activation='relu'))
model01.add(Dense(1, activation='linear'))

# Compile the model
model01.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mean_squared_error'])

### Fit Model

In [10]:
def fit_model(model,data):
    target = data['rating'] 
    predictors = data.drop(['userId','movieId','rating'],axis=1)
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.30, random_state=40)
    model.fit(X_train, y_train, epochs=20, batch_size=10)
    y_pred= model.predict(X_test)
    print('MSE:',mean_squared_error(y_test, y_pred))
    print('MAE:',mean_absolute_error(y_test, y_pred))
    print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred)))

In [11]:
fit_model(model01,newdata[newdata['userId'] == 19])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MSE: 0.4048128347328403
MAE: 0.5020923571927207
RMSE: 0.6362490351527775


In [12]:
#newdata[newdata['userId'] == 19]

In [13]:
def get_Movie_info(Movie_id):
    
    """
    Returns some basic information about a Movie given the Movie id and the metadata dataframe.
    """
    
    Movie_info = Movies_metadata[Movies_metadata['movieId'] == int(Movie_id)][['movieId', 'genres', 
                                                            'title', 'original_title', 'year', 'duration','mean_vote']]
    return Movie_info


### Test Generate_Recommendation

In [14]:
def test_generate_recommendation(model,data):
    frames = pd.DataFrame()
    movie_all  = data.movieId.unique().tolist()
    for i in movie_all:
        mask = data[data.movieId.isin([i])]
        rat = mask['rating'].tolist()
        mask = mask.drop(['userId','movieId','rating'],axis=1)
        pred = model.predict(mask)
        tmp = get_Movie_info(i)
        tmp['pred_rating'] = pred
        tmp['rating'] = rat
        frames = frames.append(tmp, ignore_index=True)

    return frames

In [15]:
test_generate_recommendation(model01,newdata[newdata['userId'] == 19])

Unnamed: 0,movieId,genres,title,original_title,year,duration,mean_vote,pred_rating,rating
0,640,Drama|Thriller,Diabolique,Diabolique,0.234654,0.297962,-0.848307,2.905211,3.0
1,1321,Comedy|Horror|Thriller,Un lupo mannaro americano a Londra,An American Werewolf in London,-0.372194,-0.174812,1.177577,3.280694,3.0
2,21,Comedy|Crime|Thriller,Get Shorty,Get Shorty,0.194197,0.203407,0.598753,3.937194,4.0
3,32,Mystery|Sci-Fi|Thriller,L'esercito delle 12 scimmie,Twelve Monkeys,0.194197,1.338065,1.563459,3.497882,3.0
4,161,Drama|Thriller|War,Allarme rosso,Crimson Tide,0.194197,0.723458,0.984636,4.151942,4.0
...,...,...,...,...,...,...,...,...,...
179,1275,Action|Adventure|Fantasy,Highlander - L'ultimo immortale,Highlander,-0.169911,0.723458,0.888165,3.161069,3.0
180,1281,Comedy|Drama|War,Il grande dittatore,The Great Dictator,-2.030909,1.148955,1.852871,4.032206,4.0
181,1359,Children|Comedy,Una promessa è una promessa,Jingle All the Way,0.234654,-0.553031,-0.558895,3.736671,4.0
182,1382,Action|Drama,Programmato per uccidere,Marked for Death,-0.008085,-0.363922,-0.269483,2.970121,3.0


### Generate_Recommendation

In [19]:
def generate_recommendation(model,data,user_id):
    
    if user_id not in data.userId.to_list():
        print('user does not exist!')
    
    movie_seen = data[data.userId==user_id].movieId.unique().tolist()
    movie_all  = data.movieId.unique().tolist()
    
    print(f"user({user_id}) see {len(movie_seen)} movie from {len(movie_all)} movie")
    
    Movies_ID = list(set(movie_all)-set(movie_seen))
    random.shuffle(Movies_ID)
        
    data = data.drop(['userId','rating'],axis=1)
    data = data.drop_duplicates(subset=['movieId'])
    frames = pd.DataFrame()
    
    for i in movie_all:
        mask = data[data.movieId.isin([i])]
        mask = mask.drop(['movieId'],axis=1)
        pred = model.predict(mask)
        tmp = get_Movie_info(i)
        tmp['pred_rating'] = pred
        frames = frames.append(tmp, ignore_index=True)

    return frames

In [22]:
generate_recommendation(model01,newdata,19)

user(19) see 184 movie from 4119 movie


MemoryError: Unable to allocate 1.16 GiB for an array with shape (36, 4328182) and data type int64