In [1]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB :
    from google.colab import drive
    drive.mount('/content/drive')
    
if IN_COLAB :
  Dataset_path = "/content/drive/MyDrive/Colab Notebooks/DataSets/"
else :
  Dataset_path = "./DataSets/"

ml_path = Dataset_path + "ml-latest/"

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
import difflib
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import LinearRegression

In [3]:
Movies_metadata = pd.read_csv(Dataset_path+"Movies_metadata.csv")
ratings_2001 = pd.read_csv(Dataset_path+"ratings_2001.csv")

In [5]:
data =  Movies_metadata.drop(['genre', 'production_company', 'actors', 'avg_vote',
       'description', 'language', 'country', 'total_votes', 'title',
       'imdb_title_id', 'writer','original_title' , 'genres','votes', 'date_published' ],axis=1)

In [7]:
FinalDataset = pd.merge(ratings_2001, data, how='inner')
FinalDataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4328182 entries, 0 to 4328181
Data columns (total 49 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   userId                 int64  
 1   movieId                int64  
 2   rating                 float64
 3   timestamp              int64  
 4   budget                 float64
 5   usa_gross_income       float64
 6   worlwide_gross_income  float64
 7   metascore              float64
 8   reviews_from_users     float64
 9   reviews_from_critics   float64
 10  director_r             float64
 11  mean_vote              float64
 12  duration               float64
 13  year                   float64
 14  EU                     int64  
 15  AS                     int64  
 16  NA                     int64  
 17  AF                     int64  
 18  AN                     int64  
 19  SA                     int64  
 20  OC                     int64  
 21  idk                    int64  
 22  English           

In [8]:
newdata =  FinalDataset.drop(['timestamp'],axis=1)

In [25]:
#from sklearn.decomposition import PCA
#pca01 = PCA(n_components = 5)
#principlecomp_train = pca01.fit_transform(newdata.drop(['movieId','userId','rating'],axis=1))
#pca01.explained_variance_ratio_

array([0.23716036, 0.12885989, 0.10128093, 0.06290801, 0.05666811])

# Regression

In [9]:
def fit_model(model,data):
    target = data['rating'] 
    predictors = data.drop(['userId','movieId','rating'],axis=1)
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.30, random_state=40)
    model.fit(X_train, y_train)
    y_pred= model.predict(X_test)
    print('MSE:',mean_squared_error(y_test, y_pred))
    print('MAE:',mean_absolute_error(y_test, y_pred))
    print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred)))

### Poisson Regression

In [10]:
model01 = PoissonRegressor()

In [11]:
fit_model(model01,newdata[newdata['userId'] == 19])

MSE: 0.2865158361823043
MAE: 0.44229027633090656
RMSE: 0.5352717405041147


### Linear Regression

In [12]:
model02 = LinearRegression()

In [13]:
fit_model(model02,newdata[newdata['userId'] == 19])

MSE: 0.42335147855237265
MAE: 0.5035381593896255
RMSE: 0.6506546538313337


In [22]:
def get_Movie_info(Movie_id):
    
    """
    Returns some basic information about a Movie given the Movie id and the metadata dataframe.
    """
    
    Movie_info = Movies_metadata[Movies_metadata['movieId'] == int(Movie_id)][['movieId', 'genres', 
                                                            'title', 'original_title', 'year', 'duration','mean_vote']]
    return Movie_info


### Test Generate_Recommendation

In [23]:
def test_generate_recommendation(model,data):
    frames = pd.DataFrame()
    movie_all  = data.movieId.unique().tolist()
    for i in movie_all:
        mask = data[data.movieId.isin([i])]
        rat = mask['rating'].tolist()
        mask = mask.drop(['userId','movieId','rating'],axis=1)
        pred = model.predict(mask)
        tmp = get_Movie_info(i)
        tmp['pred_rating'] = pred
        tmp['rating'] = rat
        frames = frames.append(tmp, ignore_index=True)

    return frames

In [16]:
test_generate_recommendation(model01,newdata[newdata['userId'] == 19])

Unnamed: 0,movieId,genres,title,original_title,year,duration,mean_vote,pred_rating,rating
0,640,Drama|Thriller,Diabolique,Diabolique,0.234654,0.297962,-0.848307,3.360425,3.0
1,1321,Comedy|Horror|Thriller,Un lupo mannaro americano a Londra,An American Werewolf in London,-0.372194,-0.174812,1.177577,3.126529,3.0
2,21,Comedy|Crime|Thriller,Get Shorty,Get Shorty,0.194197,0.203407,0.598753,3.338123,4.0
3,32,Mystery|Sci-Fi|Thriller,L'esercito delle 12 scimmie,Twelve Monkeys,0.194197,1.338065,1.563459,3.597817,3.0
4,161,Drama|Thriller|War,Allarme rosso,Crimson Tide,0.194197,0.723458,0.984636,3.489956,4.0
...,...,...,...,...,...,...,...,...,...
179,1275,Action|Adventure|Fantasy,Highlander - L'ultimo immortale,Highlander,-0.169911,0.723458,0.888165,3.201305,3.0
180,1281,Comedy|Drama|War,Il grande dittatore,The Great Dictator,-2.030909,1.148955,1.852871,3.648058,4.0
181,1359,Children|Comedy,Una promessa è una promessa,Jingle All the Way,0.234654,-0.553031,-0.558895,3.037540,4.0
182,1382,Action|Drama,Programmato per uccidere,Marked for Death,-0.008085,-0.363922,-0.269483,3.143623,3.0


In [17]:
test_generate_recommendation(model02,newdata[newdata['userId'] == 19])

Unnamed: 0,movieId,genres,title,original_title,year,duration,mean_vote,pred_rating,rating
0,640,Drama|Thriller,Diabolique,Diabolique,0.234654,0.297962,-0.848307,3.500201,3.0
1,1321,Comedy|Horror|Thriller,Un lupo mannaro americano a Londra,An American Werewolf in London,-0.372194,-0.174812,1.177577,2.862895,3.0
2,21,Comedy|Crime|Thriller,Get Shorty,Get Shorty,0.194197,0.203407,0.598753,3.234848,4.0
3,32,Mystery|Sci-Fi|Thriller,L'esercito delle 12 scimmie,Twelve Monkeys,0.194197,1.338065,1.563459,3.155579,3.0
4,161,Drama|Thriller|War,Allarme rosso,Crimson Tide,0.194197,0.723458,0.984636,3.220268,4.0
...,...,...,...,...,...,...,...,...,...
179,1275,Action|Adventure|Fantasy,Highlander - L'ultimo immortale,Highlander,-0.169911,0.723458,0.888165,2.793697,3.0
180,1281,Comedy|Drama|War,Il grande dittatore,The Great Dictator,-2.030909,1.148955,1.852871,3.776567,4.0
181,1359,Children|Comedy,Una promessa è una promessa,Jingle All the Way,0.234654,-0.553031,-0.558895,2.984173,4.0
182,1382,Action|Drama,Programmato per uccidere,Marked for Death,-0.008085,-0.363922,-0.269483,3.213894,3.0


### Generate_Recommendation

In [18]:
def generate_recommendation(model,data,user_id):
    
    if user_id not in data.userId.to_list():
        print('user does not exist!')
    
    movie_seen = data[data.userId==user_id].movieId.unique().tolist()
    movie_all  = data.movieId.unique().tolist()
    
    print(f"user({user_id}) see {len(movie_seen)} movie from {len(movie_all)} movie")
    
    Movies_ID = list(set(movie_all)-set(movie_seen))
    random.shuffle(Movies_ID)
        
    data = data.drop(['userId','rating'],axis=1)
    data = data.drop_duplicates(subset=['movieId'])
    frames = pd.DataFrame()
    
    for i in movie_all:
        mask = data[data.movieId.isin([i])]
        mask = mask.drop(['movieId'],axis=1)
        pred = model.predict(mask)
        tmp = get_Movie_info(i)
        tmp['pred_rating'] = pred
        frames = frames.append(tmp, ignore_index=True)

    return frames

In [19]:
generate_recommendation(model01,newdata,19)

user(19) see 184 movie from 4119 movie


Unnamed: 0,movieId,genres,title,original_title,year,duration,mean_vote,pred_rating
0,640,Drama|Thriller,Diabolique,Diabolique,0.234654,0.297962,-0.848307,3.360425
1,828,Adventure|Children,Le straordinarie avventure di Pinocchio,The Adventures of Pinocchio,0.234654,-0.411199,-0.848307,3.054781
2,960,Crime|Drama,Infernale avventura (L'),Angel on My Shoulder,-1.788170,-0.032980,0.598753,3.232400
3,1321,Comedy|Horror|Thriller,Un lupo mannaro americano a Londra,An American Werewolf in London,-0.372194,-0.174812,1.177577,3.126529
4,1645,Drama|Mystery|Thriller,L'avvocato del diavolo,The Devil's Advocate,0.275110,2.047226,1.177577,3.831189
...,...,...,...,...,...,...,...,...
4114,4764,Romance|Thriller,Kill Me Later,Kill Me Later,0.436936,-0.553031,0.212870,3.078412
4115,5041,Animation|Fantasy,Fire and Ice - Fuoco e ghiaccio,Fire and Ice,-0.291281,-0.931251,0.405812,3.009320
4116,3195,Drama,Tess of the Storm Country,Tess of the Storm Country,-2.759126,1.716284,0.888165,3.700442
4117,4905,Crime,Tommy Gibbs criminale per giustizia,Hell Up in Harlem,-0.695845,-0.316644,-0.173012,3.116410


In [24]:
generate_recommendation(model02,newdata,19)

user(19) see 184 movie from 4119 movie


Unnamed: 0,movieId,genres,title,original_title,year,duration,mean_vote,pred_rating
0,640,Drama|Thriller,Diabolique,Diabolique,0.234654,0.297962,-0.848307,3.500201
1,828,Adventure|Children,Le straordinarie avventure di Pinocchio,The Adventures of Pinocchio,0.234654,-0.411199,-0.848307,2.787182
2,960,Crime|Drama,Infernale avventura (L'),Angel on My Shoulder,-1.788170,-0.032980,0.598753,3.841807
3,1321,Comedy|Horror|Thriller,Un lupo mannaro americano a Londra,An American Werewolf in London,-0.372194,-0.174812,1.177577,2.862895
4,1645,Drama|Mystery|Thriller,L'avvocato del diavolo,The Devil's Advocate,0.275110,2.047226,1.177577,3.412358
...,...,...,...,...,...,...,...,...
4114,4764,Romance|Thriller,Kill Me Later,Kill Me Later,0.436936,-0.553031,0.212870,2.985396
4115,5041,Animation|Fantasy,Fire and Ice - Fuoco e ghiaccio,Fire and Ice,-0.291281,-0.931251,0.405812,2.979723
4116,3195,Drama,Tess of the Storm Country,Tess of the Storm Country,-2.759126,1.716284,0.888165,5.133274
4117,4905,Crime,Tommy Gibbs criminale per giustizia,Hell Up in Harlem,-0.695845,-0.316644,-0.173012,3.197085
