In [17]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
import difflib
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import LinearRegression

In [2]:
Dataset_path = "./DataSets/"
Movies_metadata = pd.read_csv(Dataset_path+"MoviesInfo.csv")
ratings = pd.read_csv(Dataset_path+"ml-latest/ratings.csv")

### just keep rating year between 1995, 2001

In [3]:
from datetime import datetime

# strftime('%Y-%m-%d %H:%M:%S')
ratings['year_rated'] = ratings['timestamp'].apply(lambda x: int( datetime.fromtimestamp( x ).strftime('%Y') ) )

ratings = ratings[ ratings['year_rated'].between(1995, 2001)].copy()

ratings.reset_index(drop=True, inplace=True)

In [4]:
print( f"number of Ratings : { ratings.shape[0] }")
print( f"number of movies : { ratings.groupby('movieId').count().shape[0] }")
print( f"number of users : { ratings.groupby('userId').count().shape[0] }")
print( f"range of rating : ( { np.min( ratings.groupby('rating').count().index )}, {np.max( ratings.groupby('rating').count().index )})  ")

number of Ratings : 7329482
number of movies : 4937
number of users : 103827
range of rating : ( 1.0, 5.0)  


In [5]:
MyCol = ['movieId','year','duration','metascore','reviews_from_users','reviews_from_critics'
         ,'country','language','director','production_company','mean_vote']
FinalDataset = pd.merge(ratings, Movies_metadata[ MyCol ], how='inner')

In [6]:
FinalDataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7206877 entries, 0 to 7206876
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   userId                int64  
 1   movieId               int64  
 2   rating                float64
 3   timestamp             int64  
 4   year_rated            int64  
 5   year                  int64  
 6   duration              int64  
 7   metascore             float64
 8   reviews_from_users    float64
 9   reviews_from_critics  float64
 10  country               object 
 11  language              object 
 12  director              object 
 13  production_company    object 
 14  mean_vote             float64
dtypes: float64(5), int64(6), object(4)
memory usage: 879.7+ MB


In [7]:
myobj = ['country','language','director','production_company']
obj_FinalDataset= FinalDataset.select_dtypes(include=['object']).copy()

for i in myobj:
    obj_FinalDataset[i] = obj_FinalDataset[i].astype('category')
    FinalDataset[i] = obj_FinalDataset[i].cat.codes
    
FinalDataset = FinalDataset.fillna(100000)
FinalDataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7206877 entries, 0 to 7206876
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   userId                int64  
 1   movieId               int64  
 2   rating                float64
 3   timestamp             int64  
 4   year_rated            int64  
 5   year                  int64  
 6   duration              int64  
 7   metascore             float64
 8   reviews_from_users    float64
 9   reviews_from_critics  float64
 10  country               int16  
 11  language              int16  
 12  director              int16  
 13  production_company    int16  
 14  mean_vote             float64
dtypes: float64(5), int16(4), int64(6)
memory usage: 714.8 MB


In [8]:
data =  FinalDataset[['userId' ,'movieId','year','duration','metascore','reviews_from_users','reviews_from_critics'
                              ,'country','language','director','production_company','mean_vote','rating']]

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled = scaler.fit_transform(data.drop(['movieId','userId','rating'],axis=1))
data_norm = pd.DataFrame(scaled,columns=(data.drop(['movieId','userId','rating'],axis=1).columns))
data_norm= data_norm.join(data['userId'])
data_norm= data_norm.join(data['movieId'])
data_norm= data_norm.join(data['rating'])

                               
from scipy import stats
z_scores = stats.zscore(data_norm.drop(['movieId','userId'],axis=1))
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
newdata = data_norm[filtered_entries]

newdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6529017 entries, 0 to 7206874
Data columns (total 13 columns):
 #   Column                Dtype  
---  ------                -----  
 0   year                  float64
 1   duration              float64
 2   metascore             float64
 3   reviews_from_users    float64
 4   reviews_from_critics  float64
 5   country               float64
 6   language              float64
 7   director              float64
 8   production_company    float64
 9   mean_vote             float64
 10  userId                int64  
 11  movieId               int64  
 12  rating                float64
dtypes: float64(11), int64(2)
memory usage: 697.4 MB


# Regression

In [11]:
predictors = newdata.drop(['movieId','userId','rating'],axis=1)
target = newdata['rating']
X_train, X_test, Y_train, Y_test = train_test_split(predictors,target, test_size = 0.3)

### Poisson Regression

In [18]:
model01 = PoissonRegressor().fit(X_train, Y_train)

In [19]:
y_pred01 = model01.predict(X_test)

In [20]:
print('MSE:',mean_squared_error(Y_test, y_pred01))
print('MAE:',mean_absolute_error(Y_test, y_pred01))
print('RMSE:',np.sqrt(mean_squared_error(Y_test, y_pred01)))

MSE: 0.9992985404102491
MAE: 0.8039737149558099
RMSE: 0.9996492086778487


### Linear Regression

In [21]:
model02 = LinearRegression().fit(X_train, Y_train)

In [22]:
y_pred02 = model02.predict(X_test)

In [23]:
print('MSE:',mean_squared_error(Y_test, y_pred02))
print('MAE:',mean_absolute_error(Y_test, y_pred02))
print('RMSE:',np.sqrt(mean_squared_error(Y_test, y_pred02)))

MSE: 0.9895843165844799
MAE: 0.7954320952504925
RMSE: 0.9947785263989567
