In [3]:
import pandas as pd 
from surprise import Reader, Dataset, SVD, NormalPredictor, BaselineOnly, KNNBasic, NMF
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

In [4]:
movies = pd.read_csv('./movielens_dataset/u.item' , header = None , sep = "|" , encoding='latin-1')
movies.columns = ['movie_id' , 'movie_title' , 'release_date' , 'video_release_date' ,
             'IMDb_URL' , 'unknown' , 'Action' , 'Adventure' , 'Animation' ,
             'Childrens' , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
             'Film_Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci_Fi' ,
             'Thriller' , 'War' , 'Western']

users = pd.read_csv('./movielens_dataset/u.user', header= None , sep = '|')
users.columns = ['user_id' , 'age' , 'gender' , 'occupation' , 'zip_code']

ratings = pd.read_csv('./movielens_dataset/u.data', header= None , sep = '\t')
ratings.columns = ['user_id' , 'movie_id' , 'rating' , 'timestamp']

In [5]:
data = ratings.merge(users , on='user_id')
data = data.merge(movies , on='movie_id')
data.head(3)

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation,zip_code,movie_title,release_date,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,196,242,3,881250949,49,M,writer,55105,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
1,305,242,5,886307828,23,M,programmer,94086,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
2,6,242,4,883268170,42,M,executive,98101,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0


In [6]:
reader = Reader()
dataset = Dataset.load_from_df(data[['user_id', 'movie_id', 'rating']], reader)
kf = KFold(n_splits=5)
kf.split(dataset)

<generator object KFold.split at 0x7f99205cb970>

In [5]:
algorithm = NormalPredictor()
cross_validate(algorithm, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5143  1.5215  1.5113  1.5098  1.5110  1.5135  0.0042  
MAE (testset)     1.2150  1.2189  1.2133  1.2117  1.2118  1.2142  0.0027  
Fit time          0.10    0.13    0.13    0.14    0.13    0.13    0.01    
Test time         0.19    0.13    0.14    0.19    0.18    0.17    0.02    


{'test_rmse': array([1.51425978, 1.52148053, 1.51126707, 1.50976416, 1.51095733]),
 'test_mae': array([1.214998  , 1.21894758, 1.21330018, 1.21171602, 1.21181009]),
 'fit_time': (0.10125041007995605,
  0.12813353538513184,
  0.13375282287597656,
  0.1360940933227539,
  0.13320159912109375),
 'test_time': (0.19108080863952637,
  0.1343975067138672,
  0.13960862159729004,
  0.18828415870666504,
  0.18171238899230957)}

In [6]:
algorithm = SVD()
cross_validate(algorithm, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9333  0.9423  0.9334  0.9354  0.9340  0.9357  0.0034  
MAE (testset)     0.7357  0.7432  0.7338  0.7374  0.7380  0.7376  0.0031  
Fit time          6.40    7.37    6.97    6.17    6.66    6.71    0.42    
Test time         0.15    0.19    0.13    0.14    0.17    0.16    0.02    


{'test_rmse': array([0.93330953, 0.94229762, 0.93340524, 0.93543194, 0.93400545]),
 'test_mae': array([0.73572345, 0.7432017 , 0.73381064, 0.73739257, 0.73796651]),
 'fit_time': (6.4001545906066895,
  7.366527080535889,
  6.969428300857544,
  6.166753530502319,
  6.66348123550415),
 'test_time': (0.14932990074157715,
  0.1907520294189453,
  0.13091206550598145,
  0.13720703125,
  0.17358756065368652)}

In [7]:
algorithm = KNNBasic(k=20)
cross_validate(algorithm, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9752  0.9767  0.9767  0.9792  0.9772  0.9770  0.0013  
MAE (testset)     0.7684  0.7701  0.7713  0.7725  0.7706  0.7706  0.0014  
Fit time          0.26    0.27    0.30    0.31    0.27    0.28    0.02    
Test time         2.33    2.26    2.30    2.38    2.37    2.33    0.04    


{'test_rmse': array([0.97523472, 0.97669678, 0.97674796, 0.97916361, 0.9772161 ]),
 'test_mae': array([0.76842078, 0.7700646 , 0.77127075, 0.77253517, 0.77060034]),
 'fit_time': (0.2559497356414795,
  0.2668159008026123,
  0.2952394485473633,
  0.3067927360534668,
  0.26993441581726074),
 'test_time': (2.33162260055542,
  2.260124921798706,
  2.3034017086029053,
  2.375002145767212,
  2.366453170776367)}

In [8]:
algorithm = KNNBasic(sim_options={'user_based': False} , k=20) # https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#similarity-measure-configuration
cross_validate(algorithm, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9820  0.9839  0.9840  0.9848  0.9812  0.9832  0.0014  
MAE (testset)     0.7757  0.7776  0.7766  0.7774  0.7740  0.7763  0.0013  
Fit time          0.46    0.51    0.46    0.50    0.44    0.47    0.03    
Test time         2.78    2.86    2.83    2.77    2.66    2.78    0.07    


{'test_rmse': array([0.98202273, 0.98394501, 0.98404681, 0.98480242, 0.98117066]),
 'test_mae': array([0.77571131, 0.77761229, 0.77661958, 0.7774048 , 0.7740139 ]),
 'fit_time': (0.45612668991088867,
  0.5093095302581787,
  0.4637179374694824,
  0.5021648406982422,
  0.4422900676727295),
 'test_time': (2.7753746509552,
  2.860525369644165,
  2.8316128253936768,
  2.77130389213562,
  2.6636292934417725)}

In [9]:
algorithm = NMF()
cross_validate(algorithm, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9574  0.9684  0.9671  0.9570  0.9714  0.9642  0.0059  
MAE (testset)     0.7559  0.7596  0.7590  0.7529  0.7623  0.7579  0.0032  
Fit time          4.30    4.28    4.29    4.28    4.25    4.28    0.02    
Test time         0.11    0.15    0.15    0.11    0.16    0.14    0.02    


{'test_rmse': array([0.95741139, 0.96836161, 0.96707204, 0.95699722, 0.97139041]),
 'test_mae': array([0.75588511, 0.75961746, 0.75904561, 0.75287508, 0.76227074]),
 'fit_time': (4.29699969291687,
  4.2832114696502686,
  4.287365913391113,
  4.279693841934204,
  4.249091625213623),
 'test_time': (0.11280989646911621,
  0.1497507095336914,
  0.15164566040039062,
  0.1120767593383789,
  0.15675616264343262)}

In [7]:
trainset = dataset.build_full_trainset()
algorithm = SVD(n_factors = 200 , lr_all = 0.005 , reg_all = 0.02 , n_epochs = 40 , init_std_dev = 0.05)
algorithm.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9920994c70>

In [8]:
def get_recommendations(uid = None , iid = None):
    predictions = []
    if uid is None:
        for ui in trainset.all_users():
            predictions.append(algorithm.predict(ui, iid, verbose = False))
        return predictions
    
    if iid is None:
        for ii in trainset.all_items():
            ii = trainset.to_raw_iid(ii)
            predictions.append(algorithm.predict(uid, ii, verbose = False))
        return predictions
    return predictins.append(algorithm.predict(uid,iid,verbose = False))

In [9]:
predictions = get_recommendations(uid = 1)
predictions.sort(key=lambda x: x.est, reverse=True)

[Prediction(uid=1, iid=302, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=511, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=154, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=174, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=652, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=169, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=171, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=48, r_ui=None, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=50, r_ui=None, est=4.982723791553698, details={'was_impossible': False}),
 Prediction(uid=1, iid=109, r_ui=None, est=4.974462115714218, details={'was_impossible': False}),
 Prediction(uid=1, iid=175, r_ui=None, est=4.969747673114561, details={'was_impossible': False}),
 Prediction(uid=1, iid=87, r_ui=None, est=4.96879696

In [19]:
str(movies[movies['movie_id'] == int('178')]['movie_title'].to_numpy()[0])

'12 Angry Men (1957)'