In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from surprise import Reader, Dataset


In [2]:
u_cols = ['user_id', 'sex', 'age', 'occupation', 'zip_code']
users = pd.read_csv('data/users.dat', sep="::", names=u_cols,
encoding='latin-1')


  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
users.head()

Unnamed: 0,user_id,sex,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


The names of the occupations need to be substituted according to the following:
    
	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"

In [4]:
users['occupation'] = users['occupation'].astype('str')

In [5]:
users['occupation'] = users['occupation'].str.replace("10", "K student")
users['occupation'] = users['occupation'].str.replace("11", "lawyer")
users['occupation'] = users['occupation'].str.replace("12", "programmer")
users['occupation'] = users['occupation'].str.replace("13", "retired")
users['occupation'] = users['occupation'].str.replace("14", "sales/marketing")
users['occupation'] = users['occupation'].str.replace("15", "scientist")
users['occupation'] = users['occupation'].str.replace("16", "self-employed")
users['occupation'] = users['occupation'].str.replace("17", "technician/engineer")
users['occupation'] = users['occupation'].str.replace("18", "tradesman/craftsman")
users['occupation'] = users['occupation'].str.replace("19", "unemployed")
users['occupation'] = users['occupation'].str.replace("20", "writer")
users['occupation'] = users['occupation'].str.replace("0", "other or not specified")
users['occupation'] = users['occupation'].str.replace("1", "academic/educator")
users['occupation'] = users['occupation'].str.replace("2", "artist")
users['occupation'] = users['occupation'].str.replace("3", "clerical/admin")
users['occupation'] = users['occupation'].str.replace("4", "college/grad student")
users['occupation'] = users['occupation'].str.replace("5", "customer service")
users['occupation'] = users['occupation'].str.replace("6", "doctor/health care")
users['occupation'] = users['occupation'].str.replace("7", "executive/managerial")
users['occupation'] = users['occupation'].str.replace("8", "farmer")
users['occupation'] = users['occupation'].str.replace("9", "homemaker")


In [6]:
users

Unnamed: 0,user_id,sex,age,occupation,zip_code
0,1,F,1,K student,48067
1,2,M,56,self-employed,70072
2,3,M,25,scientist,55117
3,4,M,45,executive/managerial,02460
4,5,M,25,writer,55455
...,...,...,...,...,...
6035,6036,F,25,scientist,32603
6036,6037,F,45,academic/educator,76006
6037,6038,F,56,academic/educator,14706
6038,6039,F,45,other or not specified,01060


In [7]:
#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ratings.dat', sep='::', names=r_cols,
encoding='latin-1')

  after removing the cwd from sys.path.


In [8]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
movies_columns = ['Title', 'Genre']
movies_1 = pd.read_csv('data/movies.dat', sep='::',names=movies_columns,
encoding='latin-1')

  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
movies_1.reset_index(inplace=True)

In [11]:
movies_1.head()

Unnamed: 0,index,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
movies_1.drop('index', inplace=True, axis=1)

In [13]:
# Creating list of multi-labels
types_list = movies_1.Genre.apply(lambda x: list(x.split("|")))


In [14]:
types_list

0        [Animation, Children's, Comedy]
1       [Adventure, Children's, Fantasy]
2                      [Comedy, Romance]
3                        [Comedy, Drama]
4                               [Comedy]
                      ...               
3878                            [Comedy]
3879                             [Drama]
3880                             [Drama]
3881                             [Drama]
3882                   [Drama, Thriller]
Name: Genre, Length: 3883, dtype: object

In [15]:
# Converting it into dataframe and working on it seperately
types_df =pd.DataFrame({"Type":types_list})
types_df.head()

Unnamed: 0,Type
0,"[Animation, Children's, Comedy]"
1,"[Adventure, Children's, Fantasy]"
2,"[Comedy, Romance]"
3,"[Comedy, Drama]"
4,[Comedy]


In [16]:
# importing MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
# instantiating MultiLabelBinarizer
mlb = MultiLabelBinarizer()
types_encoded = pd.DataFrame(mlb.fit_transform(types_df["Type"]),columns=mlb.classes_)
types_encoded.head()

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
movies_fin = pd.concat([movies_1,types_encoded], axis=1)

In [18]:
movies_fin

Unnamed: 0,Title,Genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,Meet the Parents (2000),Comedy,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,Requiem for a Dream (2000),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,Tigerland (2000),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,Two Family House (2000),Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


# 1st Collaborative Filtering with Surprise

In [32]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
user_id           1000209 non-null int64
movie_id          1000209 non-null int64
rating            1000209 non-null int64
unix_timestamp    1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB


In [34]:
ratings_ = ratings.drop('unix_timestamp',axis=1)

In [36]:
ratings_.shape

(1000209, 3)

In [37]:
reader = Reader()
data = Dataset.load_from_df(ratings_,reader)

In [38]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  6040 

Number of items:  3706


In [39]:
# importing relevant libraries
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [40]:
## Perform a gridsearch with SVD
params = {'n_factors': [20, 50, 100],
         'reg_all': [0.02, 0.05, 0.1]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(data)


In [41]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 0.87042939959048, 'mae': 0.6847368633189289}
{'rmse': {'n_factors': 100, 'reg_all': 0.05}, 'mae': {'n_factors': 50, 'reg_all': 0.02}}


In [42]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options={'name':'pearson', 'user_based':True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [43]:
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([0.96206154, 0.96340425, 0.96142395, 0.96152729, 0.96171622]))
('test_mae', array([0.76582308, 0.76671178, 0.76471189, 0.76559246, 0.76593857]))
('fit_time', (213.03173875808716, 232.32887768745422, 241.76193499565125, 188.27505588531494, 93.68713212013245))
('test_time', (233.71677112579346, 222.43356704711914, 214.04109692573547, 198.66805386543274, 100.60746788978577))
-----------------------
0.9620266494953047


# Making Recommendations

In [44]:
movies_fin.head()

Unnamed: 0,Title,Genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [45]:
svd = SVD(n_factors= 50, reg_all=0.05)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd98d65d950>