In [2]:
import pandas as pd
import numpy as np

In [3]:
import sys
sys.path.append("..")

## Data

In [5]:
# pass in column names for each CSV and read them using pandas. 
# Column names available in the readme file

#Reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../data/ml-100k/u.user', sep='|', names=u_cols,
 encoding='latin-1')

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('../data/ml-100k/u.data', sep='\t', names=r_cols,
 encoding='latin-1')

#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('../data/ml-100k/u.item', sep='|', names=i_cols,
 encoding='latin-1')

In [8]:
print (users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [9]:
print (ratings.shape)
ratings.head()

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [10]:
print (items.shape)
items.head()

(1682, 24)


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


#### Train-test dataset

In [12]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv('../data/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('../data/ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_base.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [13]:
print (ratings_base.shape)
ratings_base.head()

(90570, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [14]:
print (ratings_test.shape)
ratings_test.head()

(9430, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,20,4,887431883
1,1,33,4,878542699
2,1,61,4,878542420
3,1,117,3,874965739
4,1,155,2,878542201


## Use `turicreate` library

In [15]:
import turicreate as tc
train_data = tc.SFrame(ratings_base)
test_data = tc.SFrame(ratings_test)

In [16]:
train_data

user_id,movie_id,rating,unix_timestamp
1,1,5,874965758
1,2,3,876893171
1,3,4,878542960
1,4,3,876893119
1,5,3,889751712
1,6,5,887431973
1,7,4,875071561
1,8,1,875072484
1,9,5,878543541
1,10,3,875693118


In [17]:
test_data

user_id,movie_id,rating,unix_timestamp
1,20,4,887431883
1,33,4,878542699
1,61,4,878542420
1,117,3,874965739
1,155,2,878542201
1,160,4,875072547
1,171,5,889751711
1,189,3,888732928
1,202,5,875072442
1,265,4,878542441


## Baseline model: A Simple Popularity Model

In [18]:
popularity_model = tc.popularity_recommender.create(train_data, 
                                                    user_id='user_id', 
                                                    item_id='movie_id', 
                                                    target='rating')

In [21]:
#Get recommendations for first 5 users and print them
#users = range(1,6) specifies user ID of first 5 users
#k=5 specifies top 5 recommendations to be given

popularity_recomm = popularity_model.recommend(users=[1,2,3,4,5], k=5)
popularity_recomm.print_rows(num_rows=25)

+---------+----------+-------+------+
| user_id | movie_id | score | rank |
+---------+----------+-------+------+
|    1    |   1599   |  5.0  |  1   |
|    1    |   1201   |  5.0  |  2   |
|    1    |   1189   |  5.0  |  3   |
|    1    |   1122   |  5.0  |  4   |
|    1    |   814    |  5.0  |  5   |
|    2    |   1599   |  5.0  |  1   |
|    2    |   1201   |  5.0  |  2   |
|    2    |   1189   |  5.0  |  3   |
|    2    |   1122   |  5.0  |  4   |
|    2    |   814    |  5.0  |  5   |
|    3    |   1599   |  5.0  |  1   |
|    3    |   1201   |  5.0  |  2   |
|    3    |   1189   |  5.0  |  3   |
|    3    |   1122   |  5.0  |  4   |
|    3    |   814    |  5.0  |  5   |
|    4    |   1599   |  5.0  |  1   |
|    4    |   1201   |  5.0  |  2   |
|    4    |   1189   |  5.0  |  3   |
|    4    |   1122   |  5.0  |  4   |
|    4    |   814    |  5.0  |  5   |
|    5    |   1599   |  5.0  |  1   |
|    5    |   1201   |  5.0  |  2   |
|    5    |   1189   |  5.0  |  3   |
|    5    | 

In [24]:
ratings_base.groupby(by='movie_id')['rating'].mean().sort_values(ascending=False).head(20)

movie_id
1500    5.000000
1293    5.000000
1122    5.000000
1189    5.000000
1656    5.000000
1201    5.000000
1599    5.000000
814     5.000000
1467    5.000000
1536    5.000000
1449    4.714286
1642    4.500000
1463    4.500000
1594    4.500000
1398    4.500000
114     4.491525
408     4.480769
169     4.476636
318     4.475836
483     4.459821
Name: rating, dtype: float64

## Collaborative filtering model: `cosine`

In [28]:
#Training the model
item_sim_model_cos = tc.item_similarity_recommender.create(train_data, user_id='user_id', 
                                                       item_id='movie_id', target='rating', 
                                                       similarity_type='cosine')

#Making recommendations
item_sim_recomm_cos = item_sim_model.recommend(users=[1,2,3,4,5],k=5)
item_sim_recomm_cos.print_rows(num_rows=25)

+---------+----------+-------+------+
| user_id | movie_id | score | rank |
+---------+----------+-------+------+
|    1    |   1599   |  5.0  |  1   |
|    1    |   1201   |  5.0  |  2   |
|    1    |   1189   |  5.0  |  3   |
|    1    |   1122   |  5.0  |  4   |
|    1    |   814    |  5.0  |  5   |
|    2    |   1599   |  5.0  |  1   |
|    2    |   1201   |  5.0  |  2   |
|    2    |   1189   |  5.0  |  3   |
|    2    |   1122   |  5.0  |  4   |
|    2    |   814    |  5.0  |  5   |
|    3    |   1599   |  5.0  |  1   |
|    3    |   1201   |  5.0  |  2   |
|    3    |   1189   |  5.0  |  3   |
|    3    |   1122   |  5.0  |  4   |
|    3    |   814    |  5.0  |  5   |
|    4    |   1599   |  5.0  |  1   |
|    4    |   1201   |  5.0  |  2   |
|    4    |   1189   |  5.0  |  3   |
|    4    |   1122   |  5.0  |  4   |
|    4    |   814    |  5.0  |  5   |
|    5    |   1599   |  5.0  |  1   |
|    5    |   1201   |  5.0  |  2   |
|    5    |   1189   |  5.0  |  3   |
|    5    | 

## Collaborative filtering model: `pearson`

In [29]:
#Training the model
item_sim_model_pearson = tc.item_similarity_recommender.create(train_data, user_id='user_id', 
                                                       item_id='movie_id', target='rating', 
                                                       similarity_type='pearson')

#Making recommendations
item_sim_recomm_pearson = item_sim_model.recommend(users=[1,2,3,4,5],k=5)
item_sim_recomm_pearson.print_rows(num_rows=25)

+---------+----------+-------+------+
| user_id | movie_id | score | rank |
+---------+----------+-------+------+
|    1    |   1599   |  5.0  |  1   |
|    1    |   1201   |  5.0  |  2   |
|    1    |   1189   |  5.0  |  3   |
|    1    |   1122   |  5.0  |  4   |
|    1    |   814    |  5.0  |  5   |
|    2    |   1599   |  5.0  |  1   |
|    2    |   1201   |  5.0  |  2   |
|    2    |   1189   |  5.0  |  3   |
|    2    |   1122   |  5.0  |  4   |
|    2    |   814    |  5.0  |  5   |
|    3    |   1599   |  5.0  |  1   |
|    3    |   1201   |  5.0  |  2   |
|    3    |   1189   |  5.0  |  3   |
|    3    |   1122   |  5.0  |  4   |
|    3    |   814    |  5.0  |  5   |
|    4    |   1599   |  5.0  |  1   |
|    4    |   1201   |  5.0  |  2   |
|    4    |   1189   |  5.0  |  3   |
|    4    |   1122   |  5.0  |  4   |
|    4    |   814    |  5.0  |  5   |
|    5    |   1599   |  5.0  |  1   |
|    5    |   1201   |  5.0  |  2   |
|    5    |   1189   |  5.0  |  3   |
|    5    | 

## Evaluation

In [34]:
eval1 = tc.recommender.util.compare_models(test_data, [popularity_model, item_sim_model_pearson], 
                                   model_names=["popularity", "pearson"])

PROGRESS: Evaluate model popularity

Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    | 0.0005302226935312836  | 0.00010604453870625672 |
|   3    | 0.00035348179568752205 | 0.00010604453870625672 |
|   4    | 0.0002651113467656418  | 0.00010604453870625672 |
|   5    | 0.00021208907741251343 | 0.00010604453870625672 |
|   6    | 0.00017674089784376103 | 0.00010604453870625672 |
|   7    | 0.0003029843963035904  | 0.0002120890774125135  |
|   8    | 0.00026511134676564187 | 0.0002120890774125135  |
|   9    | 0.0002356545304583481  | 0.0002120890774125135  |
|   10   | 0.00021208907741251343 | 0.00021208907741251343 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.041764796943981

Pe

In [35]:
eval2 = tc.recommender.util.compare_models(test_data, [item_sim_model_cos, item_sim_model_pearson], 
                                   model_names=["cosine", "pearson"])


PROGRESS: Evaluate model cosine

Precision and recall summary statistics by cutoff
+--------+---------------------+----------------------+
| cutoff |    mean_precision   |     mean_recall      |
+--------+---------------------+----------------------+
|   1    | 0.08059384941675501 | 0.008059384941675511 |
|   2    | 0.07953340402969249 | 0.015906680805938496 |
|   3    | 0.07104984093319193 | 0.021314952279957573 |
|   4    | 0.06495227995758222 |  0.0259809119830329  |
|   5    | 0.06299045599151652 | 0.03149522799575826  |
|   6    | 0.06380346412159779 | 0.03828207847295866  |
|   7    | 0.06423269201636118 | 0.04496288441145274  |
|   8    | 0.06309650053022285 | 0.050477200424178116 |
|   9    | 0.06374455048898317 | 0.05737009544008484  |
|   10   |  0.0640509013785791 |  0.0640509013785791  |
+--------+---------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 3.362139685476231

Per User RMSE (best)
+---------+--------------------+-------+
| user_id |   

In [None]:
''