In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

import graphlab


In [2]:
## load the three ml-latest-small CSV files as pandas DataFrames

# Paths are relative to the notebook's working directory; the files are read
# in the order ratings, tags, movies.
ratings, tags, movies = map(
    pd.read_csv,
    (
        'ml-latest-small/ratings.csv',
        'ml-latest-small/tags.csv',
        'ml-latest-small/movies.csv',
    ),
)


In [3]:
## inspect the ratings table: print its (rows, columns) shape, then show the first rows
print(ratings.shape)
ratings.head(n=5)


(100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
## inspect the movies table: print its (rows, columns) shape, then show the first rows
print(movies.shape)
movies.head(n=5)

(9125, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Data Preprocessing


In [5]:
## removing rare movies

# A movie counts as "rare" when it has at most this many ratings.
RARE_RATING_THRESHOLD = 5

# One row per movieId found in `ratings`; every other column holds the
# per-movie count of non-null values and is renamed with a `_count` suffix
# (so the number of ratings ends up in `rating_count`).
rare_movies = (
    ratings
    .groupby('movieId')
    .count()
    .add_suffix('_count')
    .reset_index()
)
rare_movies = rare_movies[rare_movies['rating_count'] <= RARE_RATING_THRESHOLD]
print('number of rare movies:  {}'.format(len(rare_movies)))

# NOTE(review): a movie with no rows at all in `ratings` never appears in
# `rare_movies`, so it is kept by the filter below -- confirm that is intended.
# .copy() makes the filtered frame independent of the original, so adding or
# overwriting columns on `movies` later does not raise SettingWithCopyWarning.
movies = movies[~movies['movieId'].isin(rare_movies['movieId'])].copy()
movies.shape

number of rare movies:  5967


(3158, 3)

In [6]:
# Only ratings at or above this value are kept.
MIN_KEPT_RATING = 4

# Keep a rating row only when it is high enough AND its movie is not listed in
# `rare_movies`. Both conditions are evaluated on the same frame, which selects
# the same rows (in the same order) as applying the two filters one after the other.
is_high_rating = ratings['rating'] >= MIN_KEPT_RATING
is_rare_movie = ratings['movieId'].isin(rare_movies['movieId'])

# .copy() detaches the result from the unfiltered frame so that assigning to a
# column of `ratings` afterwards does not raise SettingWithCopyWarning.
ratings = ratings[is_high_rating & ~is_rare_movie].copy()
ratings.shape

(46819, 4)

In [7]:
## Extract year, title, and genre

# Titles are expected to look like "Some Name (1995)". Strip surrounding
# whitespace first and only split off the year when the title really ends in
# "(YYYY)"; otherwise fixed-offset slicing would cut into the name and produce
# a garbage year.
clean_title = movies['title'].str.strip()
has_year = clean_title.str.contains(r'\(\d{4}\)$', na=False)

# Year: the four characters inside the trailing parentheses (kept as a string),
# missing (NaN) when the title carries no year.
movies['year'] = clean_title.str[-5:-1].where(has_year)
# Title: drop the trailing " (YYYY)" (7 characters) when present, else keep as is.
movies['title'] = clean_title.str[:-7].where(has_year, clean_title)
# Genres: "A|B|C" -> ['A', 'B', 'C'].
movies['genres'] = movies['genres'].str.split('|')

# NOTE(review): astype(datetime) does not appear to parse the epoch-second
# integers into datetimes (the values look unchanged afterwards). If real
# timestamps are wanted, pd.to_datetime(ratings['timestamp'], unit='s') is the
# usual conversion -- confirm the later SFrame conversion accepts that dtype
# before switching.
ratings['timestamp'] = ratings['timestamp'].astype(datetime)

In [8]:
# Show the first rows of `movies` to check the new year / title / genres columns.
movies.head(n=5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [9]:
## Prepare training and testing sets

# Convert the pandas DataFrame into a GraphLab SFrame (rebinding `ratings`),
# then split it into a training part and a testing part using the
# `userId` / `movieId` columns as the user and item identifiers.
ratings = graphlab.SFrame(ratings)
training_data, testing_data = graphlab.recommender.util.random_split_by_user(
    ratings,
    user_id='userId',
    item_id='movieId',
)


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1512691070.log


This non-commercial license of GraphLab Create for academic use is assigned to nanlee_89@yahoo.com and will expire on December 07, 2018.


### Popularity Model

In [10]:
# Popularity recommender fitted on the training split, using `rating` as the target.
# NOTE(review): `ratings` was filtered to high ratings only before this point,
# so the target has very little spread -- confirm a target is wanted here.
popularity_model = graphlab.popularity_recommender.create(
    training_data,
    user_id='userId',
    item_id='movieId',
    target='rating',
)

In [11]:
# Top-5 recommendations from the popularity model for user ids 1 through 5.
popularity_model.recommend(k=5, users=list(range(1, 6)))

userId,movieId,score,rank
1,4835,5.0,1
1,2852,5.0,2
1,2269,5.0,3
1,326,5.0,4
1,3865,5.0,5
2,4835,5.0,1
2,2852,5.0,2
2,2269,5.0,3
2,326,5.0,4
2,3865,5.0,5


### Collaborative Filtering Model - Item Similarity Model

In [12]:
### Train Recommender Model ###
# No target column is passed, only user/item interactions.
# NOTE(review): recommender.create chooses the model type itself; the variable
# name assumes it returns an item-similarity model -- confirm.
item_similarity_model = graphlab.recommender.create(
    training_data,
    user_id='userId',
    item_id='movieId',
)

In [13]:
# Top-5 recommendations from the second model for user ids 1 through 5.
item_similarity_model.recommend(k=5, users=list(range(1, 6)))

userId,movieId,score,rank
1,2020,0.269305020571,1
1,2243,0.260135143995,2
1,2917,0.256366729736,3
1,2352,0.246626168489,4
1,2313,0.245161294937,5
2,588,0.0906161398723,1
2,364,0.076685482058,2
2,593,0.0686295566888,3
2,377,0.067112456108,4
2,47,0.0669275542785,5


### Model Evaluation

In [14]:
# Evaluate both recommenders on the testing split; the models are reported in
# list order (popularity model first, then the item-similarity model).
model_performance = graphlab.compare(
    testing_data,
    [popularity_model, item_similarity_model],
)

PROGRESS: Evaluate model M0

Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    |        0.0        |        0.0        |
|   2    |        0.0        |        0.0        |
|   3    | 0.000510464522716 | 1.34332769136e-05 |
|   4    | 0.000382848392037 | 1.34332769136e-05 |
|   5    | 0.000918836140888 | 0.000375012313837 |
|   6    | 0.000765696784074 | 0.000375012313837 |
|   7    | 0.000656311529206 | 0.000375012313837 |
|   8    | 0.000574272588055 | 0.000375012313837 |
|   9    | 0.000680619363621 | 0.000465094288434 |
|   10   | 0.000612557427259 | 0.000465094288434 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model M1

Precision and recall summary statistics by cutoff
+--------+----------------+-----------------+
| cutoff | mean_precision |   mean_recall   |
+--------+---