# Loading data

In [3]:
! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
! unzip ml-1m.zip -d .


--2016-10-17 14:09:06--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org... 128.101.34.146
Connecting to files.grouplens.org|128.101.34.146|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: 'ml-1m.zip.1'


2016-10-17 14:09:27 (279 KB/s) - 'ml-1m.zip.1' saved [5917549/5917549]

Archive:  ml-1m.zip
  inflating: ./ml-1m/movies.dat      
  inflating: ./ml-1m/ratings.dat     
  inflating: ./ml-1m/README          
  inflating: ./ml-1m/users.dat       


In [37]:
import pandas as pd
import numpy as np

In [10]:
# The MovieLens 1M files use a two-character '::' separator. Only pandas' python
# parser supports multi-character separators, so the engine is named explicitly
# instead of relying on the fallback (which emits a ParserWarning).
ratings = (
    pd.read_csv('./ml-1m/ratings.dat', sep='::', engine='python',
                names=['user', 'item', 'rating', 'timestamp'])
    # Raw timestamps are in seconds since the epoch.
    .assign(timestamp=lambda df: pd.to_datetime(df.timestamp, unit='s'))
)

# movies.dat contains accented titles stored as Latin-1 (ISO-8859-1); decoding it with
# the default UTF-8 codec fails under Python 3.
movies = pd.read_csv('./ml-1m/movies.dat', sep='::', engine='python', encoding='latin-1',
                     names=['item', 'title', 'genres'], index_col='item')
# Genres are '|'-separated; keep the first one listed as a single categorical feature.
movies['first_genre'] = movies.genres.str.split('|').str.get(0)

# See http://files.grouplens.org/datasets/movielens/ml-1m-README.txt for more details
users = pd.read_csv('./ml-1m/users.dat', sep='::', engine='python',
                    names=['user', 'gender', 'age', 'occupation', 'zipcode'], index_col='user')

  app.launch_new_instance()


In [11]:
# Preview the first rows of the movies table (indexed by item id).
movies.head(n=5)

Unnamed: 0_level_0,title,genres,first_genre
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Animation|Children's|Comedy,Animation
2,Jumanji (1995),Adventure|Children's|Fantasy,Adventure
3,Grumpier Old Men (1995),Comedy|Romance,Comedy
4,Waiting to Exhale (1995),Comedy|Drama,Comedy
5,Father of the Bride Part II (1995),Comedy,Comedy


In [12]:
# Preview the first rows of the users table (indexed by user id).
users.head(n=5)


Unnamed: 0_level_0,gender,age,occupation,zipcode
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [25]:
# Report the table dimensions as (rows, columns), then preview the first rows.
print(ratings.shape)
ratings.head(n=5)

(1000209, 4)


Unnamed: 0,user,item,rating,timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


## Sparsity analysis

In [5]:
# "Support" = number of distinct counterparts seen in the ratings:
# distinct users per item, and distinct items per user.
item_supports = ratings.groupby('item')['user'].nunique().to_frame('item_support')
user_supports = ratings.groupby('user')['item'].nunique().to_frame('user_support')

# Summary statistics of both supports side by side. Only the last expression of a cell
# is displayed, so the per-frame describe() calls that used to precede this were dropped.
pd.concat([user_supports.describe(), item_supports.describe()], axis=1)

Unnamed: 0,user_support,item_support
count,6040.0,3706.0
mean,165.597517,269.889099
std,192.747029,384.047838
min,20.0,1.0
25%,44.0,33.0
50%,96.0,123.5
75%,208.0,350.0
max,2314.0,3428.0


## Train/test split

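 * Ideally, the split would be time-based: train on older ratings and test on more recent ones
 * For the sake of simplicity, let's just sample ratings uniformly at random (this breaks the "no time machine" rule: the model may be trained on ratings made after the ones it is tested on)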

In [61]:
TEST_SIZE = 100000  # number of ratings held out for evaluation
SPLIT_SEED = 0      # fixed seed so the sampled split is reproducible

# Hold out a uniform random sample of ratings; everything else is training data.
test_ratings = ratings.sample(n=TEST_SIZE, random_state=SPLIT_SEED)
train_ratings_mask = ~ratings.index.isin(test_ratings.index)
train_ratings = ratings.loc[train_ratings_mask]

# The (user, item) pairs a model has to predict a rating for.
test_user_items = test_ratings[['user', 'item']]

# The two parts must partition `ratings`; this fails if the index has duplicate labels.
assert len(train_ratings) + len(test_ratings) == len(ratings)

print(train_ratings.shape)
print(test_ratings.shape)

(900209, 4)
(100000, 4)


## Evaluation function and 1st baseline

In [59]:
def rmse(predicted_ratings, ground_truth_ratings=None):
    """Root-mean-square error between predicted and ground-truth ratings.

    Parameters
    ----------
    predicted_ratings : pandas.DataFrame
        Frame with at least the columns ``user``, ``item`` and ``rating``.
    ground_truth_ratings : pandas.DataFrame, optional
        Frame with at least the columns ``user``, ``item`` and ``rating``.
        When omitted, the notebook-level ``test_ratings`` is used. It is looked up
        at call time, so re-running the train/test split cell is always honoured
        (a default bound in the signature would freeze whatever frame existed when
        this cell was executed).

    Returns
    -------
    float
        The RMSE over all ground-truth rows, or ``nan`` if at least one
        ground-truth (user, item) pair has no prediction.
    """
    if ground_truth_ratings is None:
        ground_truth_ratings = test_ratings

    # Left join on the ground truth: a pair without a prediction gets a NaN predicted
    # rating, which propagates through the average and makes the result NaN.
    joined_ratings = pd.merge(
        ground_truth_ratings,
        predicted_ratings,
        on=['user', 'item'], how='left', suffixes=('_ground_truth', '_predicted'))

    squared_errors = np.power(joined_ratings.rating_ground_truth - joined_ratings.rating_predicted, 2)

    return np.sqrt(np.average(squared_errors))

0.0

In [70]:
class AverageTrainingModel:
    """Baseline model that predicts the same constant rating for every (user, item) pair.

    The constant is held in ``average_rating``; ``train_model`` sets it to the mean
    rating of the training frame.
    """

    def __init__(self, average_rating):
        # Constant returned for every prediction.
        self.average_rating = average_rating

    @classmethod
    def train_model(cls, train_ratings):
        """Factory method: build a model from the mean of ``train_ratings.rating``."""
        return cls(train_ratings.rating.mean())

    def average_ratings_array(self, user_items):
        """Return an array holding ``average_rating`` once per row of ``user_items``."""
        n_rows = user_items.shape[0]
        return np.repeat(self.average_rating, n_rows)

    def predict(self, user_items):
        """Return a copy of ``user_items`` with a ``rating`` column set to the average rating."""
        constant_ratings = self.average_ratings_array(user_items)
        return user_items.assign(rating=constant_ratings)

In [71]:
# First baseline: predict the mean training rating for every test (user, item) pair.
average_model = AverageTrainingModel.train_model(train_ratings)
average_ratings = average_model.predict(test_user_items)

# Pass the ground truth explicitly so the score always refers to the current
# test_ratings, not to whatever frame existed when rmse was defined.
print(rmse(predicted_ratings=average_ratings, ground_truth_ratings=test_ratings))
average_ratings.head()

1.11416005105


Unnamed: 0,user,item,rating
324271,1922,2094,3.581826
818637,4918,2808,3.581826
148677,957,1660,3.581826
778790,4653,914,3.581826
525489,3245,3324,3.581826
