# Loading data

In [None]:
! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
! unzip ml-1m.zip -d .

In [1]:
import pandas as pd
import numpy as np

In [2]:
ratings = (pd.read_csv('./ml-1m/ratings.dat', engine='python', sep='::', names=['user', 'item', 'rating', 'timestamp'])
    .assign(timestamp=lambda df:pd.to_datetime(df.timestamp * 1000000000))
          )

movies = (pd.read_csv('./ml-1m/movies.dat', engine='python', sep='::', names=['item', 'title', 'genres'])
          .assign(genres=lambda df:df.genres.str.split('|').values)
          .set_index('item', drop=False))

# See http://files.grouplens.org/datasets/movielens/ml-1m-README.txt for more details
users = pd.read_csv('./ml-1m/users.dat', engine='python', sep='::', 
                    names=['user', 'gender', 'age', 'occupation', 'zipcode'])\
    .set_index('user', drop=False)

## User-item ratings to features

In [3]:
from sklearn import preprocessing
# flatmap equivalent
from itertools import chain

def columns_to_key_feature_pairs(row, key_column, feature_columns):
    return [(row[key_column], '{}={}'.format(column, row[column])) for column in feature_columns]

def array_column_to_key_feature_pairs(row, key_column, array_column):
    return [(row[key_column], u'{}={}'.format(array_column, value)) for value in row[array_column]]

feature_columns=['user', 'gender', 'occupation', 'zipcode']

user_features = pd.DataFrame.from_records(
    data=chain.from_iterable(
        columns_to_key_feature_pairs(row, key_column='user', feature_columns=feature_columns)
        for _, row in users.iterrows()),
    columns=['user', 'feature'])

print(user_features.shape)
user_features.head(10)

(24160, 2)


Unnamed: 0,user,feature
0,1,user=1
1,1,gender=F
2,1,occupation=10
3,1,zipcode=48067
4,2,user=2
5,2,gender=M
6,2,occupation=16
7,2,zipcode=70072
8,3,user=3
9,3,gender=M


In [4]:
item_features = pd.DataFrame.from_records(
    data=chain.from_iterable(
        columns_to_key_feature_pairs(row, key_column='item', feature_columns=['item']) +\
            array_column_to_key_feature_pairs(row, key_column='item', array_column='genres')
        for _, row in movies.iterrows()), columns=['item', 'feature'])

print(item_features.shape)
item_features.head()

(10291, 2)


Unnamed: 0,item,feature
0,1,item=1
1,1,genres=Animation
2,1,genres=Children's
3,1,genres=Comedy
4,2,item=2


In [5]:
from sklearn import preprocessing

features_encoder = preprocessing.LabelEncoder()
features_encoder.fit(np.concatenate([user_features.feature.values, item_features.feature.values]))

item_features = item_features.assign(encoded_feature=lambda df: features_encoder.transform(df.feature))
user_features = user_features.assign(encoded_feature=lambda df: features_encoder.transform(df.feature))

In [6]:
user_features.head()

Unnamed: 0,user,feature,encoded_feature
0,1,user=1,3924
1,1,gender=F,0
2,1,occupation=10,3905
3,1,zipcode=48067,11552
4,2,user=2,5035


In [7]:
keyed_ratings = ratings.assign(sample_id=lambda df: np.arange(df.shape[0]))

augmented_ratings = (pd.concat([
    pd.merge(keyed_ratings, user_features, on='user', how='inner'),
    pd.merge(keyed_ratings, item_features, on='item', how='inner')],
    axis=0).sort_values('sample_id'))

print(augmented_ratings.shape)
augmented_ratings.head(10)

(7102860, 7)


Unnamed: 0,user,item,rating,timestamp,sample_id,feature,encoded_feature
0,1,1193,5,2000-12-31 22:12:40,0,user=1,3924
1,1,1193,5,2000-12-31 22:12:40,0,gender=F,0
2,1,1193,5,2000-12-31 22:12:40,0,occupation=10,3905
3,1,1193,5,2000-12-31 22:12:40,0,zipcode=48067,11552
1,1,1193,5,2000-12-31 22:12:40,0,genres=Drama,9
0,1,1193,5,2000-12-31 22:12:40,0,item=1193,232
3452,1,661,3,2000-12-31 22:35:09,1,genres=Children's,5
3453,1,661,3,2000-12-31 22:35:09,1,genres=Musical,13
3451,1,661,3,2000-12-31 22:35:09,1,genres=Animation,4
3450,1,661,3,2000-12-31 22:35:09,1,item=661,3536


In [8]:
augmented_ratings.head(100).groupby('sample_id').encoded_feature.apply(list).head()

sample_id
0            [3924, 0, 3905, 11552, 9, 232]
1    [5, 13, 4, 3536, 3905, 3924, 0, 11552]
2      [3905, 11552, 3810, 3924, 13, 15, 0]
3           [2639, 11552, 3905, 0, 3924, 9]
4     [1470, 4, 6, 5, 3924, 0, 3905, 11552]
Name: encoded_feature, dtype: object

In [None]:
augmented_ratings = (pd.concat([
    pd.merge(keyed_ratings, user_features, on='user', how='inner'),
    pd.merge(keyed_ratings, item_features, on='item', how='inner')],
    axis=0).sort_values('sample_id'))

print(augmented_ratings.shape)
augmented_ratings.head(10)

## Train/test split

 * Ideally time based split
 * For the sake of simplicity, let's just sample ratings uniformly (breaking the time machine rule)

In [10]:
test = ratings.sample(n=100000, random_state=0)
train_ratings_mask = ~ratings.index.isin(test.index)
train = ratings.loc[train_ratings_mask]

test_user_items = test[['user', 'item']]

print(train.shape)
print(test.shape)

test.head()

(900209, 4)
(100000, 4)


Unnamed: 0,user,item,rating,timestamp
324271,1922,2094,4,2000-11-20 04:34:27
818637,4918,2808,1,2000-07-08 19:29:05
148677,957,1660,4,2000-11-25 05:28:13
778790,4653,914,5,2000-11-29 21:22:43
525489,3245,3324,1,2000-09-07 06:33:31


## Evaluation function

In [11]:
def rmse(predicted_ratings, ground_truth_ratings=test):

    # predicted rating will be nan if no prediction => returning nan as a result if not all test ratings provided
    joined_ratings = pd.merge(
        ground_truth_ratings,
        predicted_ratings, 
        on=['user', 'item'], how='left', suffixes=['_ground_truth', '_predicted'])

    squared_errors = np.power(joined_ratings.rating_ground_truth - joined_ratings.rating_predicted, 2)
    
    return np.sqrt(np.average(squared_errors))

## Linear model based on featurized user-item ratings

In [159]:
from scipy import sparse


augmented_ratings_in_train = augmented_ratings.sample_id.isin(train.index).values
# features
features_mask = np.ones_like(augmented_ratings_in_train)
features_mask = augmented_ratings.feature.str.split('=').str[0].isin(['user', 'item', 'genres']).values
augmented_train = augmented_ratings[augmented_ratings_in_train & features_mask]

def to_coo(row_indexes, col_indexes, values, shape=None, dtype=np.float64):
    return sparse.coo_matrix((values, (row_indexes, col_indexes)), shape=shape, dtype=dtype)

condensed_sample_ids = preprocessing.LabelEncoder().fit_transform(augmented_train.sample_id)
train_features_matrix = to_coo(
    row_indexes=condensed_sample_ids, 
    col_indexes=augmented_train.encoded_feature.values, 
    values=np.ones_like(augmented_train.sample_id))

train_features_matrix

<900209x9964 sparse matrix of type '<class 'numpy.float64'>'
	with 3692342 stored elements in COOrdinate format>

In [166]:
from sklearn.linear_model import SGDRegressor, LinearRegression

# m = SGDRegressor(fit_intercept=True, n_iter=5, alpha=1.0, penalty='elasticnet', l1_ratio=.5, verbose=True)
# m = LinearRegression(fit_intercept=True)
m = SGDRegressor(fit_intercept=True, n_iter=10, alpha=.0001, penalty='elasticnet', l1_ratio=0, verbose=True)
m.fit(X=train_features_matrix, y=train.rating)

-- Epoch 1
Norm: 10.87, NNZs: 9750, Bias: 1.171217, T: 900209, Avg. loss: 0.882533
Total training time: 0.18 seconds.
-- Epoch 2
Norm: 13.83, NNZs: 9750, Bias: 1.565152, T: 1800418, Avg. loss: 0.755815
Total training time: 0.38 seconds.
-- Epoch 3
Norm: 15.62, NNZs: 9750, Bias: 1.795024, T: 2700627, Avg. loss: 0.688794
Total training time: 0.64 seconds.
-- Epoch 4
Norm: 16.83, NNZs: 9750, Bias: 1.946182, T: 3600836, Avg. loss: 0.646610
Total training time: 0.86 seconds.
-- Epoch 5
Norm: 17.71, NNZs: 9750, Bias: 2.053743, T: 4501045, Avg. loss: 0.617357
Total training time: 1.10 seconds.
-- Epoch 6
Norm: 18.38, NNZs: 9750, Bias: 2.134022, T: 5401254, Avg. loss: 0.595730
Total training time: 1.32 seconds.
-- Epoch 7
Norm: 18.93, NNZs: 9750, Bias: 2.197272, T: 6301463, Avg. loss: 0.579000
Total training time: 1.55 seconds.
-- Epoch 8
Norm: 19.39, NNZs: 9750, Bias: 2.248241, T: 7201672, Avg. loss: 0.565612
Total training time: 1.82 seconds.
-- Epoch 9
Norm: 19.77, NNZs: 9750, Bias: 2.29060

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0, learning_rate='invscaling',
       loss='squared_loss', n_iter=10, penalty='elasticnet', power_t=0.25,
       random_state=None, shuffle=True, verbose=True, warm_start=False)

In [167]:
augmented_ratings_in_test = augmented_ratings.sample_id.isin(test.index)

sample_id_encoder = preprocessing.LabelEncoder()
augmented_test = augmented_ratings[augmented_ratings_in_test & features_mask]\
    .assign(condensed_sample_id=lambda df: sample_id_encoder.fit_transform(df.sample_id))

test_features_matrix = to_coo(
    row_indexes=augmented_test.condensed_sample_id, 
    col_indexes=augmented_test.encoded_feature.values, 
    values=np.ones_like(augmented_test.sample_id))

sampled_predicted_ratings = pd.Series(
    index=sample_id_encoder.classes_,
    data=m.predict(X=test_features_matrix)).to_frame('rating')

user_item_predicted_ratings = sampled_predicted_ratings.join(test_user_items)

rmse(user_item_predicted_ratings)

0.96128110463891214

## Model inspection

In [130]:
m.intercept_

3.5818259981848661

In [107]:
m.coef_[features_encoder.transform(['user=1'])]

array([ 0.10665691])

In [128]:
from scipy.stats import rankdata

coefficient_ranks = rankdata(-m.coef_, method='max')
top_coefficients_indexes = np.where(coefficient_ranks <=10)[0]
features_encoder.inverse_transform(top_coefficients_indexes)

array(['item=2503', 'item=3233', 'item=3382', 'item=557', 'item=578',
       'item=787', 'item=989', 'user=2155', 'user=3902', 'user=46'], dtype=object)

In [125]:
train.query('item == 3382')

Unnamed: 0,user,item,rating,timestamp
883623,5334,3382,5,2000-06-12 07:49:19


In [123]:
movies[movies.title.str.startswith('Shawshank')]

Unnamed: 0_level_0,item,title,genres
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
318,318,"Shawshank Redemption, The (1994)",[Drama]


In [127]:
train.query('item == 318').mean()

user      3048.568227
item       318.000000
rating       4.551793
dtype: float64

In [129]:
m.coef_[features_encoder.transform(['item=318'])]

array([ 1.09760585])