In [9]:
! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ../data/ml-1m.zip
! unzip ../data/ml-1m.zip -d ../data

--2017-04-22 15:56:41--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org... 128.101.34.146
Connecting to files.grouplens.org|128.101.34.146|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: '../data/ml-1m.zip'


2017-04-22 15:56:47 (1.01 MB/s) - '../data/ml-1m.zip' saved [5917549/5917549]

Archive:  ../data/ml-1m.zip
   creating: ../data/ml-1m/
  inflating: ../data/ml-1m/movies.dat  
  inflating: ../data/ml-1m/ratings.dat  
  inflating: ../data/ml-1m/README    
  inflating: ../data/ml-1m/users.dat  


In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [44]:
import os

DATA_DIR = '../data/ml-1m/'

# The three tables are read without a header row (column names are supplied
# here) and use '::' as the field separator; a multi-character separator needs
# the python parsing engine.
ratings = (
    pd.read_csv(os.path.join(DATA_DIR, 'ratings.dat'),
                engine='python', sep='::',
                names=['user', 'item', 'rating', 'timestamp'])
    # The raw timestamp is an integer number of seconds since the epoch.
    .assign(timestamp=lambda df: pd.to_datetime(df.timestamp, unit='s'))
)

movies = (
    pd.read_csv(os.path.join(DATA_DIR, 'movies.dat'),
                engine='python', sep='::',
                names=['item', 'title', 'genres'])
    # 'A|B|C' -> ['A', 'B', 'C'] so each movie carries a list of genres.
    .assign(genres=lambda df: df.genres.str.split('|'))
    # Keep 'item' both as the index and as a regular column.
    .set_index('item', drop=False)
)

# See http://files.grouplens.org/datasets/movielens/ml-1m-README.txt for more details
users = (
    pd.read_csv(os.path.join(DATA_DIR, 'users.dat'),
                engine='python', sep='::',
                names=['user', 'gender', 'age', 'occupation', 'zipcode'])
    # Keep 'user' both as the index and as a regular column.
    .set_index('user', drop=False)
)

## Features

In [62]:
from IPython.display import display, HTML

# Stylesheet that sets a column flex direction on `.output` elements.
output_css = """
.output {
    flex-direction: column;
}
"""

# Wrap the stylesheet in a <style> tag and return it as the cell's rich output.
# (The original referenced an undefined name `CSS` here, which raises NameError
# on a fresh kernel.)
HTML('<style>{}</style>'.format(output_css))

In [92]:
from sklearn import preprocessing
from itertools import chain

def columns_to_key_feature_pairs(row, key_column, feature_columns):
    """Return one (key, 'column=value') pair per requested feature column.

    row: mapping-like record supporting ``row[name]`` lookups.
    key_column: name of the field whose value identifies the record.
    feature_columns: iterable of field names to turn into feature labels.
    """
    pairs = []
    for name in feature_columns:
        label = '{}={}'.format(name, row[name])
        pairs.append((row[key_column], label))
    return pairs

def array_column_to_key_feature_pairs(row, key_column, array_column):
    """Return one (key, 'array_column=value') pair per element of a list-valued field.

    row: mapping-like record supporting ``row[name]`` lookups.
    key_column: name of the field whose value identifies the record.
    array_column: name of the field holding an iterable of values.
    """
    pairs = []
    for value in row[array_column]:
        label = u'{}={}'.format(array_column, value)
        pairs.append((row[key_column], label))
    return pairs

# Columns of `users` that each become one 'column=value' feature label.
feature_columns = ['user', 'gender', 'occupation', 'zipcode']

# Long-format table: one row per (user, feature label), indexed by user.
user_features = pd.DataFrame.from_records(
    data=(
        pair
        for _, row in users.iterrows()
        for pair in columns_to_key_feature_pairs(
            row, key_column='user', feature_columns=feature_columns)
    ),
    index='user',
    columns=['user', 'feature_name'],
)

# Same layout for movies: the item id itself plus one label per genre.
item_features = pd.DataFrame.from_records(
    data=(
        pair
        for _, row in movies.iterrows()
        for pair in (
            columns_to_key_feature_pairs(row, key_column='item', feature_columns=['item'])
            + array_column_to_key_feature_pairs(row, key_column='item', array_column='genres')
        )
    ),
    index='item',
    columns=['item', 'feature_name'],
)

# A single encoder is fitted on user and item labels together, so both tables
# share one integer id space.
features_encoder = preprocessing.LabelEncoder()
features_encoder.fit(
    np.hstack([user_features.feature_name, item_features.feature_name]))

user_features = user_features.assign(
    feature=features_encoder.transform(user_features.feature_name))
item_features = item_features.assign(
    feature=features_encoder.transform(item_features.feature_name))

display(user_features.head(10))
display(item_features.head(10))

Unnamed: 0_level_0,feature_name,feature
user,Unnamed: 1_level_1,Unnamed: 2_level_1
1,user=1,3924
1,gender=F,0
1,occupation=10,3905
1,zipcode=48067,11552
2,user=2,5035
2,gender=M,1
2,occupation=16,3911
2,zipcode=70072,12212
3,user=3,6146
3,gender=M,1


Unnamed: 0_level_0,feature_name,feature
item,Unnamed: 1_level_1,Unnamed: 2_level_1
1,item=1,20
1,genres=Animation,4
1,genres=Children's,5
1,genres=Comedy,6
2,item=2,1075
2,genres=Adventure,3
2,genres=Children's,5
2,genres=Fantasy,10
3,item=3,2185
3,genres=Comedy,6


In [93]:
# Demo batch: the first two ratings, re-indexed by a 0-based sample id.
batch_samples = (
    ratings[['user', 'item', 'rating']]
    .head(2)
    .assign(sample_id=lambda df: np.arange(df.shape[0]))
    .set_index('sample_id')
)

# Join each sample with its user's features and with its item's features, then
# stack both results so every row is one (sample, feature) pair.
batch_samples_with_features = pd.concat(
    [
        batch_samples.merge(user_features, left_on='user', right_index=True),
        batch_samples.merge(item_features, left_on='item', right_index=True),
    ],
    axis=0,
).sort_index()

batch_samples_with_features

Unnamed: 0_level_0,user,item,rating,feature_name,feature
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,1193,5,user=1,3924
0,1,1193,5,gender=F,0
0,1,1193,5,occupation=10,3905
0,1,1193,5,zipcode=48067,11552
0,1,1193,5,item=1193,232
0,1,1193,5,genres=Drama,9
1,1,661,3,user=1,3924
1,1,661,3,gender=F,0
1,1,661,3,occupation=10,3905
1,1,661,3,zipcode=48067,11552


In [94]:
# Gather the encoded feature ids of each sample into one array per sample id.
(
    batch_samples_with_features
    .groupby(by=batch_samples_with_features.index)
    ['feature']
    .apply(np.array)
)

sample_id
0            [3924, 0, 3905, 11552, 232, 9]
1    [3924, 0, 3905, 11552, 3536, 4, 5, 13]
Name: feature, dtype: object

In [116]:
import scipy.sparse as sp

def to_sparse_indicators(featurized_batch_df, n_features=None):
    """Build a sparse sample-by-feature indicator matrix from a long-format batch.

    Parameters
    ----------
    featurized_batch_df : DataFrame
        One row per (sample, feature) pair. The index values are used as row
        positions and the ``feature`` column as column positions.
    n_features : int, optional
        Number of columns of the result. Pass the size of the full encoded
        feature space so that every batch has the same width and column slices
        for high feature ids are never cut short. When omitted, the shape is
        inferred from the largest row/column index present in the batch.

    Returns
    -------
    scipy.sparse.csr_matrix
        Holds a 1 for every (sample, feature) pair listed in the input;
        repeated pairs are summed when the matrix is built.
    """
    sample_ids_as_row_indexes = featurized_batch_df.index.values
    encoded_feature_as_col_indexes = featurized_batch_df.feature.values

    shape = None
    if n_features is not None:
        # Rows are still sized from the data; only the width is pinned.
        n_samples = (int(sample_ids_as_row_indexes.max()) + 1
                     if len(sample_ids_as_row_indexes) else 0)
        shape = (n_samples, int(n_features))

    return sp.csr_matrix(
        (np.ones_like(sample_ids_as_row_indexes),
         (sample_ids_as_row_indexes, encoded_feature_as_col_indexes)),
        shape=shape)

# Encode the demo batch as a sparse matrix: rows are samples, columns are
# encoded feature ids.
# NOTE(review): the matrix width is not specified here, so it is inferred from
# the largest feature id present in this batch and can be narrower than the
# full encoded feature space.
batch_sparse = to_sparse_indicators(batch_samples_with_features)

batch_sparse

<2x11553 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [109]:
# Smallest and largest encoded id per feature type, where the type is the part
# of the feature label before '=' (e.g. 'gender', 'genres').
feature_type_to_range = (
    pd.concat([user_features, item_features])
    .assign(type=lambda df: df.feature_name.str.split('=').str[0])
    .groupby('type')
    .feature
    # String names select the groupby min/max reductions directly; passing the
    # Python builtins instead triggers a deprecation warning in recent pandas.
    .aggregate(['min', 'max'])
)

feature_type_to_range

Unnamed: 0_level_0,min,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1
gender,0,1
genres,2,19
item,20,3902
occupation,3903,3923
user,3924,9963
zipcode,9964,13402


In [176]:
# Inclusive encoded-id ranges of the two feature types to cross.
left_feature_start, left_feature_stop = feature_type_to_range.loc['gender'].values
right_feature_start, right_feature_stop = feature_type_to_range.loc['genres'].values

# Column blocks of the batch matrix holding only those two feature types.
l = batch_sparse[:, left_feature_start:left_feature_stop+1]
r = batch_sparse[:, right_feature_start:right_feature_stop+1]

# Per-sample outer product: row i crosses sample i's left features with
# sample i's right features. A full Kronecker product of the two blocks has
# one row for every (sample i, sample j) combination, so taking its first rows
# (`[:2:]`) mixes sample 0's left features with other samples' right features;
# building the product row by row keeps each sample paired with itself.
batch_interactions = sp.vstack(
    [sp.kron(l[i], r[i], format='csr') for i in range(l.shape[0])],
    format='csr')

batch_interactions

<2x36 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [182]:
from itertools import product

# Name every interaction column: all (left label, right label) combinations,
# with the right label varying fastest.
interaction_names = [
    (left_name, right_name)
    for left_name in features_encoder.classes_[left_feature_start:left_feature_stop+1]
    for right_name in features_encoder.classes_[right_feature_start:right_feature_stop+1]
]

# Names of the interactions that are active for the second sample of the batch.
np.take(np.array(interaction_names), batch_interactions[1].nonzero()[1], axis=0)

array([['gender=F', 'genres=Animation'],
       ['gender=F', "genres=Children's"],
       ['gender=F', 'genres=Musical']], 
      dtype='<U18')