Collaborative, content-based and hybrid filtering for GoodReads dataset.

Ideas from https://github.com/jfkirk/tensorrec/blob/master/examples/getting_started.py

In [1]:
import tensorrec

In [4]:
from collections import defaultdict
import csv
import numpy
import random
from scipy import sparse
from sklearn.preprocessing import MultiLabelBinarizer

import tensorrec

import logging
logging.getLogger().setLevel(logging.INFO)

# Colaborative filtering  

In [5]:
print('Loading ratings')
with open('ratings.csv', 'r') as ratings_file:
    ratings_file_reader = csv.reader(ratings_file)
    raw_ratings = list(ratings_file_reader)
raw_ratings_header = raw_ratings.pop(0)

Loading ratings


In [6]:
len(raw_ratings)

5976479

In [7]:
# Shuffle the ratings and split them in to train/test sets 80%/20%
random.shuffle(raw_ratings)  # Shuffles the list in-place
cutoff = int(.8 * len(raw_ratings))
train_ratings = raw_ratings[:cutoff]
test_ratings = raw_ratings[cutoff:]
print("{} train ratings, {} test ratings".format(len(train_ratings), len(test_ratings)))

4781183 train ratings, 1195296 test ratings


In [8]:
# Iterate through the input to map MovieLens IDs to new internal IDs
# The new internal IDs will be created by the defaultdict on insertion
books_to_internal_user_ids = defaultdict(lambda: len(books_to_internal_user_ids))
books_to_internal_item_ids = defaultdict(lambda: len(books_to_internal_item_ids))
for row in raw_ratings:
    row[0] = books_to_internal_user_ids[int(row[0])]
    row[1] = books_to_internal_item_ids[int(row[1])]
    row[2] = float(row[2])
n_users = len(books_to_internal_user_ids)
n_items = len(books_to_internal_item_ids)

In [9]:
# This method converts a list of (user, item, rating, time) to a sparse matrix
def interactions_list_to_sparse_matrix(interactions):
    users_column, items_column, ratings_column = zip(*interactions)
    return sparse.coo_matrix((ratings_column, (users_column, items_column)),
shape=(n_users, n_items))

In [10]:
# Create sparse matrices of interaction data
sparse_train_ratings = interactions_list_to_sparse_matrix(train_ratings)
sparse_test_ratings = interactions_list_to_sparse_matrix(test_ratings)

# Construct indicator features for users and items
user_indicator_features = sparse.identity(n_users)
item_indicator_features = sparse.identity(n_items)

In [11]:
# Build a matrix factorization collaborative filter model
cf_model = tensorrec.TensorRec(n_components=5)

# Fit the collaborative filter model
print("Training collaborative filter")

cf_model.fit(interactions=sparse_train_ratings,
             user_features=user_indicator_features,
item_features=item_indicator_features)

Training collaborative filter


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [12]:
predictions = cf_model.predict(user_features=user_indicator_features,
                            item_features=item_indicator_features )

In [13]:
predictions

array([[4.4641094, 4.122511 , 4.95325  , ..., 3.0775526, 2.9214177,
        2.9621034],
       [4.233461 , 3.932898 , 4.9445686, ..., 2.8686118, 2.7207112,
        2.7786856],
       [3.507633 , 3.1823165, 3.872143 , ..., 2.3151288, 2.1689658,
        2.2503536],
       ...,
       [2.5142386, 2.6790123, 3.4420972, ..., 2.0793655, 1.9218086,
        1.9502971],
       [2.9340856, 3.0414152, 3.7331324, ..., 2.3826573, 2.2201478,
        2.239263 ],
       [2.6138353, 2.6591885, 3.4123716, ..., 2.0277312, 1.8762405,
        1.9223231]], dtype=float32)

In [14]:
# Create sets of train/test interactions that are only ratings >= 4.0
sparse_train_ratings_4plus = sparse_train_ratings.multiply(sparse_train_ratings >= 4.0)
sparse_test_ratings_4plus = sparse_test_ratings.multiply(sparse_test_ratings >= 34.0)


# This method consumes item ranks for each user and prints out recall@10 train/test metrics
def check_results(ranks):
    train_recall_at_10 = tensorrec.eval.recall_at_k(
        test_interactions=sparse_train_ratings_4plus,
        predicted_ranks=ranks,
        k=10
    ).mean()
    test_recall_at_10 = tensorrec.eval.recall_at_k(
        test_interactions=sparse_test_ratings_4plus,
        predicted_ranks=ranks,
        k=10
    ).mean()
    print("Recall at 10: Train: {:.4f} Test: {:.4f}".format(train_recall_at_10,
                                                            test_recall_at_10))

In [15]:
print("Matrix factorization collaborative filter:")
predicted_ranks = cf_model.predict_rank(user_features=user_indicator_features,
                                        item_features=item_indicator_features)
check_results(predicted_ranks)


Matrix factorization collaborative filter:


KeyboardInterrupt: 

In [15]:
# Let's try a new loss function: WMRB


###
print("Training collaborative filter with WMRB loss")
# ranking_cf_model = tensorrec.TensorRec(n_components=5,
#                                        loss_graph=tensorrec.loss_graphs.WMRBLossGraph())
# ranking_cf_model.fit(interactions=sparse_train_ratings_3plus,
#                      user_features=user_indicator_features,
#                      item_features=item_indicator_features,
#                      n_sampled_items=int(n_items * .01))

# Check the results of the WMRB MF CF model
print("WMRB matrix factorization collaborative filter:")
# predicted_ranks = ranking_cf_model.predict_rank(user_features=user_indicator_features,
#                                                 item_features=item_indicator_features)
check_results(predicted_ranks)

Training collaborative filter with WMRB loss
WMRB matrix factorization collaborative filter:
Recall at 10: Train: 0.0497 Test: 0.0469


# Content-based filtering

In [16]:
# To improve the recommendations, lets read in the movie genres
print('Loading movie metadata')
with open('books_features_no_authors.csv', 'r') as movies_file:
    movies_file_reader = csv.reader(movies_file)
    raw_movie_metadata = list(movies_file_reader)
raw_movie_metadata_header = raw_movie_metadata.pop(0)

Loading movie metadata


In [15]:
raw_movie_metadata

[['0.9976108308999205',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['0.9946907353331564',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['0.9968144411998938',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['0.9848685956994956',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['0.9755773825325194',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['0.9986726838332891',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0

In [17]:
from scipy import sparse
import pandas as pd
b_feat = pd.read_csv("books_features_no_authors.csv")
books_features = sparse.coo_matrix(b_feat)
n_features = b_feat.shape[1]

In [18]:
books_features.shape

(10000, 22)

In [18]:
n_features

22

In [17]:
b_feat.head()

Unnamed: 0,year_scaled,lang_ara,lang_dan,lang_en,lang_fil,lang_fre,lang_ger,lang_ind,lang_ita,lang_jpn,...,lang_nor,lang_per,lang_pol,lang_por,lang_rum,lang_rus,lang_spa,lang_swe,lang_tur,lang_vie
0,0.997611,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.994691,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.996814,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.984869,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.975577,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Fit a content-based model using the genres as item features
print("Training content-based recommender")
content_model = tensorrec.TensorRec(
    n_components=n_features,
    item_repr_graph=tensorrec.representation_graphs.FeaturePassThroughRepresentationGraph(),
#    loss_graph=tensorrec.loss_graphs.WMRBLossGraph()
)

Training content-based recommender


In [None]:
content_model.fit(interactions=sparse_train_ratings_4plus,
                  user_features=user_indicator_features,
                  item_features=books_features,)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [22]:
# Check the results of the content-based model
print("Content-based recommender:")
predicted_ranks = content_model.predict_rank(user_features=user_indicator_features,
                                             item_features=books_features)


Content-based recommender:


ModelNotFitException: predict_rank() has been called before model fitting. Call fit() or fit_partial() before calling predict_rank().

In [20]:
predicted_ranks.shape

(53424, 10000)

In [21]:
check_results(predicted_ranks)

Recall at 10: Train: 0.0212 Test: nan


  app.launch_new_instance()
  ret = ret.dtype.type(ret / rcount)


In [25]:
# Try concatenating the genres on to the indicator features for a hybrid recommender system
full_item_features = sparse.hstack([item_indicator_features, books_features])

In [27]:
print("Training hybrid recommender")
hybrid_model = tensorrec.TensorRec(
    n_components=5,
#    loss_graph=tensorrec.loss_graphs.WMRBLossGraph()
)
hybrid_model.fit(interactions=sparse_train_ratings_4plus,
                 user_features=user_indicator_features,
                 item_features=full_item_features,
n_sampled_items=int(n_items * .01))



Training hybrid recommender


In [28]:
print("Hybrid recommender:")
predicted_hybrid_ranks = hybrid_model.predict_rank(user_features=user_indicator_features,
                                            item_features=full_item_features)
check_results(predicted_hybrid_ranks)

Hybrid recommender:
Recall at 10: Train: 0.0083 Test: 0.0082
