Collaborative, content-based and hybrid filtering for GoodReads dataset.

Ideas from https://github.com/jfkirk/tensorrec/blob/master/examples/getting_started.py

In [23]:
import tensorrec

In [24]:
from collections import defaultdict
import csv
import numpy
import random
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import MultiLabelBinarizer
import tensorrec
import logging
logging.getLogger().setLevel(logging.INFO)
#TODO: check for the random seed
import tensorflow as tf 
tf.set_random_seed(0)

In [25]:
#parameters of the experiments 
N_BOOKS = 2000
EMBEDDING_DIMENTIONS = 50

# Colaborative filtering  

In [26]:
print('Loading ratings')
with open('data/ratings_reduced.csv', 'r') as ratings_file:
    ratings_file_reader = csv.reader(ratings_file, delimiter=' ')
    raw_ratings = list(ratings_file_reader)
# raw_ratings_header = raw_ratings.pop(0)

Loading ratings


In [27]:
raw_ratings

[['35040', '1', '5'],
 ['34489', '1', '4'],
 ['33191', '1', '5'],
 ['27976', '1', '5'],
 ['28689', '1', '4'],
 ['9517', '1', '4'],
 ['52256', '1', '5'],
 ['653', '1', '3'],
 ['37909', '1', '5'],
 ['21092', '1', '4'],
 ['42879', '1', '4'],
 ['17991', '1', '5'],
 ['14565', '1', '4'],
 ['14255', '1', '4'],
 ['20455', '1', '5'],
 ['3623', '1', '5'],
 ['12438', '1', '5'],
 ['19923', '1', '5'],
 ['4389', '1', '4'],
 ['27164', '1', '5'],
 ['25957', '1', '5'],
 ['27457', '1', '5'],
 ['46416', '1', '5'],
 ['43625', '1', '4'],
 ['44635', '1', '4'],
 ['3572', '1', '5'],
 ['41222', '1', '5'],
 ['51468', '1', '4'],
 ['12926', '1', '5'],
 ['5509', '1', '4'],
 ['3886', '1', '5'],
 ['4789', '1', '4'],
 ['23286', '1', '4'],
 ['13590', '1', '5'],
 ['6272', '1', '5'],
 ['42672', '1', '4'],
 ['33627', '1', '5'],
 ['47426', '1', '5'],
 ['2736', '1', '4'],
 ['19534', '1', '5'],
 ['488', '1', '4'],
 ['21594', '1', '5'],
 ['3022', '1', '4'],
 ['45419', '1', '5'],
 ['10536', '1', '5'],
 ['32232', '1', '5'],
 [

In [28]:
len(raw_ratings)

2242110

In [29]:
# Shuffle the ratings and split them in to train/test sets 80%/20%
def train_test_split(raw_ratings, ratio=0.8):
    random.Random(0).shuffle(raw_ratings)  # Shuffles the list in-place
    cutoff = int(.8 * len(raw_ratings))
    train_ratings = raw_ratings[:cutoff]
    test_ratings = raw_ratings[cutoff:]
    return train_ratings, test_ratings

train_ratings, test_ratings = train_test_split(raw_ratings)
print("{} train ratings, {} test ratings".format(len(train_ratings), len(test_ratings)))

1793688 train ratings, 448422 test ratings


In [30]:
# Iterate through the input to map goodreads IDs to new internal IDs
# The new internal IDs will be created by the defaultdict on insertion
books_to_internal_user_ids = defaultdict(lambda: len(books_to_internal_user_ids))
books_to_internal_item_ids = defaultdict(lambda: len(books_to_internal_item_ids))
for row in raw_ratings:
    row[0] = books_to_internal_user_ids[int(row[0])]
    row[1] = books_to_internal_item_ids[int(row[1])]
    row[2] = float(row[2])
n_users = len(books_to_internal_user_ids)
n_items = len(books_to_internal_item_ids)

In [31]:
# This method converts a list of (user, item, rating, time) to a sparse matrix
def interactions_list_to_sparse_matrix(interactions):
    users_column, items_column, ratings_column = zip(*interactions)
    return sparse.coo_matrix((ratings_column, (users_column, items_column)),
shape=(n_users, n_items))

In [32]:
# Create sparse matrices of interaction data
sparse_train_ratings = interactions_list_to_sparse_matrix(train_ratings)
sparse_test_ratings = interactions_list_to_sparse_matrix(test_ratings)

# Construct indicator features for users and items
user_indicator_features = sparse.identity(n_users)
item_indicator_features = sparse.identity(n_items)

In [33]:
# Build a matrix factorization collaborative filter model
cf_model = tensorrec.TensorRec(n_components=5)

# Fit the collaborative filter model
print("Training collaborative filter")

cf_model.fit(interactions=sparse_train_ratings,
             user_features=user_indicator_features,
item_features=item_indicator_features)

Training collaborative filter


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [34]:
predictions = cf_model.predict(user_features=user_indicator_features,
                            item_features=item_indicator_features )

In [35]:
predictions

array([[3.4599879, 3.4518158, 3.1528764, ..., 3.1779628, 3.2775123,
        3.280781 ],
       [4.5791855, 4.2004714, 4.039655 , ..., 3.7314146, 3.9133234,
        3.8777552],
       [2.8395836, 2.945003 , 2.9636202, ..., 2.965521 , 3.064954 ,
        3.1529503],
       ...,
       [2.9269216, 2.9726179, 3.0228062, ..., 2.9429085, 3.0586312,
        3.204864 ],
       [2.926758 , 2.9701684, 3.0223927, ..., 2.9416153, 3.0567262,
        3.2042649],
       [2.92774  , 2.969245 , 3.029328 , ..., 2.9419734, 3.0570192,
        3.2037506]], dtype=float32)

In [36]:
# Create sets of train/test interactions that are only ratings >= 4.0
sparse_train_ratings_4plus = sparse_train_ratings.multiply(sparse_train_ratings >= 4.0)
sparse_test_ratings_4plus = sparse_test_ratings.multiply(sparse_test_ratings >= 4.0)  
    

In [37]:
# This method consumes item ranks for each user and prints out recall@10 train/test metrics
def check_results(ranks):
    train_recall_at_10 = tensorrec.eval.recall_at_k(
        test_interactions=sparse_train_ratings_4plus,
        predicted_ranks=ranks,
        k=10
    ).mean()
    test_recall_at_10 = tensorrec.eval.recall_at_k(
        test_interactions=sparse_test_ratings_4plus,
        predicted_ranks=ranks,
        k=10
    ).mean()
    train_precision_at_10 = tensorrec.eval.precision_at_k(
        test_interactions=sparse_train_ratings_4plus,
        predicted_ranks=ranks,
        k=10
    ).mean()
    test_precision_at_10 = tensorrec.eval.precision_at_k(
        test_interactions=sparse_test_ratings_4plus,
        predicted_ranks=ranks,
        k=10
    ).mean()
    train_ndcg_at_10 = tensorrec.eval.ndcg_at_k(
        test_interactions=sparse_train_ratings_4plus,
        predicted_ranks=ranks,
        k=10
    ).mean()
    test_ndcg_at_10 = tensorrec.eval.ndcg_at_k(
        test_interactions=sparse_test_ratings_4plus,
        predicted_ranks=ranks,
        k=10
    ).mean()
    print("Recall at 10: Train: {:.4f} Test: {:.4f}".format(train_recall_at_10,
                                                            test_recall_at_10))
    print("Precision at 10: Train: {:.4f} Test: {:.4f}".format(train_precision_at_10,
                                                            test_precision_at_10))
    print("NDCG at 10: Train: {:.4f} Test: {:.4f}".format(train_ndcg_at_10,
                                                            test_ndcg_at_10))

In [38]:
print("Matrix factorization collaborative filter:")
predicted_ranks = cf_model.predict_rank(user_features=user_indicator_features,
                                        item_features=item_indicator_features)
check_results(predicted_ranks)


Matrix factorization collaborative filter:


  ndcg = dcg/idcg


Recall at 10: Train: 0.0678 Test: 0.0585
Precision at 10: Train: 0.1239 Test: 0.0310
NDCG at 10: Train: 0.1306 Test: 0.0494


  ndcg = dcg/idcg


In [39]:
# Let's try a new loss function: WMRB


###
print("Training collaborative filter with WMRB loss")
ranking_cf_model = tensorrec.TensorRec(n_components=5,
                                       loss_graph=tensorrec.loss_graphs.WMRBLossGraph())
ranking_cf_model.fit(interactions=sparse_train_ratings_4plus,
                     user_features=user_indicator_features,
                     item_features=item_indicator_features,
                     n_sampled_items=int(n_items * .01))

# Check the results of the WMRB MF CF model
print("WMRB matrix factorization collaborative filter:")
predicted_ranks = ranking_cf_model.predict_rank(user_features=user_indicator_features,
                                                item_features=item_indicator_features)
check_results(predicted_ranks)

Training collaborative filter with WMRB loss


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


WMRB matrix factorization collaborative filter:


  ndcg = dcg/idcg


Recall at 10: Train: 0.1580 Test: 0.1306
Precision at 10: Train: 0.3055 Test: 0.0710
NDCG at 10: Train: 0.2877 Test: 0.1067


  ndcg = dcg/idcg


In [40]:
#TODO - намалих си данните и така ми се увеличи резултата??? нещо не е наред, да разгледам 
#малко по-подробно функциите и евентуално да направя cross-evaluation  

# Content-based filtering with book embeddings

In [41]:
def create_book_embeddings(infile, outfile, n_books=N_BOOKS, emd_dim=EMBEDDING_DIMENTIONS):
    """
    Create a file with the embeddings that are only of the book nodes. 
    """
    nodes_emb = pd.read_csv(infile, sep=' ', header=None, index_col=0, skiprows=1)
    print(len(nodes_emb))
    print(nodes_emb.head())
    nodes_emb_sorted = nodes_emb.sort_index()
    nodes_emb_books=nodes_emb_sorted.head(n_books)
    print(len(nodes_emb_books))
    print(nodes_emb_books.head())
    nodes_emb_books.to_csv(outfile, header=False, index=False)

In [42]:
INFILE = "emb/book_tags_100_directed_raw.emd"
OUTFILE = "emb/book_tags_100_directed.emd"
create_book_embeddings(INFILE, OUTFILE)
#OUTFILE = "emb/tags_and_genres_old.emd"

#'tag_count_reduced.csv'

13380
             1         2         3         4         5         6         7    \
0                                                                              
130574  0.343741  1.144867  0.520509  0.184540  0.697892  0.377195  0.899975   
111743  0.144308  0.509947  0.236566  0.082273  0.303050  0.161530  0.403527   
108717  0.138341  0.482995  0.226414  0.080604  0.291387  0.153742  0.382147   
111557  0.119607  0.413404  0.194941  0.071990  0.244740  0.135814  0.327056   
111305  0.119086  0.406895  0.185425  0.069152  0.246882  0.133862  0.321073   

             8         9         10   ...       119       120       121  \
0                                     ...                                 
130574  0.387989  0.777880 -0.814670  ...  0.141990  1.411326 -1.217900   
111743  0.170486  0.344542 -0.360932  ...  0.065931  0.627696 -0.539220   
108717  0.157744  0.323199 -0.341189  ...  0.056345  0.594580 -0.512271   
111557  0.140245  0.278803 -0.293099  ...  0.052401  0.508

In [43]:
# To improve the recommendations, lets read in the movie genres
print('Loading book embeddings... ')
with open(OUTFILE, 'r') as emb_file:
    emb_file_reader = csv.reader(emb_file)
    raw_tags_embeddings = list(emb_file_reader)
# raw_movie_metadata_header = raw_movie_metadata.pop(0)

Loading book embeddings... 


In [44]:
from scipy import sparse
import pandas as pd
print('Loading book embeddings...')
b_feat = pd.read_csv(OUTFILE, header=None)
books_features = sparse.coo_matrix(b_feat)
n_features = b_feat.shape[1]

Loading book embeddings...


In [45]:
b_feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.005412,-0.009758,-0.001974,-0.001245,-0.005370,-0.007581,-0.010505,-0.004591,-0.008552,0.010850,...,-0.000926,-0.013913,0.011258,0.001815,0.005929,0.002649,0.001986,0.001555,-0.000508,0.002407
1,0.004807,0.006320,0.004236,-0.001680,0.000761,-0.000484,0.000115,-0.001093,-0.000153,-0.002557,...,-0.002360,0.006336,-0.006319,0.000603,-0.004650,-0.002330,-0.004314,-0.002294,0.005619,-0.003281
2,-0.005753,-0.016819,-0.007610,-0.003173,-0.013899,-0.008617,-0.016474,-0.002846,-0.011146,0.009794,...,-0.004271,-0.023070,0.016920,-0.001107,0.015458,0.001389,0.005143,0.005356,-0.006792,-0.000260
3,0.000944,-0.000083,-0.001413,-0.002312,-0.004882,-0.001933,-0.004694,-0.000444,-0.002381,0.002736,...,-0.001447,0.000013,0.001518,-0.000564,0.001440,0.001668,-0.000605,0.001455,0.000007,0.002271
4,0.003615,0.003535,0.003176,-0.003561,0.000043,0.003531,-0.000639,0.003457,0.002574,-0.004269,...,0.002350,0.007062,-0.004668,-0.001814,-0.001687,0.002416,0.000795,-0.004800,0.000493,0.000343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.002474,-0.010803,-0.004824,-0.003697,-0.003351,-0.005582,-0.005707,-0.005117,-0.003806,0.008666,...,0.001354,-0.013355,0.008643,-0.003358,0.006447,0.002162,-0.000213,0.003503,-0.000205,0.002113
1996,-0.004030,-0.004153,0.000781,0.001401,-0.003253,-0.000085,-0.004588,0.003523,0.002980,0.000581,...,-0.002651,-0.003164,-0.000266,0.000308,-0.001204,0.002242,0.002648,0.000049,0.002612,0.000050
1997,-0.004043,-0.002431,-0.001540,0.002563,-0.002461,-0.001567,-0.004501,-0.000371,0.000387,0.001208,...,-0.003474,-0.003056,0.003739,-0.002558,-0.002405,-0.000657,-0.002760,-0.003646,0.000878,-0.000970
1998,-0.004515,-0.004896,0.001805,-0.000948,0.000606,0.000751,-0.000890,-0.003430,-0.006524,-0.000418,...,0.000371,-0.003519,0.002104,0.002492,0.005905,0.000733,0.002440,0.003454,-0.000438,0.001823


In [46]:
books_features.shape

(2000, 128)

In [47]:
n_features

128

In [48]:
b_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.005412,-0.009758,-0.001974,-0.001245,-0.00537,-0.007581,-0.010505,-0.004591,-0.008552,0.01085,...,-0.000926,-0.013913,0.011258,0.001815,0.005929,0.002649,0.001986,0.001555,-0.000508,0.002407
1,0.004807,0.00632,0.004236,-0.00168,0.000761,-0.000484,0.000115,-0.001093,-0.000153,-0.002557,...,-0.00236,0.006336,-0.006319,0.000603,-0.00465,-0.00233,-0.004314,-0.002294,0.005619,-0.003281
2,-0.005753,-0.016819,-0.00761,-0.003173,-0.013899,-0.008617,-0.016474,-0.002846,-0.011146,0.009794,...,-0.004271,-0.02307,0.01692,-0.001107,0.015458,0.001389,0.005143,0.005356,-0.006792,-0.00026
3,0.000944,-8.3e-05,-0.001413,-0.002312,-0.004882,-0.001933,-0.004694,-0.000444,-0.002381,0.002736,...,-0.001447,1.3e-05,0.001518,-0.000564,0.00144,0.001668,-0.000605,0.001455,7e-06,0.002271
4,0.003615,0.003535,0.003176,-0.003561,4.3e-05,0.003531,-0.000639,0.003457,0.002574,-0.004269,...,0.00235,0.007062,-0.004668,-0.001814,-0.001687,0.002416,0.000795,-0.0048,0.000493,0.000343


In [49]:
# Fit a content-based model using the genres as item features
print("Training content-based recommender")
content_model = tensorrec.TensorRec(
    n_components=n_features,
    item_repr_graph=tensorrec.representation_graphs.FeaturePassThroughRepresentationGraph(),
     loss_graph=tensorrec.loss_graphs.WMRBLossGraph(),
#     prediction_graph=CosineSimilarityPredictionGraph,
)

Training content-based recommender


In [50]:
content_model.fit(interactions=sparse_train_ratings_4plus,
                  user_features=user_indicator_features,
                  item_features=books_features,
                 n_sampled_items=int(n_items * .01))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [51]:
# Check the results of the content-based model
print("Content-based recommender:")
predicted_ranks = content_model.predict_rank(user_features=user_indicator_features,
                                             item_features=books_features)


Content-based recommender:


In [52]:
predicted_ranks.shape

(52943, 2000)

In [53]:
print(OUTFILE)
check_results(predicted_ranks)

emb/book_tags_100_directed.emd


  ndcg = dcg/idcg


Recall at 10: Train: 0.0066 Test: 0.0066
Precision at 10: Train: 0.0164 Test: 0.0044
NDCG at 10: Train: 0.0099 Test: 0.0039


  ndcg = dcg/idcg


In [22]:
emb/book_tags_class_stacked_100_dim.emb - random seed 0
Recall at 10: Train: 0.0677 Test: 0.0389
Precision at 10: Train: 0.1390 Test: 0.0211
NDCG at 10: Train: 0.1126 Test: 0.0277

SyntaxError: invalid syntax (<ipython-input-22-9158b6cd5835>, line 1)

In [31]:
emb/book_tags_100_dim.emb
Recall at 10: Train: 0.0565 Test: 0.0236
Precision at 10: Train: 0.1204 Test: 0.0131
NDCG at 10: Train: 0.0961 Test: 0.0161


SyntaxError: invalid syntax (<ipython-input-31-df568698a3b4>, line 2)

In [65]:
emb/book_tags_100_dim.emb
Recall at 10: Train: 0.0748 Test: 0.0329
Precision at 10: Train: 0.1488 Test: 0.0171
NDCG at 10: Train: 0.1213 Test: 0.0207


SyntaxError: invalid syntax (<ipython-input-65-253a8eacffe3>, line 2)

In [49]:
emb/book_tag_class_new.emb
Recall at 10: Train: 0.1018 Test: 0.0486
Precision at 10: Train: 0.2041 Test: 0.0256
NDCG at 10: Train: 0.1960 Test: 0.0363

SyntaxError: invalid syntax (<ipython-input-49-660a49373f6d>, line 2)

In [65]:
emb/tags_and_genres_d_w_128.emd
Recall at 10: Train: 0.0506 Test: 0.0509
Precision at 10: Train: 0.1077 Test: 0.0285
NDCG at 10: Train: 0.1090 Test: 0.0466

SyntaxError: invalid syntax (<ipython-input-65-633b828e7263>, line 2)

emb/tags_and_genres_old.emd
Recall at 10: Train: 0.0102 Test: 0.0104
Precision at 10: Train: 0.0230 Test: 0.0061
NDCG at 10: Train: 0.0248 Test: 0.0102


# Content-based filtering with book metadata

TODO: Content-based filtering also should be one function 

In [12]:
# To improve the recommendations, lets read in the movie genres
#TODO: Do I need this? 
print('Loading books metadata')
with open('book_feat_reduced_new.csv', 'r') as metadata_file:
    metadata_file_reader = csv.reader(metadata_file)
    raw_metadata = list(metadata_file_reader)


Loading books metadata


In [13]:
from scipy import sparse
import pandas as pd
b_feat = pd.read_csv('book_feat_reduced_new.csv', header=None)
books_features = sparse.coo_matrix(b_feat)
n_features = b_feat.shape[1]

In [14]:
# Fit a content-based model using the genres as item features
print("Training content-based recommender")
content_model = tensorrec.TensorRec(
    n_components=n_features,
    item_repr_graph=tensorrec.representation_graphs.FeaturePassThroughRepresentationGraph(),
     loss_graph=tensorrec.loss_graphs.WMRBLossGraph()
)

Training content-based recommender


In [15]:
content_model.fit(interactions=sparse_train_ratings_4plus,
                  user_features=user_indicator_features,
                  item_features=books_features,
                 n_sampled_items=int(n_items * .01))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [16]:
# Check the results of the content-based model
print("Content-based recommender:")
predicted_ranks = content_model.predict_rank(user_features=user_indicator_features,
                                             item_features=books_features)

Content-based recommender:


In [17]:
check_results(predicted_ranks)

Recall at 10: Train: 0.0314 Test: 0.0308
