In [219]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import src.utils as utils
import ml_metrics as metrics

In [223]:
# %load_ext line_profiler


In [224]:
ratings_raw = pd.read_csv('data/books/BX-Book-Ratings.csv', sep = ';', error_bad_lines=False)
ratings_raw.columns = ['user_id', 'isbn', 'rating']

user_counts = ratings_raw['user_id'].value_counts()
valid_users = user_counts.loc[user_counts>4].index.values

book_counts = ratings_raw.loc[ratings_raw.user_id.isin(valid_users)]['isbn'].value_counts()
valid_books = book_counts.loc[book_counts>4].index.values

ratings = ratings_raw.loc[(ratings_raw.isbn.isin(valid_books)) & (ratings_raw.user_id.isin(valid_users))]

In [225]:
ratings['binary_rating'] = 0
ratings.loc[ratings['rating'] > 0, 'binary_rating'] = 1



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [267]:
# books_data = pd.read_csv('data/books/BX-Books.csv', sep=';', error_bad_lines=False)
# books_data.columns = ['isbn', 'title', 'author', 'publication_year', 'publisher', 'image_url_s', 'image_url_m', 'image_url_l']
# books_data = books_data.loc[books_data.isbn.isin(ratings.isbn.unique())]

# books_data.author = books_data.author.apply(lambda x: x.replace(',', '_').replace(' ', '').lower())
# books_data.publisher = books_data.publisher.apply(lambda x: x.replace(',', '_').replace(' ', '').lower())
# books_data.year = books_data.publication_year.astype(str)

item_side_data = pd.get_dummies(books_data.set_index('isbn')[['publisher', 'author', 'publication_year']])

In [227]:
users_data = pd.read_csv('data/books/BX-Users.csv', sep=';', error_bad_lines=False)
users_data.columns = ['user_id', 'location', 'age']
users_data = users_data.loc[users_data.user_id.isin(ratings.user_id.unique())]
users_data.location = users_data.location.apply(lambda x: x.replace(',', '_').replace(' ', '').lower())


location_count = users_data.location.value_counts()
null_locs = location_count.loc[location_count < 5].index.values
users_data.location = users_data.location.apply(lambda x: '' if x in null_locs else x)

user_side_data = pd.get_dummies(users_data).fillna(0)

In [228]:
def mapk(actual, predicted, k):
    apk_list = [metrics.apk(actual[i].tolist(), predicted[i], k) for i, r in enumerate(predicted)]
    return np.mean(apk_list)

# No Side Data

In [229]:
train, test = train_test_split(ratings)

In [230]:
books = train.isbn.unique()
idxs = np.arange(len(books))
idx_item_map = dict(zip(idxs, books))
item_idx_map = dict(zip(books, idxs))

users = train.user_id.unique()
idxs = np.arange(len(users))
idx_user_map = dict(zip(idxs, users))
user_idx_map = dict(zip(users, idxs))

rows = train.user_id.map(user_idx_map).values
cols = train.isbn.map(item_idx_map).values
data = train.binary_rating.values

rtngs = sp.csr_matrix(
    (data, (rows, cols)),
    shape=(len(np.unique(rows)), len(np.unique(cols)))
)


In [246]:
model = LightFM(no_components=30, loss='warp', learning_rate=0.01)
model.fit(rtngs, epochs=30)

<lightfm.lightfm.LightFM at 0x122924790>

In [247]:
np.argsort(-model.item_biases)

array([  105,  1244,   833, ..., 26094, 17949, 28800])

In [248]:
print idx_item_map[105], idx_item_map[1244], idx_item_map[901]

0971880107 0316666343 059035342X


In [249]:
print model.item_biases[105], model.item_biases[1244], model.item_biases[901]
print np.dot(model.user_embeddings[7546], model.item_embeddings[105])
print np.dot(model.user_embeddings[7546], model.item_embeddings[1244])
print np.dot(model.user_embeddings[7546], model.item_embeddings[901])

0.9957839 0.9927258 0.8682729
1.5202367
1.6802928
1.5175416


In [250]:
train.isbn.value_counts()[:3]

0971880107    1276
0316666343     724
0385504209     542
Name: isbn, dtype: int64

In [251]:
print model.item_biases[651], model.item_biases[321], model.item_biases[429]
print np.dot(model.user_embeddings[7546], model.item_embeddings[651])
print np.dot(model.user_embeddings[7546], model.item_embeddings[321])
print np.dot(model.user_embeddings[7546], model.item_embeddings[429])

0.43009782 0.38511506 0.53316766
0.92682326
0.81278217
1.0860568


In [252]:
rtngs

<21525x39697 sparse matrix of type '<type 'numpy.int64'>'
	with 456574 stored elements in Compressed Sparse Row format>

In [253]:
truths_all = []
recs_all = []
pops_all = []
pc = 0

pops = train.isbn.value_counts().index.values[:100]
for user_id in tqdm(test[['user_id']].drop_duplicates().sample(2000).user_id.values):
    truth = test.loc[test.user_id == user_id].isbn.values
    truths_all.append(truth)
    known = [] #train.loc[train.user_id == user_id].isbn.values
    
    if user_id not in user_idx_map.keys():
        pc += 1
        recs = [p for p in pops if p not in known][:10]
    
    else:
        user_idx = user_idx_map[user_id]
        rec_scores = model.predict(user_idx, np.array(idx_item_map.keys()))
        rec_idx = np.argsort(-rec_scores)

        recs = [idx_item_map[rec] for rec in rec_idx[:100]]
    
    recs = [rec for rec in recs if rec not in known][:10]
    recs_all.append(recs)
    
    pops_cur = [p for p in pops if p not in known][:10]
    pops_all.append(pops_cur)


100%|██████████| 2000/2000 [00:21<00:00, 92.86it/s]


In [254]:
[np.around(mapk(truths_all, recs_all, k), 3) for k in np.arange(1, 10)]

[0.016, 0.012, 0.011, 0.01, 0.01, 0.009, 0.009, 0.009, 0.009]

In [255]:
[np.around(mapk(truths_all, pops_all, k), 3) for k in np.arange(1, 10)]

[0.024, 0.015, 0.013, 0.011, 0.01, 0.01, 0.01, 0.009, 0.009]

# With user side data

In [270]:
user_side_data['user_idx'] = user_side_data['user_id'].map(user_idx_map)
user_side_data_full = pd.merge(pd.DataFrame(idx_user_map.keys(), columns=['user_idx']), user_side_data, how='left').fillna(0)
user_side_data_full = user_side_data_full.sort_values('user_idx')

# user_side_data_full['user_id'] = user_side_data_full['user_id'].astype(str)
# user_side_data_dmy = pd.get_dummies(user_side_data_full.drop(['user_idx', 'age'], axis=1))
# usd = sp.csr_matrix(user_side_data_dmy)

In [274]:
user_side_data.head()

Unnamed: 0,user_id,age,location_,location_aachen_nordrhein-westfalen_germany,location_abbotsford_britishcolumbia_canada,location_aberdeen_scotland_unitedkingdom,location_acton_massachusetts_usa,location_adelaide_southaustralia_australia,location_adrian_michigan_usa,location_akron_ohio_usa,...,location_woodinville_washington_usa,location_woodside_newyork_usa,location_woodstock_georgia_usa,location_worcester_massachusetts_usa,location_yarmouth_novascotia_canada,location_york_pennsylvania_usa,location_ypsilanti_michigan_usa,location_zaragoza_arag�n_spain,location_zaragoza_zaragoza_spain,user_idx
7,8,0.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1102.0
16,17,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5514.0
43,44,51.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,20107.0
52,53,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8123.0
68,69,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,15456.0


In [None]:
user

In [257]:
model = LightFM(no_components=30, loss='warp', learning_rate=0.01)
model.fit(rtngs, user_features = usd, epochs=30)

<lightfm.lightfm.LightFM at 0x12290e2d0>

In [258]:
truths_all = []
recs_all = []
pops_all = []

pops = train.isbn.value_counts().index.values[:100]
for user_id in tqdm(test[['user_id']].drop_duplicates().sample(500).user_id.values):
    truth = test.loc[test.user_id == user_id].isbn.values
    truths_all.append(truth)
    known = train.loc[train.user_id == user_id].isbn.values
    
    if user_id not in user_idx_map.keys():
        recs = [p for p in pops if p not in known][:10]
    
    else:
        user_idx = user_idx_map[user_id]
        rec_scores = model.predict(user_idx, np.array(idx_item_map.keys()), user_features = usd)
        rec_idx = np.argsort(-rec_scores)

        recs = [idx_item_map[rec] for rec in rec_idx[:100]]
    
    recs = [rec for rec in recs if rec not in known][:10]
    recs_all.append(recs)
    
    pops_cur = [p for p in pops if p not in known][:10]
    pops_all.append(pops_cur)


100%|██████████| 500/500 [00:06<00:00, 74.77it/s]


In [259]:
[np.around(mapk(truths_all, recs_all, k), 3) for k in np.arange(1, 10)]

[0.032, 0.021, 0.016, 0.013, 0.012, 0.012, 0.011, 0.011, 0.01]

In [260]:
[np.around(mapk(truths_all, pops_all, k), 3) for k in np.arange(1, 10)]

[0.034, 0.021, 0.015, 0.013, 0.012, 0.011, 0.01, 0.01, 0.01]

# With Item Side Data

In [None]:
item_side_data['item_idx'] = item_side_data['isbn'].map(item_idx_map)
item_side_data_full = pd.merge(pd.DataFrame(item_idx_map.keys(), columns=['item_idx']), item_side_data_full, how='left').fillna(0)
item_side_data_full.sort_values('item_idx')

user_side_data_full['user_id'] = user_side_data_full['user_id'].astype(str)
user_side_data_dmy = pd.get_dummies(user_side_data_full.drop(['user_idx', 'age'], axis=1))
usd = sp.csr_matrix(user_side_data_dmy)