# Recommender system based on LightFM
## The model with item_features

In [1]:
from scipy.sparse import coo_matrix as sp
import time
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import os
# import zipfile
# import csv
# import requests
import json
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split 
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import linear_model
from scipy.special import expit

In [2]:
# Load the training table, the test table and the sample-submission template.
# NOTE(review): the truncated output below this cell looks like the tail of
# pandas warnings raised while parsing - confirm, and pass explicit dtypes if so.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Show the first three training rows to check the column layout.
train.head(n=3)

Unnamed: 0,overall,verified,reviewTime,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image,userid,itemid,rating
0,5.0,True,"10 4, 2016",B01CPNIEQG,Heather,These are my FAVORITE spices in my collection....,Must Add to your Spice kitchen!,1475539200,,,,102179,37138,1.0
1,5.0,True,"03 1, 2016",B006F63M8U,Linda Odom,Add A package to my Coffee and it makes a good...,Milk Chocolate Swiis MIss Hot Cocoa Mix,1456790400,,{'Size:': ' 60-0.73 oz Envelopes'},,3625,17322,1.0
2,5.0,True,"06 26, 2016",B00112O8NG,DesertBlossom,"I love the Torani syrups, but not the prices o...","Love these sugar free syrups, but didn't love ...",1466899200,28.0,,,39495,5600,1.0


## Products

In [4]:
%%time
# Read the product metadata (one JSON document per line) into a DataFrame.
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform's default locale.
with open('meta_Grocery_and_Gourmet_Food.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

# Drop the metadata columns that are not used further down in this notebook.
prod = pd.DataFrame(data).drop(
    columns=['date', 'feature', 'details', 'similar_item', 'tech1', 'fit'])

CPU times: user 21 s, sys: 1.58 s, total: 22.6 s
Wall time: 22.6 s


In [5]:
# Keep only the products whose ASIN occurs in the train or the test table.
asin = np.concatenate((train['asin'].values, test['asin'].values))
prod = prod[prod['asin'].isin(asin)]

In [6]:
# Brands
# Fill missing brands by assignment instead of an in-place fillna on the
# `prod.brand` accessor. `prod` is itself the result of a boolean filter, so the
# in-place form is chained assignment: it raises SettingWithCopyWarning and is a
# silent no-op under pandas copy-on-write, which would leave NaN brands behind.
prod = prod.assign(brand=prod['brand'].fillna('unknown_brand'))

# unique() keeps first-appearance order, so the feature order (and therefore the
# feature indices assigned later) is the same on every run, unlike set() ordering.
brand_list = prod['brand'].unique().tolist()

# Joined feature list (brands are the only feature source, already de-duplicated)
feature_list = list(brand_list)

## Train / test

In [7]:
%%time
# Building the Model

# Register the user ids and item ids (ASINs) from train together with the
# feature vocabulary, then extend the id mappings with the ids from test.
dataset = Dataset()
dataset.fit(users=train['userid'].values,
            items=train['asin'].values,
            item_features=feature_list)
dataset.fit_partial(users=test['userid'].values,
                    items=test['asin'].values)

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 127496, num_items 41320.
CPU times: user 1.27 s, sys: 3.91 ms, total: 1.27 s
Wall time: 1.27 s


In [8]:
%%time
# One (userid, asin, rating) triple per training row; the third element is
# passed to LightFM as the interaction weight.
triples = list(map(tuple, train[['userid', 'asin', 'rating']].values))
interactions, weights = dataset.build_interactions(triples)

print(repr(interactions), repr(weights))

# Hold out 20% of the weighted interactions; the seeded RandomState keeps the
# split repeatable.
weights_train, weights_test = random_train_test_split(
    weights,
    test_percentage=0.2,
    random_state=np.random.RandomState(42),
)

<127496x41320 sparse matrix of type '<class 'numpy.int32'>'
	with 857895 stored elements in COOrdinate format> <127496x41320 sparse matrix of type '<class 'numpy.float32'>'
	with 857895 stored elements in COOrdinate format>
CPU times: user 2.42 s, sys: 39.9 ms, total: 2.46 s
Wall time: 2.46 s


In [9]:
def join_feat(row):
    """Return the row's ``brand`` value wrapped in a one-element list."""
    brand = row['brand']
    return [brand]

# Apply join_feat row by row to the brand column to get the per-product features.
# NOTE(review): the container returned by a list-returning apply(axis=1) has
# varied between pandas versions - confirm it is one feature list per row here.
feat_subset = prod[['brand']].apply(join_feat, axis=1).values
# Pair each product ASIN with its features, in the same row order as `prod`.
feat_tuple = tuple(zip(prod['asin'], feat_subset))
# Build the item-feature matrix from the (asin, features) pairs, with normalize=True.
item_features = dataset.build_item_features(feat_tuple, normalize=True)

In [10]:
%%time
# Training the Model

NUM_THREADS = 8       # number of parallel threads
NUM_COMPONENTS = 200  # dimensionality of the latent vectors
NUM_EPOCHS = 5        # number of training epochs
RANDOM_SEED = 42      # seed for the model's random state

# `loss` may be one of 'logistic', 'warp', 'bpr', 'warp-kos'.
# The model is seeded so that the metrics reported below can be reproduced.
model = LightFM(learning_rate=0.03, loss='warp',
                no_components=NUM_COMPONENTS,
                random_state=RANDOM_SEED)

# NOTE(review): with num_threads > 1 the parallel updates may still introduce
# small run-to-run differences - use num_threads=1 if exact repeatability matters.
model = model.fit(weights_train,
                  item_features=item_features,
                  epochs=NUM_EPOCHS,
                  num_threads=NUM_THREADS, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
CPU times: user 40.3 s, sys: 1.21 s, total: 41.5 s
Wall time: 10.7 s


## AUC score

In [11]:
%%time
# Mean AUC on the training interactions.
# train_interactions is not passed: the matrix being evaluated is the training
# matrix itself. The thread count comes from NUM_THREADS, like the other
# evaluation cells, instead of a separate hard-coded 8.
train_auc = auc_score(model, weights_train,
                      item_features=item_features,
                      num_threads=NUM_THREADS)
print('AUC: train %.4f' % (train_auc.mean()))

AUC: train 0.9293
CPU times: user 14min 52s, sys: 677 ms, total: 14min 53s
Wall time: 3min 46s


In [12]:
%%time
# Mean AUC on the held-out interactions.
# The thread count comes from NUM_THREADS, like the other evaluation cells,
# instead of a separate hard-coded 8.
# NOTE(review): train_interactions=weights_train stays disabled, as in the
# original cell; enabling it would make LightFM leave known training positives
# out of the ranking - confirm whether that is wanted here.
test_auc = auc_score(model, weights_test,
                     item_features=item_features,
                     num_threads=NUM_THREADS)
print('AUC: test %.4f' % (test_auc.mean()))

AUC: test 0.8235
CPU times: user 10min 12s, sys: 540 ms, total: 10min 12s
Wall time: 2min 37s


## Precision

In [13]:
%%time
# Precision@3 on the training interactions, averaged over the returned scores.
prec_score = precision_at_k(model, weights_train,
                            k=3,
                            item_features=item_features,
                            num_threads=NUM_THREADS)
print('prec_score: %.4f' % prec_score.mean())

prec_score: 0.1156
CPU times: user 14min 49s, sys: 572 ms, total: 14min 50s
Wall time: 3min 43s


In [14]:
%%time
# Precision@3 on the held-out interactions, averaged over the returned scores.
prec_score_test = precision_at_k(model, weights_test,
                                 k=3,
                                 item_features=item_features,
                                 num_threads=NUM_THREADS)
print('prec_score test: %.4f' % prec_score_test.mean())

prec_score test: 0.0305
CPU times: user 10min 20s, sys: 575 ms, total: 10min 21s
Wall time: 2min 49s


## Recall

In [15]:
%%time
# Recall@3 on the training interactions, averaged over the returned scores.
recall_score = recall_at_k(model, weights_train,
                           k=3,
                           item_features=item_features,
                           num_threads=NUM_THREADS)
print('recall_score: %.4f' % recall_score.mean())

recall_score: 0.0843
CPU times: user 15min 16s, sys: 787 ms, total: 15min 17s
Wall time: 4min 6s


In [16]:
%%time
# Recall@3 on the held-out interactions, averaged over the returned scores.
recall_score_test = recall_at_k(model, weights_test,
                                k=3,
                                item_features=item_features,
                                num_threads=NUM_THREADS)
print('recall_score test: %.4f' % recall_score_test.mean())

recall_score test: 0.0557
CPU times: user 10min 21s, sys: 707 ms, total: 10min 22s
Wall time: 2min 51s
