Python imports

In [1]:
import pandas as pd
from scipy import sparse
import numpy
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from lightfm.cross_validation import random_train_test_split
import operator

Loading order data

In [2]:
%time df = pd.read_csv('orders_last_year.csv', sep='\t', usecols = ['kundennummer','artikelnummer','werkid','autorid','autor','mediumid','medium'], dtype = {'werkid':"category", 'autorid':"category", 'kundennummer':"category", 'artikelnummer':"category"})


CPU times: user 1min 15s, sys: 2.08 s, total: 1min 17s
Wall time: 1min 17s


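Aggregate the orders read from the CSV file into one row per customer (`kundennummer`) and author (`autorid`); the `amount` column counts the order rows for that pair.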

In [3]:
%time grouped = df.groupby(['kundennummer','autorid']).size().reset_index(name = 'amount')

grouped.head()

CPU times: user 3.6 s, sys: 551 ms, total: 4.15 s
Wall time: 4.17 s


Unnamed: 0,kundennummer,autorid,amount
0,7700160454,10990216,1
1,7700160454,11387441,1
2,7700160454,12058762,2
3,7700160454,137510,4
4,7700160454,4354809,1


Create a new LightFM Dataset containing the customer–author interactions (each author id, `autorid`, is treated as the "product" being recommended)

In [4]:
uniqueCustomer = grouped['kundennummer'].unique()
uniqueProducts = grouped['autorid'].unique()

dataset = Dataset()
%time dataset.fit(uniqueCustomer,uniqueProducts)
%time (interactions, weights) = dataset.build_interactions((row.kundennummer, row.autorid) for index, row in grouped.iterrows())

print(repr(interactions))
    
print('Total number of unique customers         : %.0f' % len(uniqueCustomer))
print('Total number of unique products orders   : %.0f' % len(uniqueProducts))


CPU times: user 2.38 s, sys: 107 ms, total: 2.48 s
Wall time: 2.49 s
CPU times: user 14min 10s, sys: 259 ms, total: 14min 10s
Wall time: 14min 13s
<1978962x198112 sparse matrix of type '<class 'numpy.int32'>'
	with 7251609 stored elements in COOrdinate format>
Total number of unique customers         : 1978962
Total number of unique products orders   : 198112


Create the train and test split

In [5]:
# Hold out 20% of the interactions for evaluation. The explicit random state
# makes the split identical on every run of the notebook.
(train,test) = random_train_test_split(interactions, test_percentage=0.2, random_state=numpy.random.RandomState(42))

Train the LightFM model

In [6]:
print('Training LightFM model')
model = LightFM(loss='warp', no_components=32)
%time model.fit(train, epochs=60, num_threads=6)

Training LightFM model
CPU times: user 32min 31s, sys: 1.14 s, total: 32min 32s
Wall time: 6min 39s


<lightfm.lightfm.LightFM at 0x7f44fef6e6d8>

Sample predictions

In [8]:
def sample_recommendation(model, data, user_ids, top_n=10, lfm_dataset=None):
    """Print the highest-scoring items for each of the given users.

    For every user the function prints a header line followed by up to
    ``top_n`` lines of the form ``<item id>\\t<score>``, ordered by descending
    score. Items with equal scores are listed in ascending internal-index
    order.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(user_index, item_indices)`` and
        returning one score per item index.
    data : object
        Interaction matrix; only ``data.shape`` is read, to obtain the number
        of items to score.
    user_ids : iterable
        External user ids (the keys of the dataset's user mapping).
    top_n : int, optional
        Maximum number of items printed per user (default 10). Fewer lines are
        printed when fewer items exist.
    lfm_dataset : object, optional
        Object whose ``mapping()`` returns the user and item id mappings.
        Defaults to the notebook-level ``dataset``.

    Raises
    ------
    KeyError
        If a user id is not present in the dataset's user mapping.
    """
    # Fall back to the notebook-level dataset so existing calls keep working.
    source = dataset if lfm_dataset is None else lfm_dataset
    user2idx, _user_feature_map, item2idx, _item_feature_map = source.mapping()

    # Invert external id -> internal index once, outside the per-user loop.
    idx2item = {idx: item_id for item_id, idx in item2idx.items()}

    _n_users, n_items = data.shape
    item_indices = numpy.arange(n_items)

    # Never request more rows than there are items (and never a negative count).
    n_shown = max(0, min(top_n, n_items))

    for user_id in user_ids:

        print('Recommentations for user %s' % user_id)

        scores = model.predict(user2idx[user_id], item_indices)

        # Stable sort on the negated scores gives descending order while
        # keeping ties in ascending index order.
        ranked = numpy.argsort(-scores, kind='mergesort')[:n_shown]

        for idx in ranked:
            print('%s\t%2f' % (idx2item[int(idx)], scores[idx]))
    
# Print the top recommendations for a single example customer.
example_user_ids = ['7710910618']
sample_recommendation(model, interactions, example_user_ids)

Recommentations for user 7710910618
10768003	4.308501
3383307	4.085265
494473	4.067701
91365	3.933437
14800838	3.867929
11466788	3.825294
13145224	3.824771
8691600	3.804009
65323	3.729458
174710	3.664338


Evaluate the model

In [None]:
print('Evaluating model')

# AUC over all interactions (training and held-out pairs together).
aucscore = auc_score(model, interactions, num_threads=6).mean()
print('AUC Score:          %.2f.' % (aucscore))

# AUC on the training interactions only.
aucscore_train = auc_score(model, train, num_threads=6).mean()
print('AUC Score train:    %.2f.' % (aucscore_train))

# AUC on the held-out interactions. Passing train_interactions excludes the
# known training positives from the ranking, so they do not compete with the
# test positives.
aucscore_test = auc_score(model, test, train_interactions=train, num_threads=6).mean()
print('AUC Score test:     %.2f.' % (aucscore_test))

# Precision among the 5 highest-ranked items, averaged over users.
train_precision = precision_at_k(model, train, k=5, num_threads=6).mean()
# As above, training positives are excluded when scoring the test set.
test_precision = precision_at_k(model, test, train_interactions=train, k=5, num_threads=6).mean()
print('Precision@k5: train %.2f.' % (train_precision))
print('Precision@k5: test  %.2f.' % (test_precision))