Python imports

In [1]:
import pandas as pd
from scipy import sparse
import numpy
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from lightfm.cross_validation import random_train_test_split
from sklearn.externals import joblib
import operator

Loading product data

In [2]:
%time productdf = pd.read_csv('artikel_export.csv')
productdf = productdf[productdf.mandant==2]
productdf.head()

CPU times: user 32.5 s, sys: 2.3 s, total: 34.8 s
Wall time: 34.9 s


Unnamed: 0,artikel_id,mandant,titel,i_spannung_nervenkitzel,i_liebe_romantik,i_fantasy_sciencefiction,i_fremdsprachiges_anderelaender,i_reiseinvergangenezeiten,i_wissensdurst,i_schoenegeschichten,i_haus_garten,i_hobby_freizeit,i_humorvolles,i_spiel_spass,ag_bis12monate,ag_1bis3,ag_4bis7,ag_8bis11,ag_ab12
3,45359741,2,"Ernst, Herzog von Schwaben",f,f,f,f,f,f,t,f,f,f,f,f,f,f,f,f
9,38740121,2,Trek Fail!,f,f,f,t,f,f,f,f,f,f,f,f,f,f,f,f
10,105820598,2,Jean Hornsby's Tree-Riffic!,f,f,f,t,f,f,f,f,f,f,f,f,f,f,f,f
11,45851959,2,Runmagi,f,f,f,t,f,f,f,f,f,f,f,f,f,f,f,f
12,35872885,2,The Six Day War 1967,f,f,f,t,f,f,f,f,f,f,f,f,f,f,f,f


Read CSV File. It has a header with customerid,productid,amount

In [3]:
%time df = pd.read_csv('interactions.csv', usecols = ['customerid','productid'], dtype = {'customerid':str, 'productid':str})
df.head()

CPU times: user 5.47 s, sys: 132 ms, total: 5.6 s
Wall time: 5.63 s


Unnamed: 0,customerid,productid
0,10157849,140178989
1,33674328,90057363
2,36789149,28134702
3,35022983,45626443
4,36043966,44992879


Create a new LightFM Dataset containing the customer and product interactions

In [4]:
uniqueCustomer = df['customerid'].unique()
uniqueProducts = productdf['artikel_id'].unique()

dataset = Dataset()
%time dataset.fit(uniqueCustomer,uniqueProducts)
%time dataset.fit_partial(items = df['productid'].unique(), item_features = ['i_spannung_nervenkitzel','i_liebe_romantik','i_fantasy_sciencefiction','i_fremdsprachiges_anderelaender','i_reiseinvergangenezeiten','i_wissensdurst','i_schoenegeschichten','i_haus_garten','i_hobby_freizeit','i_humorvolles','i_spiel_spass','ag_bis12monate', 'ag_bis12monate', 'ag_1bis3','ag_4bis7','ag_8bis11','ag_ab12'])

def toitemfeatures(row):
    result = []
    if (row.i_spannung_nervenkitzel != 'f'):
        result.append('i_spannung_nervenkitzel')
    if (row.i_liebe_romantik != 'f'):
        result.append('i_liebe_romantik')
    if (row.i_fantasy_sciencefiction != 'f'):
        result.append('i_fantasy_sciencefiction')
    if (row.i_fremdsprachiges_anderelaender != 'f'):        
        result.append('i_fremdsprachiges_anderelaender')
    if (row.i_reiseinvergangenezeiten != 'f'):        
        result.append('i_reiseinvergangenezeiten')
    if (row.i_wissensdurst != 'f'):        
        result.append('i_wissensdurst')
    if (row.i_schoenegeschichten != 'f'):        
        result.append('i_schoenegeschichten')
    if (row.i_haus_garten != 'f'):        
        result.append('i_haus_garten')
    if (row.i_hobby_freizeit != 'f'):        
        result.append('i_hobby_freizeit')
    if (row.i_humorvolles != 'f'):        
        result.append('i_humorvolles')
    if (row.i_spiel_spass != 'f'):        
        result.append('i_spiel_spass')
    return result

%time item_features = dataset.build_item_features((row.artikel_id, toitemfeatures(row)) for index, row in productdf.iterrows())

%time (interactions, weights) = dataset.build_interactions((row.customerid, row.productid) for index, row in df.iterrows())

print(repr(interactions))
    
print('Total number of unique customers         : %.0f' % len(uniqueCustomer))
print('Total number of unique products overall  : %.0f' % len(uniqueProducts))
print('Total number of unique products orders   : %.0f' % len(df['productid'].unique()))


CPU times: user 4.96 s, sys: 138 ms, total: 5.1 s
Wall time: 5.12 s
CPU times: user 3.06 s, sys: 27.8 ms, total: 3.08 s
Wall time: 3.09 s
CPU times: user 18min 28s, sys: 538 ms, total: 18min 29s
Wall time: 18min 34s
CPU times: user 21min 25s, sys: 16.8 ms, total: 21min 25s
Wall time: 21min 30s
<996243x5136320 sparse matrix of type '<class 'numpy.int32'>'
	with 10868978 stored elements in COOrdinate format>
Total number of unique customers         : 996243
Total number of unique products overall  : 4291111
Total number of unique products orders   : 845209


Creating train and test split model

In [5]:
(train,test) = random_train_test_split(interactions, test_percentage=0.2)

Train the LightFM model

In [6]:
print('Training LightFM model')
model = LightFM(loss='warp', no_components=32)
%time model.fit(train, item_features=item_features, epochs=100, num_threads=6)

Training LightFM model
CPU times: user 2h 9min 51s, sys: 9.44 s, total: 2h 10min
Wall time: 20min 28s


<lightfm.lightfm.LightFM at 0x7fbf9f9f7160>

In [7]:
joblib.dump(model, 'lightfmmodel.sav')
print('Model saved');
joblib.dump(dataset, 'lightfmdataset.sav')
print('Dataset saved');

Model saved
Dataset saved


Sample predictions

In [11]:
def sample_recommendation(model, data, user_ids):

    user2idx, user_feature_map, item2idx, item_feature_map = dataset.mapping()
    
    item2idx_inv = dict(map(reversed, item2idx.items()))    

    n_users, n_items = data.shape

    for user_id in user_ids:
        
        print('Recommentations for user %s' % user_id)
        
        scores = model.predict(user2idx[user_id], numpy.arange(n_items), item_features=item_features)
        
        comp = []
        
        for i in range(n_items):
            itemid = item2idx_inv[i]
            score = scores[i]
            comp.append((itemid,score))
        
        comp.sort(key = operator.itemgetter(1), reverse = False)    
        
        for i in range(10):
            (itemid,score) = comp[i]
            print('%s\t%2f' % (itemid, score))
    
sample_recommendation(model, interactions, ['31460292'])

Recommentations for user 31460292
116765117	-4.353509
56060241	-4.243227
44370424	-4.213742
45217202	-4.211519
44370400	-4.210111
45612166	-4.209202
18794237	-4.207780
21363710	-4.207471
45360118	-4.207326
17835006	-4.206690


Evaluate the model

In [None]:
print('Evaluating model')

aucscore = auc_score(model, interactions, num_threads=6, item_features=item_features).mean()
print('AUC Score:          %.2f.' % (aucscore))

aucscore_train = auc_score(model, train, num_threads=6, item_features=item_features).mean()
print('AUC Score train:    %.2f.' % (aucscore_train))

train_precision = precision_at_k(model, train, k=5, num_threads=6, item_features=item_features).mean()
test_precision = precision_at_k(model, test, k=5, num_threads=6,item_features=item_features).mean()
print('Precision@k5: train %.2f.' % (train_precision))
print('Precision@k5: test  %.2f.' % (test_precision))