In [1]:
import pandas as pd

In [2]:
# Load the raw tables. Items and transactions are pipe-delimited;
# evaluation.csv is read with the default separator.
item_dt = pd.read_csv("items.csv", delimiter='|')
trans_dt = pd.read_csv("transactions.csv", delimiter='|')
eval_dt = pd.read_csv("evaluation.csv")

In [3]:
# IDs are labels, not quantities: cast them to strings so they are
# compared and looked up as text in every table.
item_dt = item_dt.astype({'itemID': str})
trans_dt = trans_dt.astype({'itemID': str, 'sessionID': str})
eval_dt = eval_dt.astype({'itemID': str})

In [4]:
# Number of item rows recorded for each session, broadcast onto every row of that session.
trans_dt['session_count'] = trans_dt.groupby('sessionID')['itemID'].transform('count')

# Keep only sessions with at least 2 item rows (session_count >= 2): a session
# with a single row carries no co-occurrence information.
# .copy() makes sig_trans_dt an independent frame, so later in-place column
# changes never touch trans_dt and pandas has no reason to flag it as a derived
# slice (SettingWithCopyWarning).
sig_trans_dt = trans_dt[trans_dt['session_count'] >= 2].copy()

In [5]:
# Shape before vs. after the filter: only about one third of the transaction rows remain.
print(*(frame.shape for frame in (trans_dt, sig_trans_dt)))

(365143, 6) (129501, 6)


In [6]:
# session_count was only needed to build the filter, so remove it from both frames.
# drop(..., errors='ignore') keeps this cell re-runnable: a second run is a no-op,
# whereas `del frame['session_count']` raises KeyError once the column is gone.
trans_dt = trans_dt.drop(columns=['session_count'], errors='ignore')
sig_trans_dt = sig_trans_dt.drop(columns=['session_count'], errors='ignore')

In [7]:
# Preview the first five rows of the filtered transactions.
sig_trans_dt.head(n=5)

Unnamed: 0,sessionID,itemID,click,basket,order
7,7,14576,1,1,0
8,7,17731,2,1,0
13,12,30277,1,0,0
14,12,29508,1,1,0
15,12,75659,1,0,0


In [8]:
# How many evaluation items also occur in the filtered transactions? Only 304.
eval_in_sig_trans = eval_dt['itemID'].isin(sig_trans_dt['itemID'])
eval_dt[eval_in_sig_trans].shape

(304, 1)

In [9]:
# The evaluation items that occur in the filtered transactions.
eval_dt.loc[eval_dt['itemID'].isin(sig_trans_dt['itemID'])]

Unnamed: 0,itemID
3,41371
8,56794
10,62060
16,24603
24,77956
...,...
983,56782
985,50648
990,58358
994,23570


In [24]:
# Walk through a single example item first: 56794 (Damaris (Band 2): Der Ring des Fürsten).
item_dt.loc[item_dt['itemID'] == "56794"]

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
5808,56794,Damaris (Band 2): Der Ring des Fürsten,C. M. Spoerri,Sternensand Verlag,FMB,[FN]


In [25]:
# sessionID of every filtered-transaction row that refers to item 56794.
item_included_sessions = sig_trans_dt.loc[sig_trans_dt['itemID'] == "56794", 'sessionID']

In [26]:
# Sessions that include item 56794 (11 of them in the output below).
item_included_sessions

14375      10851
34914      26480
41781      31578
51503      39038
57451      43652
80253      60993
177128    135370
224217    171438
236802    181175
291718    223153
351336    268879
Name: sessionID, dtype: object

In [27]:
# All rows belonging to any session that contains the example item.
in_item_sessions = sig_trans_dt['sessionID'].isin(item_included_sessions)
included_dt = sig_trans_dt[in_item_sessions]

In [28]:
# Every row of the sessions selected above: the example item itself plus
# all other items recorded in the same sessions.
included_dt

Unnamed: 0,sessionID,itemID,click,basket,order
14370,10851,4000,1,1,0
14371,10851,12699,1,0,0
14372,10851,16463,1,0,0
14373,10851,21753,1,0,0
14374,10851,26243,1,1,0
...,...,...,...,...,...
351337,268879,37175,1,0,0
351338,268879,2121,3,0,0
351339,268879,20756,1,0,0
351340,268879,10305,1,0,0


In [38]:
# Score every item that shares a session with item 56794.
# The query item is filtered out and the rows are grouped ONCE; only the three
# signal columns are summed. (Previously the filter + groupby ran four times and
# .sum() aggregated every column, including the string sessionID, just to keep one.)
by_item = included_dt[included_dt['itemID'] != "56794"].groupby('itemID')
item_count = by_item['sessionID'].count()                   # rows in which the item co-occurs
signal_sums = by_item[['click', 'basket', 'order']].sum()   # per-item totals of each signal
item_click = signal_sums['click']
item_basket = signal_sums['basket']
item_order = signal_sums['order']

# Weighted sum: co-occurrence x1, clicks x2, baskets x3, orders x4.
score = item_count + 2*item_click + 3*item_basket + 4*item_order
score = score.sort_values(ascending=False)  # best candidates first

In [39]:
# Weighted co-occurrence score per itemID, highest first.
score

itemID
44420    43
2121     42
10305    17
5160      7
47195     6
         ..
27846     3
27704     3
25985     3
25620     3
890       3
Length: 78, dtype: int64

In [40]:
# The five best-scoring itemIDs become the recommendations.
recommended_items = score.index[:5].tolist()
recommended_items

['44420', '2121', '10305', '5160', '47195']

In [41]:
# Look up the recommended titles. All five share the example item's main topic (FMB),
# and three of them also share its author and publisher.
item_dt.loc[item_dt['itemID'].isin(recommended_items)]

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
1561,10305,Die Greifen-Saga - Die komplette Trilogie im S...,C. M. Spoerri,Sternensand Verlag,FMB,[FN]
5074,44420,Damaris: Das Vermächtnis der Wüstenzwerge,C. M. Spoerri,Sternensand Verlag,FMB,[FN]
5270,2121,Damaris (Band 1): Der Greifenorden von Chakas,C. M. Spoerri,Sternensand Verlag,FMB,[]
10650,47195,Die Brücke der Gezeiten 5,David Hair,Blanvalet Taschenbuchverl,FMB,[FYT]
12335,5160,Die Erellgorh-Trilogie / Erellgorh - Geheime Wege,Matthias Teut,Dichtfest GbR,FMB,"[5AX,2ACG,6FG,FB]"


## Now apply the same approach to every item included in the evaluation data

In [42]:
# sessionID of every filtered-transaction row whose item appears in the evaluation set.
eval_included_sessions = sig_trans_dt.loc[sig_trans_dt['itemID'].isin(eval_dt['itemID']), 'sessionID']

In [43]:
# All rows of the sessions that contain at least one evaluation item.
in_eval_sessions = sig_trans_dt['sessionID'].isin(eval_included_sessions)
eval_included_trans_dt = sig_trans_dt[in_eval_sessions]

In [44]:
# The resulting DataFrame has 15551 rows and 5 columns.
eval_included_trans_dt

Unnamed: 0,sessionID,itemID,click,basket,order
190,148,48175,1,0,0
191,148,47684,1,0,0
251,205,15528,0,1,0
252,205,74398,1,0,0
253,205,2417,2,0,0
...,...,...,...,...,...
365118,279343,15609,1,0,0
365119,279343,33928,9,0,0
365120,279343,68184,1,0,0
365121,279343,67832,2,0,0


In [45]:
# Evaluation items that also occur in the filtered transactions (sig_trans_dt).
items = eval_dt.loc[eval_dt['itemID'].isin(sig_trans_dt['itemID']), 'itemID']

In [46]:
# Recommendations are collected in a dictionary, filled in below.
recommendations = dict()

In [47]:
# Weighted co-occurrence recommender, run once per evaluation item.
for i in items:
    # Sessions in which item i occurs.
    item_included_sessions = eval_included_trans_dt.loc[eval_included_trans_dt['itemID'] == i, 'sessionID']
    # Every row of those sessions.
    included_dt = sig_trans_dt[sig_trans_dt['sessionID'].isin(item_included_sessions)]

    # Drop the query item and group ONCE; sum only the three signal columns.
    # (Previously the filter + groupby ran four times per item and .sum()
    # aggregated every column, including the string sessionID.)
    by_item = included_dt[included_dt['itemID'] != i].groupby('itemID')
    # Instead of item_count, the sum of other columns (orders, clicks, ...) could be used as well.
    item_count = by_item['sessionID'].count()
    signal_sums = by_item[['click', 'basket', 'order']].sum()
    item_click = signal_sums['click']
    item_basket = signal_sums['basket']
    item_order = signal_sums['order']

    # Weighted sum: co-occurrence x1, clicks x2, baskets x3, orders x4.
    score = item_count + 2*item_click + 3*item_basket + 4*item_order
    score = score.sort_values(ascending=False)
    recommended_items = score.head(5).index.to_list()  # top 5 (fewer if there are fewer candidates)
    recommendations[i] = recommended_items

In [48]:
# Mapping: evaluation itemID -> list of up to five recommended itemIDs, best first.
recommendations

{'41371': ['29539', '54689'],
 '56794': ['44420', '2121', '10305', '5160', '47195'],
 '62060': ['1136', '37310', '23422', '34366', '40031'],
 '24603': ['44027', '76704', '44697', '32145', '14442'],
 '77956': ['33848', '70706', '32948', '45682', '54859'],
 '67776': ['68187', '5558', '55410', '67039', '57137'],
 '53929': ['69630', '1567', '75204', '27817', '59951'],
 '2319': ['63634', '16123', '22061', '24844', '27457'],
 '62494': ['27493', '28497', '76022', '78771', '50539'],
 '21497': ['77708', '19761', '48669', '23495', '36098'],
 '53008': ['67796', '59779', '52217', '63809', '15175'],
 '39308': ['20641', '40692', '42058', '77481', '46617'],
 '4690': ['13869', '59552', '15840', '73855', '23972'],
 '65353': ['47638', '30775', '73377', '66120', '65391'],
 '61593': ['25206', '6630', '2417', '61749', '13059'],
 '74398': ['75305', '15606', '19379', '72081', '56174'],
 '38760': ['32266', '5291', '29711', '45237'],
 '17665': ['50330', '9282', '65959', '37594', '38034'],
 '56282': ['9018', '2

In [49]:
# Start from an empty frame; index and columns are added in the following cells.
recommendation_dt = pd.DataFrame(data=None)

In [50]:
# One row per item that received recommendations, indexed by its itemID.
recommendation_dt = (
    recommendation_dt
    .assign(itemID=list(recommendations))
    .set_index('itemID')
)

In [51]:
# Pre-create the five recommendation slots rec_1 .. rec_5, initially empty (None).
for rank in range(1, 6):
    recommendation_dt["rec_%d" % rank] = None

In [52]:
# Write each item's recommendations into rec_1, rec_2, ... in ranked order;
# slots beyond the number of available recommendations keep their initial None.
for item_id in recommendation_dt.index:
    for rank, rec_id in enumerate(recommendations[item_id], start=1):
        recommendation_dt.loc[item_id, "rec_%d" % rank] = rec_id

In [53]:
# One row per evaluation item; rec_1..rec_5 hold its ranked recommendations (None where fewer than five exist).
recommendation_dt

Unnamed: 0_level_0,rec_1,rec_2,rec_3,rec_4,rec_5
itemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
41371,29539,54689,,,
56794,44420,2121,10305,5160,47195
62060,1136,37310,23422,34366,40031
24603,44027,76704,44697,32145,14442
77956,33848,70706,32948,45682,54859
...,...,...,...,...,...
56782,44335,67567,62845,37357,49638
50648,63736,,,,
58358,6095,26078,40044,57469,69185
23570,15202,13576,46308,25885,68711


In [54]:
# Persist the recommendation table (itemID index included) to disk.
recommendation_dt.to_csv(path_or_buf="trans_recommend.csv")

In [55]:
# Spot check: item 58358 together with its five recommendations.
spot_check_ids = ["58358", "6095", "26078", "40044", "57469", "69185"]
item_dt.loc[item_dt['itemID'].isin(spot_check_ids)]

Unnamed: 0,itemID,title,author,publisher,main topic,subtopics
112,69185,Die Kunst des Zeichnens für Kinder,Gecko Keck,Frech Verlag GmbH,WFA,"[5,5A,5AK,5J,5JA,5JB,W,WF,WFA]"
253,58358,Mein Zauberwald,Johanna Basford,Knesebeck Von Dem GmbH,YBL,"[5AJ,WFA,WFX,WZG,YN]"
1184,40044,Tiere zeichnen lernen,,Edition XXL GmbH,AFF,"[5AG,YBL]"
2381,6095,Mein verzauberter Garten,Johanna Basford,Knesebeck Von Dem GmbH,YNPG,"[5AJ,WFX,WZG,YBL]"
2770,26078,Verwunschene Malwelten,,Loewe Verlag GmbH,YBGC,"[WFX,WZG,YBGC,YBL,YNA,YNV]"
9643,57469,Zeichnen Schritt-für-Schritt,,Edition XXL,AFF,"[AGK,AGN,YNNB]"
