# Recomendação baseada em sessão em e-commerce

### Fonte da base completa: https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store?resource=download&select=2019-Oct.csv

In [None]:
!python3 -m pip install wget
import wget
!python3 -m wget https://github.com/mmanzato/6EABDARecSys/raw/main/dataset/ECommerce.csv

In [1]:
import pandas as pd
import numpy as np

In [8]:
subset = pd.read_csv('./ECommerce.csv')
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c


In [9]:
map_items = {item: idx for idx, item in enumerate(subset.product_id.unique())}
map_sessions = {item: idx for idx, item in enumerate(subset.user_session.unique())}
subset['itemId'] = subset['product_id'].map(map_items)
subset['sessionId'] = subset['user_session'].map(map_sessions)
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,0,0
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c,2,0
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,3,0


In [10]:
n_items = subset['itemId'].max()+1
print('No. items: ', n_items)
n_sessions = subset['sessionId'].max()+1
print('No. sessions: ', n_sessions)

No. items:  35380
No. sessions:  241671


In [14]:
# create a dataset
# remove sessions with less than 2 items
def create_data(df):
    df.sort_values(by=['sessionId', 'event_time'], inplace=True, ignore_index=True)
    sessions, session = [], []
    for index, value in df.iterrows():
        if index != 0:
            if value["sessionId"] == df.at[index-1, "sessionId"]:
                if value["event_type"] == 'view':
                    session.append(value["itemId"])
            else:
                if len(session) > 1:
                    sessions.append((df.at[index-1, "sessionId"], session))
                session = [value["itemId"]]
        else:
            session.append(value["itemId"])
    return sessions

In [15]:
sessions = create_data(subset)

In [16]:
print('No. sessions: ', len(sessions))
print('Session 1:', sessions[1])
subset.loc[subset.sessionId==1]

No. sessions:  148320
Session 1: (1, [6, 7, 8, 9, 10, 11, 12, 9, 13, 9, 0, 14, 1, 15, 16, 17])


Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
7,2019-10-06 11:24:45 UTC,view,1004768,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,6,1
8,2019-10-06 11:25:54 UTC,view,1005098,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,7,1
9,2019-10-06 11:25:59 UTC,view,1005073,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,8,1
10,2019-10-06 11:26:39 UTC,view,1004871,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,9,1
11,2019-10-06 11:26:53 UTC,view,1004751,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,10,1
12,2019-10-06 11:27:05 UTC,view,1004653,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,11,1
13,2019-10-06 11:27:24 UTC,view,1005015,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,12,1
14,2019-10-06 11:28:05 UTC,view,1004871,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,9,1
15,2019-10-06 11:28:34 UTC,view,1003527,electronics.smartphone,xiaomi,00000083-8816-4d58-a9b8-f52f54186edc,13,1
16,2019-10-06 11:28:45 UTC,view,1004871,electronics.smartphone,samsung,00000083-8816-4d58-a9b8-f52f54186edc,9,1


In [17]:
import random

random.shuffle(sessions)
split = len(sessions) * 0.8
train = sessions[:int(split)]
test = sessions[int(split):]
print('No. train sessions: ', len(train))
print('No. test sessions: ', len(test))

No. train sessions:  118656
No. test sessions:  29664


In [18]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [19]:
actual_session = test[12]
target = actual_session[1][0:-1]
print(actual_session)
print(target)
subset.loc[subset.sessionId==actual_session[0]]

(66232, [3617, 192, 195])
[3617, 192]


Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
275324,2019-10-12 14:09:02 UTC,view,6000208,auto.accessories.alarm,pandora,02a85955-993c-4e33-84f2-81da144a422b,3617,66232
275325,2019-10-12 14:11:55 UTC,view,6000094,auto.accessories.alarm,starline,02a85955-993c-4e33-84f2-81da144a422b,192,66232
275326,2019-10-12 14:17:51 UTC,view,6000229,auto.accessories.alarm,starline,02a85955-993c-4e33-84f2-81da144a422b,195,66232


In [20]:
def compute_score(train, target, itemId):
    candidate_sessions = []
    for s in range(len(train)):
        if itemId in train[s][1]:
            candidate_sessions.append(train[s][1])
    
    score = 0
    for n in range(len(candidate_sessions)):
        score += jaccard(candidate_sessions[n], target)
    
    return score
    
compute_score(train=train, target=target, itemId=195)

3.8153909602419485

In [21]:
categories = subset.loc[subset.sessionId==actual_session[0]]['category_code'].unique().tolist()
candidate_items = subset.loc[subset.category_code.isin(categories)]['itemId'].unique().tolist()
candidate_items

[192,
 193,
 194,
 195,
 353,
 396,
 791,
 792,
 793,
 998,
 999,
 1000,
 1001,
 1002,
 1818,
 1996,
 2074,
 2075,
 2349,
 2688,
 2829,
 2830,
 2847,
 2897,
 3026,
 3027,
 3614,
 3615,
 3616,
 3617,
 3618,
 3619,
 3620,
 3621,
 3622,
 3623,
 3624,
 4477,
 4540,
 4541,
 4711,
 5230,
 5231,
 5426,
 5427,
 5443,
 5661,
 6098,
 6207,
 6208,
 6209,
 6210,
 6436,
 6437,
 6438,
 6503,
 6504,
 6505,
 6506,
 6715,
 6876,
 7666,
 7667,
 7807,
 7808,
 7809,
 7838,
 8176,
 8177,
 8178,
 8796,
 8797,
 8858,
 8859,
 9305,
 9306,
 9307,
 9637,
 9706,
 9735,
 9891,
 9892,
 9893,
 9907,
 10305,
 10579,
 10580,
 11421,
 11748,
 11877,
 12376,
 14631,
 15111,
 15767,
 16091,
 16685,
 17322,
 18367,
 19057,
 19323,
 20149,
 20179,
 20755,
 20924,
 20950,
 22291,
 22399,
 25701,
 26345,
 26799,
 27618,
 27704,
 28556,
 28790,
 29741,
 34546]

In [23]:
ranking = []
for i in range(len(candidate_items)):
    ranking.append((compute_score(train, target, candidate_items[i]), candidate_items[i]))

ranking.sort()
ranking.reverse()
print(ranking[0:10])

[(66.91265771294417, 192), (12.230885114147412, 791), (9.052438868681444, 193), (5.512958634450153, 2688), (3.8153909602419485, 195), (3.2293649504327253, 998), (3.1151415500731554, 792), (2.892015294646874, 3617), (2.551746245608138, 2829), (2.3040635476158426, 5661)]


In [24]:
subset.loc[subset.itemId==791]

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
1801,2019-10-08 05:48:42 UTC,view,6000227,auto.accessories.alarm,starline,00057ca1-3dfc-47c2-b463-60ab56cc1f7e,791,499
1802,2019-10-08 05:49:15 UTC,view,6000227,auto.accessories.alarm,starline,00057ca1-3dfc-47c2-b463-60ab56cc1f7e,791,499
2404,2019-10-06 09:03:53 UTC,view,6000227,auto.accessories.alarm,starline,0006ca24-345c-4c0b-b901-f76b2c959103,791,618
2926,2019-10-11 12:20:56 UTC,view,6000227,auto.accessories.alarm,starline,000862ec-aef0-46d5-ab1a-2f266d71edd2,791,759
3469,2019-10-12 11:03:06 UTC,view,6000227,auto.accessories.alarm,starline,000a0614-8553-4c4e-86c0-bc0022e87e4a,791,923
...,...,...,...,...,...,...,...,...
996489,2019-10-02 06:19:33 UTC,view,6000227,auto.accessories.alarm,starline,099c2cc0-5477-446d-bee6-c51fec2303d7,791,240726
999306,2019-10-25 06:41:18 UTC,view,6000227,auto.accessories.alarm,starline,09a3f83d-b2a2-4d4d-81d8-0e92e579bef9,791,241482
999342,2019-10-31 13:18:29 UTC,view,6000227,auto.accessories.alarm,starline,09a3fe0d-2710-403e-b38f-b4c90361a890,791,241489
999636,2019-10-01 10:33:40 UTC,view,6000227,auto.accessories.alarm,starline,09a4afa5-1179-43af-9b85-abe521a43274,791,241569
