# RecSys2022

In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [2]:
candidate_items_path = "data/candidate_items.csv"
candidate_items = pd.read_csv(candidate_items_path)

items_features_path = "data/item_features.csv"
items_features = pd.read_csv(items_features_path)

sessions_path = "data/train_sessions.csv"
sessions = pd.read_csv(sessions_path)

purchases_path = "data/train_purchases.csv"
train_purchases = pd.read_csv(purchases_path)

test_final_path = "data/test_final_sessions.csv"
test_final = pd.read_csv(test_final_path)

test_leaderboard_path = "data/test_leaderboard_sessions.csv"
test_leaderboard = pd.read_csv(test_leaderboard_path)


In [3]:
items_features = candidate_items.merge(items_features,on="item_id", how = 'inner')

In [4]:
train_set = items_features.pivot_table(
            index='item_id', columns='feature_category_id', values='feature_value_id', aggfunc=np.sum, fill_value=0)

In [5]:
train_set

feature_category_id,1,2,3,4,5,6,7,8,9,10,...,64,65,66,67,68,69,70,71,72,73
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0,0,793,618,605,0,837,0,0,0,...,0,521,0,0,373,538,0,0,75,544
8,0,0,793,618,605,0,798,0,0,0,...,0,521,0,0,351,592,0,0,75,544
9,0,0,793,618,605,0,536,0,0,0,...,0,521,0,0,222,805,0,0,649,544
19,0,0,0,618,0,399,619,0,0,0,...,0,521,0,0,103,592,0,0,75,544
20,461,0,793,618,605,0,837,0,0,0,...,0,0,0,0,31,116,0,0,75,544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28128,0,0,793,618,605,0,2,0,0,0,...,0,521,0,0,745,885,0,0,75,544
28131,0,0,0,618,0,0,490,0,0,0,...,0,521,0,0,767,592,0,0,7,544
28132,0,53,0,0,0,0,0,0,0,0,...,0,0,264,382,659,0,0,0,0,544
28133,0,0,0,618,0,0,394,0,0,0,...,0,0,0,0,744,885,0,0,75,544


In [6]:
items_classifier = NearestNeighbors(n_neighbors=10)
items_model = items_classifier.fit(train_set)

In [7]:
items_mapping = {v: k for k, v in enumerate(list(train_set.index))}

In [8]:
train_set.index

Int64Index([    4,     8,     9,    19,    20,    26,    33,    40,    51,
               54,
            ...
            28093, 28101, 28105, 28112, 28122, 28128, 28131, 28132, 28133,
            28137],
           dtype='int64', name='item_id', length=4990)

In [9]:
train_set.iloc[items_mapping[4]]

feature_category_id
1       0
2       0
3     793
4     618
5     605
     ... 
69    538
70      0
71      0
72     75
73    544
Name: 4, Length: 71, dtype: int64

In [10]:
  list(items_classifier.kneighbors([train_set.iloc[items_mapping[4]]])[1])

[array([   0, 1255,  672, 2057,  833, 2237,   31, 3496,  525, 1550])]

In [11]:
def get_ten_most_similar_items(item: str):
    if item not in items_mapping:
        item = train_purchases['item_id'].value_counts().index.tolist()[0]

    top_ten_indexes = list(items_classifier.kneighbors([train_set.iloc[items_mapping[int(item)]]])[1])

    return [train_set.index[index] for index in top_ten_indexes]
        

In [12]:
train_purchases['item_id'].value_counts()

8060     8451
26853    3829
19882    3355
8622     2745
2447     2726
         ... 
20141       1
24646       1
1581        1
14660       1
3302        1
Name: item_id, Length: 18907, dtype: int64

In [13]:
sessions = sessions.append(train_purchases)
sessions['item_id'] = sessions['item_id'].apply(str)
grouped_sessions = sessions.groupby("session_id").agg(list)
grouped_sessions.head(10)

Unnamed: 0_level_0,item_id,date
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,"[9655, 9655, 15085]","[2020-12-18 21:25:00.373, 2020-12-18 21:19:48...."
13,"[15654, 18626]","[2020-03-13 19:35:27.136, 2020-03-13 19:36:15...."
18,"[18316, 2507, 4026, 24911]","[2020-08-26 19:18:30.833, 2020-08-26 19:16:31...."
19,"[25772, 6341, 25555, 20033, 8281, 8268, 4385, ...","[2020-11-02 16:31:18.543, 2020-11-02 16:34:33...."
24,"[2927, 11662, 2927, 28075, 434, 16064, 10414, ...","[2020-02-26 17:23:04.84, 2020-02-26 17:26:29.1..."
28,"[11529, 11529, 16895, 21902, 26394]","[2020-05-18 12:51:10.994, 2020-05-18 12:51:32...."
31,"[25972, 16289, 2069, 27579, 26457, 2069, 4230,...","[2021-04-20 19:38:03.816, 2021-04-20 19:39:17...."
36,"[26536, 25417, 14532]","[2020-06-21 10:29:51.337, 2020-06-21 10:29:08...."
42,"[10395, 20523, 17472, 22492, 11784]","[2021-03-01 15:15:53.207, 2021-03-01 15:14:01...."
44,"[22747, 17089, 4028]","[2020-11-27 20:45:10.302, 2020-11-27 20:45:43...."


In [14]:
from gensim.models import Word2Vec

# training the W2V model
model = Word2Vec(sentences=grouped_sessions['item_id'], 
                 window=100, 
                 min_count=1, 
                 workers=4)

In [15]:
test_leaderboard['item_id'] = test_leaderboard['item_id'].apply(str)
grouped_test_sessions = sessions.groupby("session_id").agg(list)
grouped_test_sessions['item_id'] = grouped_test_sessions['item_id'].apply((lambda x: x[-1]))
grouped_test_sessions.head(10)

Unnamed: 0_level_0,item_id,date
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,15085,"[2020-12-18 21:25:00.373, 2020-12-18 21:19:48...."
13,18626,"[2020-03-13 19:35:27.136, 2020-03-13 19:36:15...."
18,24911,"[2020-08-26 19:18:30.833, 2020-08-26 19:16:31...."
19,12534,"[2020-11-02 16:31:18.543, 2020-11-02 16:34:33...."
24,13226,"[2020-02-26 17:23:04.84, 2020-02-26 17:26:29.1..."
28,26394,"[2020-05-18 12:51:10.994, 2020-05-18 12:51:32...."
31,8345,"[2021-04-20 19:38:03.816, 2021-04-20 19:39:17...."
36,14532,"[2020-06-21 10:29:51.337, 2020-06-21 10:29:08...."
42,11784,"[2021-03-01 15:15:53.207, 2021-03-01 15:14:01...."
44,4028,"[2020-11-27 20:45:10.302, 2020-11-27 20:45:43...."


In [None]:
grp_by_session = test_leaderboard.groupby(["session_id"])
session_id_list = []
item_id_list = []
rank_list = []
index = 0
print("number of index:"+str(len(grp_by_session)), flush=True)
for name, group in grp_by_session:
    session_id_list += [name] * 100
    item_id_list.extend([get_ten_most_similar_items(x[0]) for x in model.wv.most_similar(list(grouped_test_sessions['item_id'])[index], topn=10)])
    rank_list += range(1,101,1)
    print("index:"+str(index),flush=True)
    index += 1

number of index:50000
index:0
index:1
index:2
index:3
index:4
index:5
index:6
index:7
index:8
index:9
index:10
index:11
index:12
index:13
index:14
index:15
index:16
index:17
index:18
index:19
index:20
index:21
index:22
index:23
index:24
index:25
index:26
index:27
index:28
index:29
index:30
index:31
index:32
index:33
index:34
index:35
index:36
index:37
index:38
index:39
index:40
index:41
index:42
index:43
index:44
index:45
index:46
index:47
index:48
index:49
index:50
index:51
index:52
index:53
index:54
index:55
index:56
index:57
index:58
index:59
index:60
index:61
index:62
index:63
index:64
index:65
index:66
index:67
index:68
index:69
index:70
index:71
index:72
index:73
index:74
index:75
index:76
index:77
index:78
index:79
index:80
index:81
index:82
index:83
index:84
index:85
index:86
index:87
index:88
index:89
index:90
index:91
index:92
index:93
index:94
index:95
index:96
index:97
index:98
index:99
index:100
index:101
index:102
index:103
index:104
index:105
index:106
index:107
index:10

index:829
index:830
index:831
index:832
index:833
index:834
index:835
index:836
index:837
index:838
index:839
index:840
index:841
index:842
index:843
index:844
index:845
index:846
index:847
index:848
index:849
index:850
index:851
index:852
index:853
index:854
index:855
index:856
index:857
index:858
index:859
index:860
index:861
index:862
index:863
index:864
index:865
index:866
index:867
index:868
index:869
index:870
index:871
index:872
index:873
index:874
index:875
index:876
index:877
index:878
index:879
index:880
index:881
index:882
index:883
index:884
index:885
index:886
index:887
index:888
index:889
index:890
index:891
index:892
index:893
index:894
index:895
index:896
index:897
index:898
index:899
index:900
index:901
index:902
index:903
index:904
index:905
index:906
index:907
index:908
index:909
index:910
index:911
index:912
index:913
index:914
index:915
index:916
index:917
index:918
index:919
index:920
index:921
index:922
index:923
index:924
index:925
index:926
index:927
index:928


index:1590
index:1591
index:1592
index:1593
index:1594
index:1595
index:1596
index:1597
index:1598
index:1599
index:1600
index:1601
index:1602
index:1603
index:1604
index:1605
index:1606
index:1607
index:1608
index:1609
index:1610
index:1611
index:1612
index:1613
index:1614
index:1615
index:1616
index:1617
index:1618
index:1619
index:1620
index:1621
index:1622
index:1623
index:1624
index:1625
index:1626
index:1627
index:1628
index:1629
index:1630
index:1631
index:1632
index:1633
index:1634
index:1635
index:1636
index:1637
index:1638
index:1639
index:1640
index:1641
index:1642
index:1643
index:1644
index:1645
index:1646
index:1647
index:1648
index:1649
index:1650
index:1651
index:1652
index:1653
index:1654
index:1655
index:1656
index:1657
index:1658
index:1659
index:1660
index:1661
index:1662
index:1663
index:1664
index:1665
index:1666
index:1667
index:1668
index:1669
index:1670
index:1671
index:1672
index:1673
index:1674
index:1675
index:1676
index:1677
index:1678
index:1679
index:1680

index:2335
index:2336
index:2337
index:2338
index:2339
index:2340
index:2341
index:2342
index:2343
index:2344
index:2345
index:2346
index:2347
index:2348
index:2349
index:2350
index:2351
index:2352
index:2353
index:2354
index:2355
index:2356
index:2357
index:2358
index:2359
index:2360
index:2361
index:2362
index:2363
index:2364
index:2365
index:2366
index:2367
index:2368
index:2369
index:2370
index:2371
index:2372
index:2373
index:2374
index:2375
index:2376
index:2377
index:2378
index:2379
index:2380
index:2381
index:2382
index:2383
index:2384
index:2385
index:2386
index:2387
index:2388
index:2389
index:2390
index:2391
index:2392
index:2393
index:2394
index:2395
index:2396
index:2397
index:2398
index:2399
index:2400
index:2401
index:2402
index:2403
index:2404
index:2405
index:2406
index:2407
index:2408
index:2409
index:2410
index:2411
index:2412
index:2413
index:2414
index:2415
index:2416
index:2417
index:2418
index:2419
index:2420
index:2421
index:2422
index:2423
index:2424
index:2425

index:3080
index:3081
index:3082
index:3083
index:3084
index:3085
index:3086
index:3087
index:3088
index:3089
index:3090
index:3091
index:3092
index:3093
index:3094
index:3095
index:3096
index:3097
index:3098
index:3099
index:3100
index:3101
index:3102
index:3103
index:3104
index:3105
index:3106
index:3107
index:3108
index:3109
index:3110
index:3111
index:3112
index:3113
index:3114
index:3115
index:3116
index:3117
index:3118
index:3119
index:3120
index:3121
index:3122
index:3123
index:3124
index:3125
index:3126
index:3127
index:3128
index:3129
index:3130
index:3131
index:3132
index:3133
index:3134
index:3135
index:3136
index:3137
index:3138
index:3139
index:3140
index:3141
index:3142
index:3143
index:3144
index:3145
index:3146
index:3147
index:3148
index:3149
index:3150
index:3151
index:3152
index:3153
index:3154
index:3155
index:3156
index:3157
index:3158
index:3159
index:3160
index:3161
index:3162
index:3163
index:3164
index:3165
index:3166
index:3167
index:3168
index:3169
index:3170

In [18]:
print("number of index:"+str(len(grp_by_session)), flush=True)


number of index:50000


In [11]:
list(train_set.index)
with open('data/Pickles/items_mapping.pickle', 'wb') as output:
    pickle.dump(list(train_set.index), output)

In [10]:
test_leaderboard['count'] = 1
session_ids = CategoricalDtype(sorted(test_leaderboard.session_id.unique()), ordered=True)
item_ids = CategoricalDtype(sorted(sessions.item_id.unique()), ordered=True)

row = test_leaderboard.session_id.astype(session_ids).cat.codes
col = test_leaderboard.item_id.astype(item_ids).cat.codes
row = row.drop(col.loc[col == -1].index)
test_leaderboard_c = test_leaderboard.drop(col.loc[col == -1].index)
col = col.drop(col.loc[col == -1].index)
session_matrix = csr_matrix((test_leaderboard_c["count"], (row, col)), \
                        shape=(session_ids.categories.size, item_ids.categories.size))


In [11]:
session_matrix = pd.DataFrame.sparse.from_spmatrix(session_matrix)


In [12]:
session_predictions = sessions_model.kneighbors(session_matrix.head(1000), n_neighbors=10)[1]

KeyboardInterrupt: 

In [None]:
session_index = sessions[['session_id']].drop_duplicates().reset_index(drop=True).reset_index()
session_index

Unnamed: 0,index,session_id
0,0,3
1,1,13
2,2,18
3,3,19
4,4,24
...,...,...
999995,999995,4439986
999996,999996,4439990
999997,999997,4439994
999998,999998,4439999


In [None]:
session_predictions = pd.DataFrame(list(np.concatenate(session_predictions).flat)).rename(columns={0:"index"})

In [None]:
session_predictions.reset_index(inplace=True)

In [None]:
session_predictions

Unnamed: 0,level_0,index
0,0,230604
1,1,36203
2,2,67313
3,3,962539
4,4,482172
...,...,...
9995,9995,325998
9996,9996,271371
9997,9997,162167
9998,9998,26575


In [None]:
session_predictions = session_predictions.merge(session_index, on='index')

In [None]:
session_predictions = session_predictions[['session_id']]
session_predictions

Unnamed: 0,session_id
0,1027045
1,161367
2,300231
3,4274564
4,2141747
...,...
9995,2022407
9996,1449253
9997,1207865
9998,118250


In [None]:
test_leaderboard_sessions = pd.DataFrame(test_leaderboard['session_id'].unique())[0].repeat(10).head(10000)

In [None]:
test_leaderboard_sessions.reset_index(inplace=True, drop=True)
test_leaderboard_sessions

0          26
1          26
2          26
3          26
4          26
        ...  
9995    87478
9996    87478
9997    87478
9998    87478
9999    87478
Name: 0, Length: 10000, dtype: int64

In [None]:
test_session = pd.concat([session_predictions.reset_index(drop=True), test_leaderboard_sessions.reset_index(drop=True)], axis=1, join='inner')

In [None]:
test_session.head(12)

Unnamed: 0,session_id,0
0,1027045,26
1,161367,26
2,300231,26
3,4274564,26
4,2141747,26
5,2782663,26
6,179706,26
7,561346,26
8,4062908,26
9,1563957,26


In [None]:
test_session = test_session.merge(train_purchases, on="session_id", how ='inner').rename(columns={0:'session'})[['session','item_id']]

In [None]:
test_session

Unnamed: 0,session,item_id
0,26,7019
1,26,27852
2,26,25166
3,26,5383
4,26,3425
...,...,...
9995,87478,15389
9996,87478,17740
9997,87478,21434
9998,87478,27588


In [None]:
test_session.merge(items_features,on="item_id", how = 'inner')

Unnamed: 0,session,item_id,feature_category_id,feature_value_id
0,26,7019,56,365
1,26,7019,68,373
2,26,7019,11,859
3,26,7019,55,267
4,26,7019,7,490
...,...,...,...,...
196771,87478,27588,62,801
196772,87478,27588,55,543
196773,87478,27588,15,584
196774,87478,27588,4,618


In [None]:
items_pivot_table = test_session.merge(items_features,on="item_id", how = 'inner').pivot_table(
            index='item_id', columns='feature_category_id', values='feature_value_id', aggfunc=np.sum, fill_value=0)

In [None]:
items_pivot_table

feature_category_id,1,2,3,4,5,6,7,8,9,10,...,64,65,66,67,68,69,70,71,72,73
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0,0,1586,1236,1210,0,1072,0,0,0,...,0,1042,0,0,444,1610,0,0,1298,1088
26,0,0,3965,3090,3025,0,1970,3180,0,3155,...,0,0,0,0,3695,4025,0,0,375,2720
33,0,0,6344,5752,4840,0,4952,0,0,0,...,0,4168,0,0,5912,4736,0,0,1752,728
51,0,0,793,618,605,0,452,0,0,0,...,0,521,0,0,895,780,0,0,75,544
64,0,0,4758,3708,3630,0,2712,0,0,0,...,0,3126,0,0,2010,3552,0,0,42,546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28128,0,0,3965,3090,3025,0,10,0,0,0,...,0,2605,0,0,3725,4425,0,0,375,2720
28131,0,0,0,618,0,0,490,0,0,0,...,0,521,0,0,767,592,0,0,7,544
28132,0,265,0,0,0,0,0,0,0,0,...,0,0,1320,1910,3295,0,0,0,0,2720
28133,0,0,0,4944,0,0,3152,0,0,0,...,0,0,0,0,5952,7080,0,0,600,4352


In [None]:
items_predictions=items_model.kneighbors(items_pivot_table, n_neighbors=10)[1]
items_predictions = pd.DataFrame(list(np.concatenate(items_predictions).flat)).rename(columns={0:"index"})

In [None]:
items_predictions

Unnamed: 0,index
0,2
1,3687
2,1879
3,84
4,2298
...,...
23265,4326
23266,3740
23267,3355
23268,2265


In [None]:
items_predictions = items_predictions.rename(columns={'item_id':'index'})
items_predictions

Unnamed: 0,index
0,2
1,3687
2,1879
3,84
4,2298
...,...
23265,4326
23266,3740
23267,3355
23268,2265


In [None]:
items_predictions=items_predictions.reset_index()

In [None]:
items_predictions=items_predictions.merge(candidate_items.reset_index(), on='index').sort_values(by=['level_0']).reset_index(drop=True)[['item_id']]

In [None]:
t = pd.DataFrame(items_features[items_features['item_id'].isin(test_session['item_id'])]['item_id'].drop_duplicates().repeat(10)).rename(columns={0:'item_id'})
p =items_predictions.reset_index()
t = t.reset_index(drop=True).reset_index()
p.merge(t, on='index', how='inner')


Unnamed: 0,index,item_id_x,item_id_y
0,0,9,9
1,1,20856,9
2,2,10456,9
3,3,484,9
4,4,12955,9
...,...,...,...
23265,23265,24429,28137
23266,23266,21190,28137
23267,23267,18953,28137
23268,23268,12707,28137


In [None]:
items_features[items_features['item_id'].isin(test_session['item_id'])][['item_id']].drop_duplicates()

Unnamed: 0,item_id
48,9
117,26
142,33
185,51
227,64
...,...
109938,28128
109963,28131
109983,28132
110001,28133


In [None]:
session_predictions

Unnamed: 0,session_id
0,230604
1,36203
2,67313
3,962539
4,482172
...,...
95,508583
96,650048
97,371777
98,246596


In [None]:
test_leaderboard.head(10)

Unnamed: 0,session_id,item_id,date,count
0,26,19185,2021-06-16 09:53:54.158,1
1,200,17089,2021-06-25 12:23:40.811,1
2,200,17089,2021-06-25 12:24:36.631,1
3,200,8060,2021-06-25 12:24:41.677,1
4,200,4758,2021-06-25 12:24:50.692,1
5,205,8194,2021-06-11 00:28:07.058,1
6,495,6853,2021-06-14 22:13:06.741,1
7,521,26471,2021-06-19 13:50:03.09,1
8,587,22204,2021-06-01 16:43:22.80,1
9,721,13914,2021-06-19 18:46:57.263,1


In [None]:
col.drop(col.loc[col == -1].index)

0         16056
1         14295
2         14295
3          6726
4          3957
          ...  
229349    21699
229350    10169
229351     1720
229352     1720
229353     5650
Length: 224850, dtype: int16

In [None]:
sessions

Unnamed: 0,session_id,item_id,date,count
0,3,9655,2020-12-18 21:25:00.373,1
1,3,9655,2020-12-18 21:19:48.093,1
2,13,15654,2020-03-13 19:35:27.136,1
3,18,18316,2020-08-26 19:18:30.833,1
4,18,2507,2020-08-26 19:16:31.211,1
...,...,...,...,...
4743815,4440001,20409,2020-10-30 23:37:20.658,1
4743816,4440001,14155,2020-10-30 23:31:56.607,1
4743817,4440001,14303,2020-10-30 23:36:17.934,1
4743818,4440001,27852,2020-10-30 23:39:55.186,1


In [None]:
sessions_train = sessions[['session_id', 'item_id']].groupby('session_id').agg(list).reset_index()['item_id']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X = cv.fit_transform(sessions_train) 
X = X.toarray()

AttributeError: 'list' object has no attribute 'lower'