In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
import os

path = r"./../data"

# Load DataFrames

In [2]:
df_purchases = pd.read_csv(os.path.join(path,"train_purchases.csv"))
df_sessions = pd.read_csv(os.path.join(path,"train_sessions-1.csv"))
for i in range(2,6):
    file = "train_sessions-" + str(i) + ".csv"
    df_sessions.append = pd.read_csv(os.path.join(path, file))
df_item_features = pd.read_csv(os.path.join(path,"item_features.csv"))
df_candidate_items = pd.read_csv(os.path.join(path, "candidate_items.csv"))
df_test_sessions = pd.read_csv(os.path.join(path, "test_leaderboard_sessions.csv"))

# Combine session and purchase

In [3]:
frames = [df_purchases, df_sessions]
df_full_session = pd.concat(frames, sort=True)

dict_sessions = dict()
for row in df_full_session.itertuples():
    if row[3] not in dict_sessions:
        dict_sessions[row[3]] = list()
    dict_sessions[row[3]].append(row[2])
    
"""
dict_items = dict()
for row in df_full_session.itertuples():
    if row[2] not in dict_items:
        dict_items[row[2]] = list()
    dict_items[row[2]].append(row[3])
"""

'\ndict_items = dict()\nfor row in df_full_session.itertuples():\n    if row[2] not in dict_items:\n        dict_items[row[2]] = list()\n    dict_items[row[2]].append(row[3])\n'

In [4]:
display(df_full_session)
display(dict(list(dict_sessions.items())[:3]))

Unnamed: 0,date,item_id,session_id
0,2020-12-18 21:26:47.986,15085,3
1,2020-03-13 19:36:15.507,18626,13
2,2020-08-26 19:20:32.049,24911,18
3,2020-11-02 17:16:45.92,12534,19
4,2020-02-26 18:27:44.114,13226,24
...,...,...,...
948761,2021-02-26 15:02:38.456,1373,888293
948762,2021-02-26 06:56:37.392,21872,888293
948763,2021-02-12 13:12:18.128,22719,888296
948764,2021-02-12 13:11:40.988,22862,888296


{3: [15085, 9655, 9655], 13: [18626, 15654], 18: [24911, 18316, 2507, 4026]}

# Create item-item neighborhood

In [5]:
array_candidates = df_candidate_items["item_id"]

# sparse matrix item item
sm_item_item = scipy.sparse.lil_matrix((28144,28144), dtype=np.int8)

#iterate through all sessions
for session, items in dict_sessions.items():
    #within a session compare all items with each other
    for item_2 in items:
        #only write items to the second dimension if they are possible candidates for recommendation
        if item_2 in array_candidates.values:
            #continuation of comparing all items with each other
            for item_1 in items:
                #dont recommend itself
                if item_1 != item_2:
                    #dim_1 = searched for item, dim_2 recommender candidate
                    sm_item_item[item_1, item_2] += 1

In [6]:
print("Dimensions of item_item similarity:",sm_item_item.shape)
print("Amount of elements in this matrix:", sm_item_item.count_nonzero())
print(sm_item_item.getrow(28143))

Dimensions of item_item similarity: (28144, 28144)
Amount of elements in this matrix: 2300847
  (0, 1018)	1
  (0, 13081)	1
  (0, 13922)	1
  (0, 22956)	1
  (0, 24921)	3


# Predict
1 session as is test_leaderbord_sessions.csv

In [7]:
display(df_test_sessions)
dict_test_sessions = dict()
for row in df_test_sessions.itertuples():
    if row[1] not in dict_test_sessions:
        dict_test_sessions[row[1]] = list()
    dict_test_sessions[row[1]].append(row[2])


Unnamed: 0,session_id,item_id,date
0,26,19185,2021-06-16 09:53:54.158
1,200,17089,2021-06-25 12:23:40.811
2,200,17089,2021-06-25 12:24:36.631
3,200,8060,2021-06-25 12:24:41.677
4,200,4758,2021-06-25 12:24:50.692
...,...,...,...
229349,4439653,25955,2021-06-11 10:22:57.47
229350,4439653,12179,2021-06-11 10:23:00.663
229351,4439757,2078,2021-06-30 11:42:15.073
229352,4439757,2078,2021-06-30 11:43:13.725


In [78]:
# iterate through all test sessions
df_recommendations = pd.DataFrame(columns=["session_id", "item_id", "rank"])
for session, items in dict_test_sessions.items():
    # save recommendations for this session (28144 is the total amount of available items)
    recommendations = np.zeros(28144)
    # iterate over items within a session
    for item in items:
        row = sm_item_item.getrow(item).toarray()
        row = np.array(row[0])
        # add recommendations per item to recommendations per session
        recommendations += row
    # create the three columns session_id, recommended_item_id, rank
    item_ids = recommendations.nonzero()[0]
    values = [recommendations[i] for i in indexes]
    arr_session = [session for i in range(0,len(values))]

    # create intermediate df
    df_session_rec = pd.DataFrame([arr_session, item_ids, values]).transpose()
    df_session_rec.columns = df_recommendations.columns
    df_session_rec.sort_values(by="rank", ascending=False, inplace=True)

    # cut top 100 recommendations
    df_session_rec = df_session_rec.head(100)
    new_rank = list(range(1, len(df_session_rec.index) + 1))
    df_session_rec["rank"] = new_rank
    df_recommendations = df_recommendations.append(df_session_rec) 


In [79]:
print(df_recommendations)

     session_id  item_id rank
37         26.0   3260.0    1
58         26.0   5383.0    2
21         26.0   1578.0    3
266        26.0  27416.0    4
241        26.0  25188.0    5
..          ...      ...  ...
250   4439757.0  14336.0   96
249   4439757.0  14314.0   97
248   4439757.0  14282.0   98
247   4439757.0  14216.0   99
246   4439757.0  14072.0  100

[5000000 rows x 3 columns]
