In [37]:
import pandas as pd
import numpy as np
import re

In [38]:
df = pd.read_csv("sessions_train.csv")

df = df[df["locale"]=="DE"].drop(columns={"locale"}).reset_index(drop=True)

# https://github.com/rn5l/session-rec/blob/master/algorithms/knn/iknn.py
class ItemKNN:
    '''
    ItemKNN(n_sims = 100, lmbd = 20, alpha = 0.5, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time')
    Item-to-item predictor that computes the the similarity to all items to the given item.
    Similarity of two items is given by:
    .. math::
        s_{i,j}=\sum_{s}I\{(s,i)\in D & (s,j)\in D\} / (supp_i+\\lambda)^{\\alpha}(supp_j+\\lambda)^{1-\\alpha}
    Parameters
    --------
    n_sims : int
        Only give back non-zero scores to the N most similar items. Should be higher or equal than the cut-off of your evaluation. (Default value: 100)
    lmbd : float
        Regularization. Discounts the similarity of rare items (incidental co-occurrences). (Default value: 20)
    alpha : float
        Balance between normalizing with the supports of the two items. 0.5 gives cosine similarity, 1.0 gives confidence (as in association rules).
    session_key : string
        header of the session ID column in the input file (default: 'SessionId')
    item_key : string
        header of the item ID column in the input file (default: 'ItemId')
    time_key : string
        header of the timestamp column in the input file (default: 'Time')
    '''

    def __init__(self, n_sims=100, lmbd=20, alpha=0.5, session_key='SessionId', item_key='ItemId', time_key='Time'):
        self.n_sims = n_sims
        self.lmbd = lmbd
        self.alpha = alpha
        self.item_key = item_key
        self.session_key = session_key
        self.time_key = time_key

    def fit(self, data):
        '''
        Trains the predictor.
        Parameters
        --------
        data: pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
        '''
        data.set_index(np.arange(len(data)), inplace=True)
        itemids = data[self.item_key].unique()
        n_items = len(itemids)
        data = pd.merge(data, pd.DataFrame({self.item_key: itemids, 'ItemIdx': np.arange(len(itemids))}),
                        on=self.item_key, how='inner')
        sessionids = data[self.session_key].unique()
        data = pd.merge(data, pd.DataFrame({self.session_key: sessionids, 'SessionIdx': np.arange(len(sessionids))}),
                        on=self.session_key, how='inner')
        supp = data.groupby('SessionIdx').size()
        session_offsets = np.zeros(len(supp) + 1, dtype=np.int32)
        session_offsets[1:] = supp.cumsum()
        index_by_sessions = data.sort_values(['SessionIdx', self.time_key]).index.values
        supp = data.groupby('ItemIdx').size()
        item_offsets = np.zeros(n_items + 1, dtype=np.int32)
        item_offsets[1:] = supp.cumsum()
        index_by_items = data.sort_values(['ItemIdx', self.time_key]).index.values
        self.sims = dict()
        for i in range(n_items):
            iarray = np.zeros(n_items)
            start = item_offsets[i]
            end = item_offsets[i + 1]
            for e in index_by_items[start:end]:
                uidx = data.SessionIdx.values[e]
                ustart = session_offsets[uidx]
                uend = session_offsets[uidx + 1]
                user_events = index_by_sessions[ustart:uend]
                iarray[data.ItemIdx.values[user_events]] += 1
            iarray[i] = 0
            norm = np.power((supp[i] + self.lmbd), self.alpha) * np.power((supp.values + self.lmbd), (1.0 - self.alpha))
            norm[norm == 0] = 1
            iarray = iarray / norm
            indices = np.argsort(iarray)[-1:-1 - self.n_sims:-1]
            self.sims[itemids[i]] = pd.Series(data=iarray[indices], index=itemids[indices])

    def predict_next(self, session_id, input_item_id, predict_for_item_ids, skip=False, mode_type='view',
                     timestamp=0):
        '''
        Gives predicton scores for a selected set of items on how likely they be the next item in the session.
        Parameters
        --------
        session_id : int or string
            The session IDs of the event.
        input_item_id : int or string
            The item ID of the event. Must be in the set of item IDs of the training set.
        predict_for_item_ids : 1D array
            IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
        Returns
        --------
        out : pandas.Series
            Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.
        '''
        preds = np.zeros(len(predict_for_item_ids))
        sim_list = self.sims[input_item_id]
        mask = np.in1d(predict_for_item_ids, sim_list.index)
        preds[mask] = sim_list[predict_for_item_ids[mask]]
        return pd.Series(data=preds, index=predict_for_item_ids)


# Función para reorganizar el DataFrame
def reformat_dataframe(df):

    # Convertir las cadenas en matrices de NumPy
    df["prev_items"] = df["prev_items"].apply(lambda x: re.findall(r"'([^']*)'", x))
    session_id_col = []
    item_id_col = []
    time_col = []

    for i, row in df.iterrows():
        session_id = i
        for j, item_id in enumerate(row["prev_items"]):
            session_id_col.append(session_id)
            item_id_col.append(item_id)
            time_col.append(j)

        session_id_col.append(session_id)
        item_id_col.append(row["next_item"])
        time_col.append(len(row["prev_items"]))

    new_df = pd.DataFrame({"SessionId": session_id_col, "ItemId": item_id_col, "Time": time_col})
    return new_df

# Reorganizar el DataFrame
new_data = reformat_dataframe(df)

In [190]:
df

Unnamed: 0,prev_items,next_item
0,"[B09W9FND7K, B09JSPLN1M]",B09M7GY217
1,"[B076THCGSG, B007MO8IME, B08MF65MLV, B001B4TKA0]",B001B4THSA
2,"[B0B1LGXWDS, B00AZYORS2, B0B1LGXWDS, B00AZYORS...",B0767DTG2Q
3,"[B09XMTWDVT, B0B4MZZ8MB, B0B7HZ2GWX, B09XMTWDV...",B0B4R9NN4B
4,"[B09Y5CSL3T, B09Y5DPTXN, B09FKD61R8]",B0BGVBKWGZ
...,...,...
1111411,"[B06X9BB2D7, B09RWWGXZJ]",B09RWWWYGZ
1111412,"[B0BK2WGCN4, B08H93ZRLL, B0BK2WGCN4, B08H93ZRLL]",B001BWJEXK
1111413,"[B08X1SDBLB, B06WVZRBJ1]",B09YRTCM8X
1111414,"[B0B8NNHR5N, B0BBCCB2S1, B09R222SDP, B0B5FBHX8...",B07XJ3H1RM


In [39]:
new_data

Unnamed: 0,SessionId,ItemId,Time
0,0,B09W9FND7K,0
1,0,B09JSPLN1M,1
2,0,B09M7GY217,2
3,1,B076THCGSG,0
4,1,B007MO8IME,1
...,...,...,...
5948394,1111414,3750524505,8
5948395,1111414,B07XJ3H1RM,9
5948396,1111415,B09Y5SSN7R,0
5948397,1111415,3731861860,1


In [56]:
# Crear y entrenar el modelo ItemKNN
iknn = ItemKNN(n_sims=100, lmbd=20, alpha=0.5, session_key="SessionId",
               item_key="ItemId", time_key="Time")
iknn.fit(new_data)

In [208]:
#import pickle
# create a binary pickle file 
#f = open("iknn.pkl","wb")

# write the python object (dict) to pickle file
#pickle.dump(iknn.sims,f)

# close file
#f.close()

In [209]:
import pickle
file = open("iknn.pkl",'rb')
iknn2 = pickle.load(file)

In [212]:
primer_dic = iknn.sims

In [213]:
segundo_dic = {}

for clave, valores in primer_dic.items():
    segundo_dic[clave] = list(valores.keys())

In [217]:
objetos = new_data.ItemId.unique()

In [219]:

import re

In [220]:
def str2list(x):
    x = x.replace('[', '').replace(']', '').replace("'", '').replace('\n', ' ').replace('\r', ' ')
    l = [i for i in x.split() if i]
    return l

In [270]:
#test = test[test["locale"].isin(["DE","UK","JP"])].drop(columns={"locale"}).reset_index(drop=True)
#test['last_item'] = test['prev_items'].apply(lambda x: str2list(x)[-1])
#test['last_item'] = test['prev_items'].apply(lambda x: str2list(x)[-1])


#total_items = set(df_next["item"].unique())

#len(set(test["last_item"].unique()) - total_items)

In [297]:
test = pd.read_csv("sessions_test_task1.csv")

train = pd.read_csv("sessions_train.csv")

test_de = test[test["locale"]=="DE"]
test_uk = test[test["locale"]=="UK"]
test_jp = test[test["locale"]=="JP"]


test_de['last_item'] = test_de['prev_items'].apply(lambda x: str2list(x)[-1])
test_uk['last_item'] = test_uk['prev_items'].apply(lambda x: str2list(x)[-1])
test_jp['last_item'] = test_jp['prev_items'].apply(lambda x: str2list(x)[-1])




In [298]:
import pickle

In [299]:
file = open("next_item_map.pkl", "rb")
next_item_map = pickle.load(file)

DE

In [300]:
test_de["next_item_prediction1"] = test_de["last_item"].map(segundo_dic)
test_de["next_item_prediction2"] = test_de["last_item"].map(next_item_map)
test_de['next_item_prediction'] = test_de.next_item_prediction1.combine_first(test_de.next_item_prediction2)

UK

In [301]:
file = open("iknn_uk.pkl", "rb")
primer_dic_uk = pickle.load(file)

In [302]:
segundo_dic_uk = {}

for clave, valores in primer_dic_uk.items():
    segundo_dic_uk[clave] = list(valores.keys())

In [303]:
test_uk["next_item_prediction1"] = test_uk["last_item"].map(segundo_dic_uk)
test_uk["next_item_prediction2"] = test_uk["last_item"].map(next_item_map)
test_uk['next_item_prediction'] = test_uk.next_item_prediction1.combine_first(test_uk.next_item_prediction2)

JP

In [304]:
file = open("iknn_jp.pkl", "rb")
primer_dic_jp = pickle.load(file)

In [305]:
segundo_dic_jp = {}

for clave, valores in primer_dic_jp.items():
    segundo_dic_jp[clave] = list(valores.keys())

In [306]:
test_jp["next_item_prediction1"] = test_jp["last_item"].map(segundo_dic_jp)
test_jp["next_item_prediction2"] = test_jp["last_item"].map(next_item_map)
test_jp['next_item_prediction'] = test_jp.next_item_prediction1.combine_first(test_jp.next_item_prediction2)

BASELINE

In [312]:
import warnings
warnings.simplefilter('ignore')

import gc
import re
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm

# df_prod = pd.read_csv('data/products_train.csv')
# df_prod

df_sess = pd.read_csv('sessions_train.csv')


df_test = pd.read_csv('sessions_test_task1.csv')


def str2list(x):
    x = x.replace('[', '').replace(']', '').replace("'", '').replace('\n', ' ').replace('\r', ' ')
    l = [i for i in x.split() if i]
    return l

next_item_dict = defaultdict(list)

for _, row in tqdm(df_sess.iterrows(), total=len(df_sess)):
    prev_items = str2list(row['prev_items'])
    next_item = row['next_item']
    prev_items_length = len(prev_items)
    if prev_items_length <= 1:
        next_item_dict[prev_items[0]].append(next_item)
    else:
        for i, item in enumerate(prev_items[:-1]):
            next_item_dict[item].append(prev_items[i+1])
        next_item_dict[prev_items[-1]].append(next_item)

for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    prev_items = str2list(row['prev_items'])
    prev_items_length = len(prev_items)
    if prev_items_length <= 1:
        continue
    else:
        for i, item in enumerate(prev_items[:-1]):
            next_item_dict[item].append(prev_items[i+1])

next_item_map = {}

for item in tqdm(next_item_dict):
    counter = Counter(next_item_dict[item])
    next_item_map[item] = [i[0] for i in counter.most_common(100)]

k = []
v = []

for item in next_item_dict:
    k.append(item)
    v.append(next_item_dict[item])
    
df_next = pd.DataFrame({'item': k, 'next_item': v})
df_next = df_next.explode('next_item').reset_index(drop=True)


top200 = df_next['next_item'].value_counts().index.tolist()[:200]

  0%|          | 0/3606249 [00:00<?, ?it/s]

  0%|          | 0/316971 [00:00<?, ?it/s]

  0%|          | 0/1334818 [00:00<?, ?it/s]

JUNTAMOS TODO EN test

In [315]:
df_test = pd.concat([test_de[["locale","prev_items","next_item_prediction"]], 
                     test_jp[["locale","prev_items","next_item_prediction"]],
                     test_uk[["locale","prev_items","next_item_prediction"]]], axis=0)

In [316]:
df_test

Unnamed: 0,locale,prev_items,next_item_prediction
0,DE,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,"[B099NR3X6D, B07LG5T3V9, B08496TCCQ, B099NS1XP..."
1,DE,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],"[B00R9R5ND6, B004ZXMV4Q, B097HPKM63, B08NJP33W..."
2,DE,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,"[B08C9Q7QVK, B0B5QNFWJ1, B0BBVB89CS, B0B5TFLBC..."
3,DE,['B08KQBYV43' '3955350843' '3955350843' '39553...,"[3772476953, 395535086X, 3955350878, 377247791..."
4,DE,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,"[B09J8V18FL, B09J8T6TTH, B09J8SKX9G, B09J8VPTT..."
...,...,...,...
316966,UK,['B077SZ2C3Y' 'B0B14M3VZX'],"[B08X9L5RGD, B07BKX8KH7, B09SV27FHD, B07LB4YTH..."
316967,UK,['B08KFHDPY9' 'B0851KTSRZ' 'B08KFHDPY9' 'B0851...,"[B09CPNS7XV, B0989BHLSY, B09895QPQF, B09CPP92Q..."
316968,UK,['B07PY1N81F' 'B07Q1Z8SQN' 'B07PY1N81F' 'B07Q1...,"[B09HKZBNZH, B07PY1N81F, B07PY1NG3X, B09HZSRJW..."
316969,UK,['B01MCQMORK' 'B09JYZ325W'],"[B07TR5LQSL, B08FB464L7, B005G3DI32, B01MCQMOR..."


ARREGLO PREDS DEL BASELINE

In [317]:
preds = []

for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    pred_orig = row['next_item_prediction']
    pred = pred_orig
    prev_items = str2list(row['prev_items'])
    if type(pred) == float:
        pred = top200[:100]
    else:
        if len(pred_orig) < 100:
            for i in top200:
                if i not in pred_orig and i not in prev_items:
                    pred.append(i)
                if len(pred) >= 100:
                    break
        else:
            pred = pred[:100]
    preds.append(pred)

  0%|          | 0/316971 [00:00<?, ?it/s]

In [318]:
df_test['next_item_prediction'] = preds
df_test

Unnamed: 0,locale,prev_items,next_item_prediction
0,DE,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,"[B099NR3X6D, B07LG5T3V9, B08496TCCQ, B099NS1XP..."
1,DE,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],"[B00R9R5ND6, B004ZXMV4Q, B097HPKM63, B08NJP33W..."
2,DE,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,"[B08C9Q7QVK, B0B5QNFWJ1, B0BBVB89CS, B0B5TFLBC..."
3,DE,['B08KQBYV43' '3955350843' '3955350843' '39553...,"[3772476953, 395535086X, 3955350878, 377247791..."
4,DE,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,"[B09J8V18FL, B09J8T6TTH, B09J8SKX9G, B09J8VPTT..."
...,...,...,...
316966,UK,['B077SZ2C3Y' 'B0B14M3VZX'],"[B08X9L5RGD, B07BKX8KH7, B09SV27FHD, B07LB4YTH..."
316967,UK,['B08KFHDPY9' 'B0851KTSRZ' 'B08KFHDPY9' 'B0851...,"[B09CPNS7XV, B0989BHLSY, B09895QPQF, B09CPP92Q..."
316968,UK,['B07PY1N81F' 'B07Q1Z8SQN' 'B07PY1N81F' 'B07Q1...,"[B09HKZBNZH, B07PY1N81F, B07PY1NG3X, B09HZSRJW..."
316969,UK,['B01MCQMORK' 'B09JYZ325W'],"[B07TR5LQSL, B08FB464L7, B005G3DI32, B01MCQMOR..."


In [319]:
df_test['next_item_prediction'].apply(len).describe()

count    316971.0
mean        100.0
std           0.0
min         100.0
25%         100.0
50%         100.0
75%         100.0
max         100.0
Name: next_item_prediction, dtype: float64

In [320]:
df_test[['locale', 'next_item_prediction']].to_parquet('submission_task1.parquet', engine='pyarrow')

In [323]:
test_sessions=pd.read_csv('sessions_test_task1.csv')
def check_predictions(predictions, check_products=False):
    """
    These tests need to pass as they will also be applied on the evaluator
    """
    test_locale_names = test_sessions['locale'].unique()
    for locale in test_locale_names:
        sess_test = test_sessions.query(f'locale == "{locale}"')
        preds_locale =  predictions[predictions['locale'] == sess_test['locale'].iloc[0]]
        assert sorted(preds_locale.index.values) == sorted(sess_test.index.values), f"Session ids of {locale} doesn't match"

        if check_products:
            # This check is not done on the evaluator
            # but you can run it to verify there is no mixing of products between locales
            # Since the ground truth next item will always belong to the same locale
            # Warning - This can be slow to run
            products = read_product_data().query(f'locale == "{locale}"')
            predicted_products = np.unique( np.array(list(preds_locale["next_item_prediction"].values)) )
            assert np.all( np.isin(predicted_products, products['id']) ), f"Invalid products in {locale} predictions"

In [324]:
check_predictions(df_test[['locale', 'next_item_prediction']])
