In [37]:
import pandas as pd
import numpy as np
import re

In [38]:
df = pd.read_csv("sessions_train.csv")

df = df[df["locale"]=="DE"].drop(columns={"locale"}).reset_index(drop=True)

# https://github.com/rn5l/session-rec/blob/master/algorithms/knn/iknn.py
class ItemKNN:
    '''
    ItemKNN(n_sims = 100, lmbd = 20, alpha = 0.5, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time')
    Item-to-item predictor that computes the the similarity to all items to the given item.
    Similarity of two items is given by:
    .. math::
        s_{i,j}=\sum_{s}I\{(s,i)\in D & (s,j)\in D\} / (supp_i+\\lambda)^{\\alpha}(supp_j+\\lambda)^{1-\\alpha}
    Parameters
    --------
    n_sims : int
        Only give back non-zero scores to the N most similar items. Should be higher or equal than the cut-off of your evaluation. (Default value: 100)
    lmbd : float
        Regularization. Discounts the similarity of rare items (incidental co-occurrences). (Default value: 20)
    alpha : float
        Balance between normalizing with the supports of the two items. 0.5 gives cosine similarity, 1.0 gives confidence (as in association rules).
    session_key : string
        header of the session ID column in the input file (default: 'SessionId')
    item_key : string
        header of the item ID column in the input file (default: 'ItemId')
    time_key : string
        header of the timestamp column in the input file (default: 'Time')
    '''

    def __init__(self, n_sims=100, lmbd=20, alpha=0.5, session_key='SessionId', item_key='ItemId', time_key='Time'):
        self.n_sims = n_sims
        self.lmbd = lmbd
        self.alpha = alpha
        self.item_key = item_key
        self.session_key = session_key
        self.time_key = time_key

    def fit(self, data):
        '''
        Trains the predictor.
        Parameters
        --------
        data: pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
        '''
        data.set_index(np.arange(len(data)), inplace=True)
        itemids = data[self.item_key].unique()
        n_items = len(itemids)
        data = pd.merge(data, pd.DataFrame({self.item_key: itemids, 'ItemIdx': np.arange(len(itemids))}),
                        on=self.item_key, how='inner')
        sessionids = data[self.session_key].unique()
        data = pd.merge(data, pd.DataFrame({self.session_key: sessionids, 'SessionIdx': np.arange(len(sessionids))}),
                        on=self.session_key, how='inner')
        supp = data.groupby('SessionIdx').size()
        session_offsets = np.zeros(len(supp) + 1, dtype=np.int32)
        session_offsets[1:] = supp.cumsum()
        index_by_sessions = data.sort_values(['SessionIdx', self.time_key]).index.values
        supp = data.groupby('ItemIdx').size()
        item_offsets = np.zeros(n_items + 1, dtype=np.int32)
        item_offsets[1:] = supp.cumsum()
        index_by_items = data.sort_values(['ItemIdx', self.time_key]).index.values
        self.sims = dict()
        for i in range(n_items):
            iarray = np.zeros(n_items)
            start = item_offsets[i]
            end = item_offsets[i + 1]
            for e in index_by_items[start:end]:
                uidx = data.SessionIdx.values[e]
                ustart = session_offsets[uidx]
                uend = session_offsets[uidx + 1]
                user_events = index_by_sessions[ustart:uend]
                iarray[data.ItemIdx.values[user_events]] += 1
            iarray[i] = 0
            norm = np.power((supp[i] + self.lmbd), self.alpha) * np.power((supp.values + self.lmbd), (1.0 - self.alpha))
            norm[norm == 0] = 1
            iarray = iarray / norm
            indices = np.argsort(iarray)[-1:-1 - self.n_sims:-1]
            self.sims[itemids[i]] = pd.Series(data=iarray[indices], index=itemids[indices])

    def predict_next(self, session_id, input_item_id, predict_for_item_ids, skip=False, mode_type='view',
                     timestamp=0):
        '''
        Gives predicton scores for a selected set of items on how likely they be the next item in the session.
        Parameters
        --------
        session_id : int or string
            The session IDs of the event.
        input_item_id : int or string
            The item ID of the event. Must be in the set of item IDs of the training set.
        predict_for_item_ids : 1D array
            IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
        Returns
        --------
        out : pandas.Series
            Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.
        '''
        preds = np.zeros(len(predict_for_item_ids))
        sim_list = self.sims[input_item_id]
        mask = np.in1d(predict_for_item_ids, sim_list.index)
        preds[mask] = sim_list[predict_for_item_ids[mask]]
        return pd.Series(data=preds, index=predict_for_item_ids)


# Función para reorganizar el DataFrame
def reformat_dataframe(df):

    # Convertir las cadenas en matrices de NumPy
    df["prev_items"] = df["prev_items"].apply(lambda x: re.findall(r"'([^']*)'", x))
    session_id_col = []
    item_id_col = []
    time_col = []

    for i, row in df.iterrows():
        session_id = i
        for j, item_id in enumerate(row["prev_items"]):
            session_id_col.append(session_id)
            item_id_col.append(item_id)
            time_col.append(j)

        session_id_col.append(session_id)
        item_id_col.append(row["next_item"])
        time_col.append(len(row["prev_items"]))

    new_df = pd.DataFrame({"SessionId": session_id_col, "ItemId": item_id_col, "Time": time_col})
    return new_df

# Reorganizar el DataFrame
new_data = reformat_dataframe(df)

In [39]:
new_data

Unnamed: 0,SessionId,ItemId,Time
0,0,B09W9FND7K,0
1,0,B09JSPLN1M,1
2,0,B09M7GY217,2
3,1,B076THCGSG,0
4,1,B007MO8IME,1
...,...,...,...
5948394,1111414,3750524505,8
5948395,1111414,B07XJ3H1RM,9
5948396,1111415,B09Y5SSN7R,0
5948397,1111415,3731861860,1


In [56]:
# Crear y entrenar el modelo ItemKNN
iknn = ItemKNN(n_sims=100, lmbd=20, alpha=0.5, session_key="SessionId",
               item_key="ItemId", time_key="Time")
iknn.fit(new_data)

In [68]:
objetos = new_data.ItemId.unique()

In [72]:
df["prev_items"][]

0                                   [B09W9FND7K, B09JSPLN1M]
1           [B076THCGSG, B007MO8IME, B08MF65MLV, B001B4TKA0]
2          [B0B1LGXWDS, B00AZYORS2, B0B1LGXWDS, B00AZYORS...
3          [B09XMTWDVT, B0B4MZZ8MB, B0B7HZ2GWX, B09XMTWDV...
4                       [B09Y5CSL3T, B09Y5DPTXN, B09FKD61R8]
                                 ...                        
1111411                             [B06X9BB2D7, B09RWWGXZJ]
1111412     [B0BK2WGCN4, B08H93ZRLL, B0BK2WGCN4, B08H93ZRLL]
1111413                             [B08X1SDBLB, B06WVZRBJ1]
1111414    [B0B8NNHR5N, B0BBCCB2S1, B09R222SDP, B0B5FBHX8...
1111415                             [B09Y5SSN7R, 3731861860]
Name: prev_items, Length: 1111416, dtype: object

In [70]:
lastclick =  "B0B1LGXWDS"
preds = iknn.predict_next(1, lastclick, objetos).sort_values(ascending=False)  # lista con el item y su peso


In [71]:
preds

B00AZYORS2    0.402015
B0767DTG2Q    0.170103
B085XT2DZY    0.098058
B004N9BSQE    0.096225
B07JLJ4N44    0.093352
                ...   
B093FK5GN6    0.000000
B07JMS9GZC    0.000000
B07XLTFFSK    0.000000
B075SSYD9F    0.000000
B092SJJKD1    0.000000
Length: 513811, dtype: float64

In [76]:

import re

In [79]:
def str2list(x):
    x = x.replace('[', '').replace(']', '').replace("'", '').replace('\n', ' ').replace('\r', ' ')
    l = [i for i in x.split() if i]
    return l

In [82]:
test = pd.read_csv("sessions_test_task1.csv")

In [86]:
train = pd.read_csv("sessions_train.csv")

train = train[train["locale"]=="DE"].drop(columns={"locale"}).reset_index(drop=True)


In [88]:
train

Unnamed: 0,prev_items,next_item,last_item
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,B09JSPLN1M
1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,B001B4TKA0
2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,B00AZYORS2
3,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,B0B4R9NN4B,B0B71CHT1L
4,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],B0BGVBKWGZ,B09FKD61R8
...,...,...,...
1111411,['B06X9BB2D7' 'B09RWWGXZJ'],B09RWWWYGZ,B09RWWGXZJ
1111412,['B0BK2WGCN4' 'B08H93ZRLL' 'B0BK2WGCN4' 'B08H9...,B001BWJEXK,B08H93ZRLL
1111413,['B08X1SDBLB' 'B06WVZRBJ1'],B09YRTCM8X,B06WVZRBJ1
1111414,['B0B8NNHR5N' 'B0BBCCB2S1' 'B09R222SDP' 'B0B5F...,B07XJ3H1RM,3750524505


In [87]:
train['last_item'] = train['prev_items'].apply(lambda x: str2list(x)[-1])


In [89]:
train = train.iloc[0:2,:]

In [92]:
for i, row in train.iterrows():
    preds = iknn.predict_next(0, row["last_item"], objetos).sort_values(ascending=False) 

In [101]:
lista = [x for x in ]


In [126]:

# Convertimos el objeto de índice en una lista de Python sin comillas simples
lista = preds.index[0:2].tolist()

# Eliminamos las comillas simples de cada elemento de la lista
lista_sin_comillas = [x.replace("'",'') for x in lista]

In [127]:
lista

['B001B4THSA', 'B0001NPYZ2']

In [125]:
' '.join(lista)

'B001B4THSA B0001NPYZ2'

In [124]:
new_list = ",".join(lista).replace("\"","").replace("'","").split("\n")
new_list

['B001B4THSA,B0001NPYZ2']

In [None]:
#next_item_map = {}
#for item in tqdm(next_item_dict):
#    counter = Counter(next_item_dict[item])
#    next_item_map[item] = [i[0] for i in counter.most_common(100)]

In [118]:
cadena_con_comillas = 'B001B4THSA'
cadena_sin_comillas = cadena_con_comillas.replace("'", "")

print(cadena_sin_comillas)

B001B4THSA


In [119]:
cadena_con_comillas

'B001B4THSA'