In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
class ItemKNN:
    r'''
    ItemKNN(n_sims = 100, lmbd = 20, alpha = 0.5, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time')
    Item-to-item predictor that computes the similarity of all items to the given item.
    Similarity of two items is given by:
    .. math::
        s_{i,j}=\sum_{s}I\{(s,i)\in D & (s,j)\in D\} / (supp_i+\lambda)^{\alpha}(supp_j+\lambda)^{1-\alpha}
    Parameters
    --------
    n_sims : int
        Only give back non-zero scores to the N most similar items. Should be higher or equal than the cut-off of your evaluation. (Default value: 100)
    lmbd : float
        Regularization. Discounts the similarity of rare items (incidental co-occurrences). (Default value: 20)
    alpha : float
        Balance between normalizing with the supports of the two items. 0.5 gives cosine similarity, 1.0 gives confidence (as in association rules).
    session_key : string
        header of the session ID column in the input file (default: 'SessionId')
    item_key : string
        header of the item ID column in the input file (default: 'ItemId')
    time_key : string
        header of the timestamp column in the input file (default: 'Time')
    '''

    def __init__(self, n_sims=100, lmbd=20, alpha=0.5, session_key='SessionId', item_key='ItemId', time_key='Time'):
        self.n_sims = n_sims
        self.lmbd = lmbd
        self.alpha = alpha
        self.item_key = item_key
        self.session_key = session_key
        self.time_key = time_key

    def fit(self, data):
        '''
        Trains the predictor and stores the result in ``self.sims``.
        Parameters
        --------
        data: pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events.
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization (session_key, item_key, time_key properties).
            The frame passed in is not modified.
        '''
        # reset_index returns a new frame, so the caller's DataFrame (and its index) is left untouched.
        data = data.reset_index(drop=True)

        # Map item IDs and session IDs to dense integer indices (order of first appearance).
        itemids = data[self.item_key].unique()
        n_items = len(itemids)
        data = pd.merge(data, pd.DataFrame({self.item_key: itemids, 'ItemIdx': np.arange(n_items)}),
                        on=self.item_key, how='inner')
        sessionids = data[self.session_key].unique()
        data = pd.merge(data, pd.DataFrame({self.session_key: sessionids, 'SessionIdx': np.arange(len(sessionids))}),
                        on=self.session_key, how='inner')

        # Pull the index columns out once instead of re-reading them inside the loops.
        session_of_event = data['SessionIdx'].values
        item_of_event = data['ItemIdx'].values

        # Events grouped by session: session s owns index_by_sessions[session_offsets[s]:session_offsets[s + 1]].
        session_sizes = data.groupby('SessionIdx').size()
        session_offsets = np.zeros(len(session_sizes) + 1, dtype=np.int32)
        session_offsets[1:] = session_sizes.cumsum()
        index_by_sessions = data.sort_values(['SessionIdx', self.time_key]).index.values

        # Events grouped by item, same layout; supp[i] is the number of events of item i.
        supp = data.groupby('ItemIdx').size().values
        item_offsets = np.zeros(n_items + 1, dtype=np.int32)
        item_offsets[1:] = supp.cumsum()
        index_by_items = data.sort_values(['ItemIdx', self.time_key]).index.values

        # The (supp_j + lambda)^(1 - alpha) factor does not depend on the anchor item.
        neighbour_norm = np.power((supp + self.lmbd), (1.0 - self.alpha))

        self.sims = dict()
        for i in range(n_items):
            iarray = np.zeros(n_items)
            for e in index_by_items[item_offsets[i]:item_offsets[i + 1]]:
                s = session_of_event[e]
                session_events = index_by_sessions[session_offsets[s]:session_offsets[s + 1]]
                # Fancy-index += bumps each distinct item of the session once per anchor event.
                iarray[item_of_event[session_events]] += 1
            iarray[i] = 0  # an item is never its own neighbour
            norm = np.power((supp[i] + self.lmbd), self.alpha) * neighbour_norm
            norm[norm == 0] = 1  # avoid division by zero
            iarray = iarray / norm
            # Keep the n_sims highest-scoring items, best first.
            indices = np.argsort(iarray)[-1:-1 - self.n_sims:-1]
            self.sims[itemids[i]] = pd.Series(data=iarray[indices], index=itemids[indices])

    def predict_next(self, session_id, input_item_id, predict_for_item_ids, skip=False, mode_type='view',
                     timestamp=0):
        '''
        Gives prediction scores for a selected set of items on how likely they be the next item in the session.
        Parameters
        --------
        session_id : int or string
            The session IDs of the event. Not used by this predictor.
        input_item_id : int or string
            The item ID of the event. Must be in the set of item IDs of the training set, otherwise a KeyError is raised.
        predict_for_item_ids : 1D array-like
            IDs of items for which prediction scores are requested. IDs that are not among the stored neighbours of input_item_id get a score of 0.
        skip, mode_type, timestamp
            Accepted but not used by this predictor.
        Returns
        --------
        out : pandas.Series
            Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.
        '''
        # Accept lists/tuples as well as arrays: boolean-mask indexing below needs an ndarray.
        item_ids = np.asarray(predict_for_item_ids)
        preds = np.zeros(len(item_ids))
        sim_list = self.sims[input_item_id]
        mask = np.isin(item_ids, sim_list.index)
        preds[mask] = sim_list.loc[item_ids[mask]].values
        return pd.Series(data=preds, index=item_ids)


In [None]:
# Five sample sessions: the items already seen, the item that followed, and the locale.
data = dict(
    prev_items=[
        ['B09W9FND7K', 'B09JSPLN1M'],
        ['B076THCGSG', 'B007MO8IME', 'B08MF65MLV', 'B001B'],
        ['B0B1LGXWDS', 'B00AZYORS2', 'B0B1LGXWDS', 'B00AZ'],
        ['B076THCGSG', 'B007MO8IME', 'B08MF65MLV', 'B001B'],
        ['B09W9FND7K', 'B09JSPLN1M'],
    ],
    next_item=['B09M7GY217', 'B001B4THSA', 'B0767DTG2Q', 'B001B4THSA', 'B09M7GY217'],
    locale=['DE'] * 5,
)

# One row per session, one column per key of `data`.
df = pd.DataFrame(data)
import re

import pandas as pd
import re

def reformat_dataframe(df):
    """Flatten a one-row-per-session frame into one row per session event.

    Parameters
    --------
    df : pandas.DataFrame
        Must have a ``prev_items`` column holding a sequence of item IDs per
        row and a ``next_item`` column holding the item that followed.

    Returns
    --------
    pandas.DataFrame
        Columns ``SessionId`` (the index label of the source row), ``ItemId``
        and ``Time`` (position of the item within the session; ``next_item``
        gets position ``len(prev_items)``).
    """
    session_id_col = []
    item_id_col = []
    time_col = []

    for session_id, prev_items, next_item in zip(df.index, df["prev_items"], df["next_item"]):
        # Every element of prev_items is already one item ID. Iterating over the
        # element itself would split a string ID into single characters.
        for position, item_id in enumerate(prev_items):
            session_id_col.append(session_id)
            item_id_col.append(item_id)
            time_col.append(position)

        # The target item closes the session, right after the last previous item.
        session_id_col.append(session_id)
        item_id_col.append(next_item)
        time_col.append(len(prev_items))

    new_df = pd.DataFrame({"SessionId": session_id_col, "ItemId": item_id_col, "Time": time_col})
    return new_df
# Rebind `df` to the long-format frame (SessionId, ItemId, Time) returned by reformat_dataframe.
df = reformat_dataframe(df)

In [None]:
# Display the current contents of `df`.
df

Unnamed: 0,SessionId,ItemId,Time
0,0,B,0
1,0,0,0
2,0,9,0
3,0,W,0
4,0,9,0
...,...,...,...
145,4,L,1
146,4,N,1
147,4,1,1
148,4,M,1


In [None]:
import pandas as pd

def reformat_dataframe(df):
    """Flatten a one-row-per-session frame into one row per session event.

    Parameters
    --------
    df : pandas.DataFrame
        Must have a ``prev_items`` column holding a sequence of item IDs per row.

    Returns
    --------
    pandas.DataFrame
        Columns ``SessionId`` (the index label of the source row), ``ItemId``
        and ``Time`` (position of the item within ``prev_items``).
    """
    # Every element of prev_items is one item ID. It is taken as-is, because
    # iterating over it would split a multi-character string ID into characters.
    events = [
        (session_id, item_id, position)
        for session_id, prev_items in zip(df.index, df["prev_items"])
        for position, item_id in enumerate(prev_items)
    ]
    return pd.DataFrame(events, columns=["SessionId", "ItemId", "Time"])

# Toy sessions with single-letter item IDs, one list per session.
data = dict(
    prev_items=[
        ['A', 'B'],
        ['A', 'B', 'C', 'D'],
        ['B', 'A', 'E', 'F'],
        ['A', 'B', 'G', 'H'],
        ['A', 'E'],
    ],
    locale=['DE'] * 5,
)

# Build the wide frame and flatten it to one row per (session, item) event.
df = reformat_dataframe(pd.DataFrame(data))
print(df)

    SessionId ItemId  Time
0           0      A     0
1           0      B     1
2           1      A     0
3           1      B     1
4           1      C     2
5           1      D     3
6           2      B     0
7           2      A     1
8           2      E     2
9           2      F     3
10          3      A     0
11          3      B     1
12          3      G     2
13          3      H     3
14          4      A     0
15          4      E     1


In [None]:
# Build the item-to-item model and train it on the session events held in `df`.
iknn = ItemKNN(n_sims=100, lmbd=20, alpha=0.5, session_key="SessionId",
               item_key="ItemId", time_key="Time")
iknn.fit(df)

In [None]:
# Inspect the fitted neighbour lists: a dict mapping each item ID to a pandas Series of similarity scores.
iknn.sims

{'A': B    0.163299
 E    0.085280
 H    0.043644
 G    0.043644
 F    0.043644
 D    0.043644
 C    0.043644
 A    0.000000
 dtype: float64,
 'B': A    0.163299
 H    0.044544
 G    0.044544
 F    0.044544
 D    0.044544
 C    0.044544
 E    0.043519
 B    0.000000
 dtype: float64,
 'C': D    0.047619
 B    0.044544
 A    0.043644
 H    0.000000
 G    0.000000
 F    0.000000
 E    0.000000
 C    0.000000
 dtype: float64,
 'D': C    0.047619
 B    0.044544
 A    0.043644
 H    0.000000
 G    0.000000
 F    0.000000
 E    0.000000
 D    0.000000
 dtype: float64,
 'E': A    0.085280
 F    0.046524
 B    0.043519
 H    0.000000
 G    0.000000
 E    0.000000
 D    0.000000
 C    0.000000
 dtype: float64,
 'F': E    0.046524
 B    0.044544
 A    0.043644
 H    0.000000
 G    0.000000
 F    0.000000
 D    0.000000
 C    0.000000
 dtype: float64,
 'G': H    0.047619
 B    0.044544
 A    0.043644
 G    0.000000
 F    0.000000
 E    0.000000
 D    0.000000
 C    0.000000
 dtype: float64,
 'H': 

In [None]:
import pandas as pd
import numpy as np

def calculate_similarity(df, alpha, lambda_value):
    """Compute the regularised item-to-item co-occurrence similarity matrix.

    Implements the formula given in the ItemKNN docstring,

        s[i, j] = cooc[i, j] / ((supp[i] + lambda)^alpha * (supp[j] + lambda)^(1 - alpha))

    where ``cooc[i, j]`` is the number of sessions containing both items and
    ``supp[i]`` is the number of sessions containing item ``i``.

    Parameters
    --------
    df : pandas.DataFrame
        Must have a ``prev_items`` column holding a sequence of item IDs per row
        (one row per session).
    alpha : float
        Balance between the supports of the row item and the column item.
    lambda_value : float
        Regularization added to both supports.

    Returns
    --------
    numpy.ndarray
        Square matrix whose rows and columns follow the sorted unique item IDs
        (``np.unique`` order). The diagonal is zero. Shape ``(0, 0)`` when there
        are no items.
    """
    # Distinct items per session; an item repeated inside a session counts once.
    sessions = [np.unique(np.asarray(items)) for items in df['prev_items']]
    sessions = [items for items in sessions if items.size]
    if not sessions:
        return np.zeros((0, 0))

    unique_items = np.unique(np.concatenate(sessions))
    n_items = len(unique_items)

    # Create an item index mapping for efficient indexing
    item_to_index = {item: index for index, item in enumerate(unique_items.tolist())}

    # One pass over the sessions fills both the supports and the co-occurrence counts.
    support = np.zeros(n_items)
    co_occurrence_matrix = np.zeros((n_items, n_items), dtype=int)
    for items in sessions:
        indices = [item_to_index[item] for item in items.tolist()]
        support[indices] += 1
        co_occurrence_matrix[np.ix_(indices, indices)] += 1

    # Row i is scaled by item i's support, column j by item j's support. A plain
    # 1-D divisor would broadcast over columns only and ignore the row item.
    row_norm = (support + lambda_value) ** alpha
    col_norm = (support + lambda_value) ** (1 - alpha)
    norm = row_norm[:, np.newaxis] * col_norm[np.newaxis, :]
    norm[norm == 0] = 1  # avoid division by zero

    similarity = co_occurrence_matrix / norm

    # An item is not its own neighbour.
    diagonal = np.arange(n_items)
    similarity[diagonal, diagonal] = 0.0

    return similarity

# Example usage
data = {
    'prev_items': [
        np.array(['A', 'B']),
        np.array(['A', 'B', 'C', 'D']),
        np.array(['B', 'A', 'E', 'F']),
        np.array(['A', 'B', 'G', 'H']),
        np.array(['A', 'E']),
    ],
    # One locale per session: every column needs as many entries as
    # 'prev_items', otherwise pd.DataFrame raises ValueError.
    'locale': ['DE', 'DE', 'DE', 'DE', 'DE']
}

df = pd.DataFrame(data)
alpha = 0.5  # Parameter alpha
lambda_value = 0.1  # Regularization parameter

similarity_matrix = calculate_similarity(df, alpha, lambda_value)
print(similarity_matrix)

ValueError: ignored

In [None]:

        ['A', 'B'],
        ['A', 'B', 'C', 'D'],
        ['B', 'A', 'E', 'F'],
        ['A', 'B', 'G', 'H'],
        ['A', 'E']