In [1]:
from recoxplainer.data_reader.data_reader import DataReader
from recoxplainer.config import cfg

In [2]:
# ** unpacks the retailrocket.events entry of the config (config.yml) into
# keyword arguments for DataReader.
reader = DataReader(**cfg.retailrocket.events)

# Rename the user/item columns to the visitorid/itemid names used by every
# later cell, and store the renamed frame back on the reader.
reader.dataset = reader.dataset.rename(columns={'userId': 'visitorid', 'itemId': 'itemid'})

# `data` is only ever the events DataFrame; the reader has its own name
# instead of `data` being rebound from a DataReader to a DataFrame.
data = reader.dataset
# The reader is not needed any more; dropping it mirrors the original cell,
# which kept no reference to the DataReader after this point.
del reader

data.shape

(2756101, 5)

### IDs der Top-N-User bzw. -Items heraussuchen, die die meisten Interaktionen haben

In [3]:
def get_top_n_items(dataframe, top_n):
    """Return the ids of the ``top_n`` most frequent items.

    Args:
        dataframe: Frame with an ``itemid`` column; each row counts as one
            occurrence of that item.
        top_n: Number of leading item ids to return.

    Returns:
        The index of item ids, ordered from most to least frequent and cut
        off after ``top_n`` entries.
    """
    # value_counts() orders its result by descending frequency, so the first
    # entries of its index are the most frequent item ids.
    item_frequencies = dataframe['itemid'].value_counts()
    return item_frequencies.index[:top_n]

def get_top_n_users(dataframe, top_n):
    """Return the ids of the ``top_n`` most frequent users.

    Args:
        dataframe: Frame with a ``visitorid`` column; each row counts as one
            occurrence of that user.
        top_n: Number of leading user ids to return.

    Returns:
        The index of user ids, ordered from most to least frequent and cut
        off after ``top_n`` entries.
    """
    # value_counts() orders its result by descending frequency, so the first
    # entries of its index are the most frequent user ids.
    user_frequencies = dataframe['visitorid'].value_counts()
    return user_frequencies.index[:top_n]

In [4]:
# Ids of the 2000 items that occur most often in the event data.
top_2000_items = get_top_n_items(data, 2000)

### Datensatz auf die Top-Items filtern

In [5]:
# Keep only the rows whose item id is among the selected top items.
data = data.loc[data['itemid'].isin(top_2000_items)]

Kontrolle, ob das Filtern funktioniert hat

In [6]:
# Sanity check: number of distinct item ids left in the data.
data.itemid.nunique()

2000

Schauen, wie viele Interaktionen die User haben

In [7]:
# Number of rows (interactions) per user; reused further down to select the
# users to keep.
interaction_counts_user = data['visitorid'].value_counts()

# Tally once how many users share each interaction count, instead of
# re-scanning the whole series for every value of i.
users_per_count = interaction_counts_user.value_counts()

for i in range(1, 70):
    # .get() falls back to 0 for interaction counts that no user has.
    num_users_equal_n = users_per_count.get(i, 0)
    print("Number of users with exactly", i, "interactions:", num_users_equal_n)
print("Number of users in total:", data['visitorid'].nunique())

Number of users with exactly 1 interactions: 233929
Number of users with exactly 2 interactions: 47928
Number of users with exactly 3 interactions: 17927
Number of users with exactly 4 interactions: 8493
Number of users with exactly 5 interactions: 4687
Number of users with exactly 6 interactions: 2939
Number of users with exactly 7 interactions: 1881
Number of users with exactly 8 interactions: 1222
Number of users with exactly 9 interactions: 906
Number of users with exactly 10 interactions: 668
Number of users with exactly 11 interactions: 472
Number of users with exactly 12 interactions: 372
Number of users with exactly 13 interactions: 306
Number of users with exactly 14 interactions: 237
Number of users with exactly 15 interactions: 213
Number of users with exactly 16 interactions: 167
Number of users with exactly 17 interactions: 120
Number of users with exactly 18 interactions: 119
Number of users with exactly 19 interactions: 93
Number of users with exactly 20 interactions: 79

Schauen, wie viele Interaktionen die Items haben

In [8]:
# Number of rows (interactions) per item.
interaction_counts_items = data['itemid'].value_counts()

# Tally once how many items share each interaction count, instead of
# re-scanning the whole series for every value of i.
items_per_count = interaction_counts_items.value_counts()

for i in range(1, 70):
    # .get() falls back to 0 for interaction counts that no item has.
    num_items_equal_n = items_per_count.get(i, 0)
    print("Number of items with exactly", i, "interactions:", num_items_equal_n)
print("Number of items in total:", data['itemid'].nunique())

Number of items with exactly 1 interactions: 0
Number of items with exactly 2 interactions: 0
Number of items with exactly 3 interactions: 0
Number of items with exactly 4 interactions: 0
Number of items with exactly 5 interactions: 0
Number of items with exactly 6 interactions: 0
Number of items with exactly 7 interactions: 0
Number of items with exactly 8 interactions: 0
Number of items with exactly 9 interactions: 0
Number of items with exactly 10 interactions: 0
Number of items with exactly 11 interactions: 0
Number of items with exactly 12 interactions: 0
Number of items with exactly 13 interactions: 0
Number of items with exactly 14 interactions: 0
Number of items with exactly 15 interactions: 0
Number of items with exactly 16 interactions: 0
Number of items with exactly 17 interactions: 0
Number of items with exactly 18 interactions: 0
Number of items with exactly 19 interactions: 0
Number of items with exactly 20 interactions: 0
Number of items with exactly 21 interactions: 0
N

Nutzer mit weniger als n Interaktionen aus dem Datensatz herausfiltern

In [9]:
# Minimum number of interactions a user needs in order to be kept.
filter_n_users = 20

# Users whose interaction count reaches the threshold.
users_to_keep = interaction_counts_user.loc[interaction_counts_user.ge(filter_n_users)].index

# Restrict the data to those users; .copy() makes the result an independent frame.
data = data.loc[data['visitorid'].isin(users_to_keep)].copy()

Gefilterten Datensatz kontrollieren

In [None]:
# Shape, number of distinct users and number of distinct items, one per line.
print(data.shape, data['visitorid'].nunique(), data['itemid'].nunique(), sep="\n")

(54850, 5)
954
1768


Datensatz als CSV-Datei speichern

In [None]:
# Write the filtered data to disk; index=False leaves the row index out of the file.
data.to_csv(path_or_buf="events_filtered.csv", index=False)