In [1]:
# Limit OpenBLAS to a single thread. The variable has to be in the environment
# before numpy (or any package that imports numpy) is loaded, because the BLAS
# library picks up its thread configuration when it is first loaded; setting it
# after the imports, as this cell originally did, comes too late.
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"

from recoxplainer.data_reader.data_reader import DataReader
from recoxplainer.config import cfg

import numpy as np
import pandas as pd

from recoxplainer.models import ALS
from recoxplainer.recommender import Recommender 
from recoxplainer.evaluator import Splitter, Evaluator, ExplanationEvaluator
from recoxplainer.explain import ALSExplainer, ARPostHocExplainer, KNNPostHocExplainer

print(os.environ.get("OPENBLAS_NUM_THREADS")) # Check if OPENBLAS Value is set correctly

1


### Datensatz einlesen

In [2]:
# Create the reader from the retailrocket "events_filtered" entry of the
# config; ** expands that mapping into keyword arguments.
data = DataReader(**cfg.retailrocket.events_filtered)

# Echo the reader settings, one labelled line per attribute.
#   skiprows  - skips the header row so it is not read as data
#   num_user  - number of unique user ids (per the original note, set when the reader is initialised)
#   num_item  - number of unique item ids
for label, attribute in (
    ("Filepath: ", "filepath_or_buffer"),
    ("Sep: ", "sep"),
    ("Rownames: ", "names"),
    ("Skiprows: ", "skiprows"),
    ("Anzahl einzigartige UserIds: ", "num_user"),
    ("Anzahl einzigartige ItemIds: ", "num_item"),
):
    print(label, getattr(data, attribute))

# Show the loaded frame as the cell output.
data.dataset

Filepath:  ../datasets/retailrocket/events_filtered.csv
Sep:  ,
Rownames:  ['timestamp', 'userId', 'event', 'itemId', 'transactionId']
Skiprows:  1
Anzahl einzigartige UserIds:  954
Anzahl einzigartige ItemIds:  1768


Unnamed: 0,timestamp,userId,event,itemId,transactionId
0,1433193535886,820159,view,369447,
1,1433194351839,1065157,view,197980,
2,1433193992715,820159,view,281211,
3,1433218757551,168685,addtocart,111530,
4,1433216559709,777089,view,221329,
...,...,...,...,...,...
54845,1438405563348,478537,view,214790,
54846,1438397198429,1139675,view,309778,
54847,1438397312078,1139675,view,432152,
54848,1438397360331,1139675,view,450455,


### Hyperparameter definieren

In [3]:
# Dataset size and the filtering technique are handled in a separate notebook.

# Rating weight per event type.
# Candidate weightings (view, addtocart, transaction): (1,1,1), (1,2,3), (1,3,5)
event_weight = dict(view=1, addtocart=1, transaction=1)

# Train/test split fraction.
# Candidate values: 0.1 = 90/10, 0.2 = 80/20, 0.3 = 70/30
frac = 0.1

# Number of latent factors used to train the ALS model.
# Candidate values: 10, 25, 50, 100
latent_dim = 100

# Regularization term for ALS training; counters over-/underfitting by
# penalising large weights.
# Candidate values: 0.1, 0.01, 0.001
reg_term = 0.001

# Number of recommended items per user.
# Candidate values: 5, 10, 20
top_n = 5

### Datensatz aufbereiten

In [4]:
# NOTE: this cell consumes the 'event' column (it is read first and dropped
# below), so it can only be run once per freshly loaded `data`.

# Map the textual event types onto numeric ratings using the configured weights.
data.dataset['rating'] = data.dataset['event'].map(event_weight)

# Series.map yields NaN for any event type missing from event_weight, and the
# per-pair sum below would silently skip those rows. Fail loudly instead.
unmapped_events = data.dataset.loc[data.dataset['rating'].isna(), 'event'].unique()
if len(unmapped_events) > 0:
    raise ValueError(f"No weight configured for event types: {list(unmapped_events)}")

print("event:")
print(data.dataset['event'].value_counts())
print("--------------------")
print("rating: ")
print(data.dataset['rating'].value_counts())

# Drop the columns that are no longer needed, then collapse each user-item pair
# into one row: the weights are summed and the most recent interaction is kept
# as the timestamp.
#
# NOTE(review): the original sorted the rows by userId/timestamp before this
# groupby "because split_leave_n_out does not take time into account". That
# sort had no effect on the result: groupby orders the output by the group keys
# and sum/max do not depend on row order, so it was removed. If the splitter
# really needs time-ordered rows, sort AFTER the aggregation - confirm against
# Splitter.split_leave_n_out.
data.dataset = (
    data.dataset
    .drop(columns=['event', 'transactionId'])
    .groupby(['userId', 'itemId'])
    .agg({'rating': 'sum', 'timestamp': 'max'})
    .reset_index()
)

# Replace user and item ids with consecutive ids starting at 0.
data.make_consecutive_ids_in_dataset()

data.dataset

event:
view           48860
addtocart       3749
transaction     2241
Name: event, dtype: int64
--------------------
rating: 
1    54850
Name: rating, dtype: int64


Unnamed: 0,userId,itemId,rating,timestamp
0,0,0,1,1436235933413
1,0,1,25,1437492668245
2,1,2,31,1436063944180
3,2,3,38,1433276118067
4,3,4,1,1431034227547
...,...,...,...,...
19402,953,1195,1,1440624906822
19403,953,1495,4,1433464909314
19404,953,1198,1,1432770801645
19405,953,663,1,1437412846781


### Datensatz in Trainings- und Testteildatensatz aufteilen

In [5]:
# Split the prepared data into a training part and a test part using the
# configured hold-out fraction.
sp = Splitter()
train, test = sp.split_leave_n_out(data, frac=frac)

divider = "---------------------------"

# Shapes of the full, training and test data; `train` wraps its frame in
# `.dataset`, while `test` is used as a frame directly.
print("data shape:", data.dataset.shape)
print("train shape:", train.dataset.shape)
print("test shape:", test.shape)
print(divider)
# Row counts of both parts combined, for comparison with the full data shape.
print("Summe train und test Zeilen:", train.dataset.shape[0] + test.shape[0])

print(divider)
# Id statistics of the training part next to the counts stored on `train`.
print("train userId nunique: ", train.dataset.userId.nunique())
print("train itemId nunique: ", train.dataset.itemId.nunique())
print("train userId max: ", train.dataset.userId.max())
print("train itemId max: ", train.dataset.itemId.max())
print("train num_user: ", train.num_user)
print("train num_item: ", train.num_item)
print(divider)
# Id statistics of the test part.
print("test userId nunique: ", test.userId.nunique())
print("test itemId nunique: ", test.itemId.nunique())

data shape: (19407, 4)
train shape: (17504, 4)
test shape: (1903, 4)
---------------------------
Summe train und test Zeilen: 19407
---------------------------
train userId nunique:  954
train itemId nunique:  1747
train userId max:  953
train itemId max:  1767
train num_user:  954
train num_item:  1768
---------------------------
test userId nunique:  620
test itemId nunique:  1000


### Empfehlungsmodell definieren und trainieren

In [6]:
# Configure the ALS model from the notebook hyperparameters and train it on the
# training split; the return value of fit() is shown as the cell output.
als = ALS(
    latent_dim=latent_dim,
    learning_rate=0.1,
    epochs=10,
    reg_term=reg_term,
)
als.fit(train)

  self._set_arrayXarray(i, j, x)
100%|██████████| 10/10 [00:03<00:00,  3.23it/s]


True

### Empfehlungen mithilfe vom Empfehlungsmodell generieren

In [7]:
# Generate the top-n recommendations for every user with the trained model.
recommender = Recommender(
    dataset_metadata=train,
    model=als,
    top_n=top_n,
)
recommendations = recommender.recommend_all()

# Give each explainer its own copy of the recommendations.
als_recommendations, ar_recommendations, knn_recommendations = (
    recommendations.copy() for _ in range(3)
)

recommendations

Recommending for users: 100%|██████████| 954/954 [00:05<00:00, 166.56it/s]


Unnamed: 0,userId,itemId,rank
482,0.0,485.0,1.0
1155,0.0,1159.0,2.0
502,0.0,505.0,3.0
549,0.0,552.0,4.0
946,0.0,949.0,5.0
...,...,...,...
987,953.0,1018.0,1.0
675,953.0,699.0,2.0
775,953.0,802.0,3.0
483,953.0,503.0,4.0


### Empfehlungsmodell evaluieren

In [8]:
# Score the recommendations against the held-out test interactions.
recsys_evaluator = Evaluator(test)

hit_ratio = recsys_evaluator.cal_hit_ratio(recommendations)
print("Hit Ratio: ", hit_ratio)

ndcg = recsys_evaluator.cal_ndcg(recommendations)
print("ncdg: ", ndcg)

Hit Ratio:  0.21297944886654563
ncdg:  0.16284548122590953


### Empfehlungen mithilfe von Erklärungsmodell erklären

In [None]:
# als_explainer = ALSExplainer(model=als, recommendations=als_recommendations, data=train)
# ar_explainer = ARPostHocExplainer(model=als, recommendations=ar_recommendations, data=train)
# knn_explainer = KNNPostHocExplainer(model=als, recommendations=knn_recommendations, data=train)

# als_explanations = als_explainer.explain_recommendations()
# ar_explanations = ar_explainer.explain_recommendations()
# knn_explanations = knn_explainer.explain_recommendations()

#### ALSExplainer

In [9]:
# Explain every recommendation with the ALS explainer, configured with
# number_of_contributions=10.
als_explainer = ALSExplainer(
    model=als,
    recommendations=als_recommendations,
    data=train,
    number_of_contributions=10,
)
als_explanations = als_explainer.explain_recommendations()

als_explanations

Computing explanations: 100%|██████████| 4770/4770 [02:04<00:00, 38.32it/s]


Unnamed: 0,userId,itemId,rank,explanations
482,0.0,485.0,1.0,"{'item': [1, 0], 'contribution': [0.5437084841..."
1155,0.0,1159.0,2.0,"{'item': [1, 0], 'contribution': [0.4021382806..."
502,0.0,505.0,3.0,"{'item': [1, 0], 'contribution': [0.3517809022..."
549,0.0,552.0,4.0,"{'item': [0, 1], 'contribution': [0.6351211581..."
946,0.0,949.0,5.0,"{'item': [1, 0], 'contribution': [0.3306549047..."
...,...,...,...,...
987,953.0,1018.0,1.0,"{'item': [1141, 1195, 946, 1013, 148, 1198, 30..."
675,953.0,699.0,2.0,"{'item': [703, 57, 946, 1640, 1141, 574, 1089,..."
775,953.0,802.0,3.0,"{'item': [1640, 1265, 179, 628, 1012, 1589, 73..."
483,953.0,503.0,4.0,"{'item': [1352, 925, 1267, 1263, 574, 64, 162,..."


#### Association Rules

In [10]:
# Explain every recommendation post hoc with association rules, configured
# with min_support=0.01.
ar_explainer = ARPostHocExplainer(
    min_support=0.01,
    model=als,
    recommendations=ar_recommendations,
    data=train,
)
ar_explanations = ar_explainer.explain_recommendations()

ar_explanations

Computing explanations: 100%|██████████| 4770/4770 [00:05<00:00, 802.58it/s] 


Unnamed: 0,userId,itemId,rank,explanations
482,0.0,485.0,1.0,{}
1155,0.0,1159.0,2.0,{}
502,0.0,505.0,3.0,{}
549,0.0,552.0,4.0,{}
946,0.0,949.0,5.0,{}
...,...,...,...,...
987,953.0,1018.0,1.0,{}
675,953.0,699.0,2.0,"{57, 574}"
775,953.0,802.0,3.0,{}
483,953.0,503.0,4.0,{}


#### kNN

In [11]:
# Explain every recommendation post hoc with the kNN explainer, configured
# with knn=1500.
knn_explainer = KNNPostHocExplainer(
    model=als,
    recommendations=knn_recommendations,
    data=train,
    knn=1500,
)
knn_explanations = knn_explainer.explain_recommendations()

knn_explanations

Computing explanations: 100%|██████████| 4770/4770 [00:02<00:00, 1940.63it/s]


Unnamed: 0,userId,itemId,rank,explanations
482,0.0,485.0,1.0,"{0, 1}"
1155,0.0,1159.0,2.0,"{0, 1}"
502,0.0,505.0,3.0,"{0, 1}"
549,0.0,552.0,4.0,"{0, 1}"
946,0.0,949.0,5.0,"{0, 1}"
...,...,...,...,...
987,953.0,1018.0,1.0,"{512, 1542, 265, 397, 148, 663, 153, 154, 925,..."
675,953.0,699.0,2.0,"{512, 1542, 265, 397, 148, 663, 153, 925, 162,..."
775,953.0,802.0,3.0,"{512, 1542, 397, 148, 663, 153, 154, 925, 162,..."
483,953.0,503.0,4.0,"{512, 1542, 265, 397, 148, 663, 153, 154, 925,..."


### Erklärungsmodell evaluieren

#### Fidelity

In [12]:
# One separate evaluator per explainer, each built from the number of distinct
# training users and the configured top_n.
n_train_users = train.dataset.userId.nunique()
als_explainer_eval, ar_explainer_eval, knn_explainer_eval = (
    ExplanationEvaluator(n_train_users, top_n=top_n) for _ in range(3)
)

In [13]:
# Print the model fidelity of each explainer, one labelled line per explainer.
for label, evaluator, explained in (
    ("als fidelity: ", als_explainer_eval, als_explanations),
    ("ar fidelity: ", ar_explainer_eval, ar_explanations),
    ("knn fidelity: ", knn_explainer_eval, knn_explanations),
):
    print(label, evaluator.model_fidelity(explanations=explained))

als fidelity:  1.0
ar fidelity:  0.34549266247379457
knn fidelity:  0.9989517819706499


#### Mean Explainable Precision

In [14]:
def _explanation_size(explanation):
    """Return the number of explaining items held by one explanation entry."""
    # Dict-shaped explanations (the ALS explainer output in this notebook looks
    # like {'item': [...], 'contribution': [...]}) keep the explaining items
    # under 'item'; len() of the dict itself would only count its keys.
    if isinstance(explanation, dict) and 'item' in explanation:
        return len(explanation['item'])
    return len(explanation)


def build_explanation_matrix(explanations, min_set_size=None):
    """Turn an explanations frame into a 2D user x item score matrix.

    Parameters
    ----------
    explanations : pandas.DataFrame
        Needs the columns 'userId', 'itemId' and 'explanations'. An entry in
        'explanations' is a sized container of explaining items, or a dict
        that holds them under the key 'item'.
    min_set_size : int, optional
        Explanation size at which the score saturates at 1.0. When omitted,
        the notebook-level ``top_n`` is used (the original behaviour).

    Returns
    -------
    numpy.ndarray
        Array of shape (max userId + 1, max itemId + 1). Cell [user, item]
        holds ``size / max(size, min_set_size)`` for pairs with a non-empty
        explanation and 0 everywhere else. An empty frame yields a (0, 0) array.
    """
    if min_set_size is None:
        # Fall back to the notebook hyperparameter, as the original code did.
        min_set_size = top_n

    # max() of an empty column is NaN, which cannot be converted to a dimension.
    if len(explanations) == 0:
        return np.zeros((0, 0))

    # Matrix dimensions follow from the largest ids present.
    num_users = int(explanations['userId'].max()) + 1
    num_items = int(explanations['itemId'].max()) + 1
    explainability_matrix = np.zeros((num_users, num_items))

    # Walk the three columns in parallel; this avoids building a Series per row.
    for user, item, explanation in zip(
        explanations['userId'], explanations['itemId'], explanations['explanations']
    ):
        if not explanation:
            continue
        size = _explanation_size(explanation)
        if size == 0:
            continue
        # Normalise by explanation size, capped at 1.0 once min_set_size is reached.
        explainability_matrix[int(user), int(item)] = size / max(size, min_set_size)

    return explainability_matrix

In [15]:
# Build one explainability matrix per explainer from its explanations frame.
als_explanation_matrix, ar_explanation_matrix, knn_explanations_matrix = (
    build_explanation_matrix(explanations=explained)
    for explained in (als_explanations, ar_explanations, knn_explanations)
)

In [16]:
# Print the mean explainable precision of each explainer, using the shared
# recommendations and the explainer's own explainability matrix.
for label, evaluator, matrix in (
    ("als mep: ", als_explainer_eval, als_explanation_matrix),
    ("ar mep: ", ar_explainer_eval, ar_explanation_matrix),
    ("knn mep: ", knn_explainer_eval, knn_explanations_matrix),
):
    print(label, evaluator.mean_explaianable_precision(recommendations=recommendations, explainability_matrix=matrix))

als mep:  1.0
ar mep:  0.34549266247379457
knn mep:  0.9989517819706499
