In [28]:
import pandas as pd

## Trainingsdaten laden

In [29]:
# Read the training split from disk; the trailing expression previews the first rows.
train_csv_path = "../data/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head(5)

Unnamed: 0,transactionId,basket,customerType,totalAmount,returnLabel
0,9534310106,"[4, 3, 4]",new,252.0,1
1,7202594767,"[4, 2, 0, 2, 5]",existing,70.0,0
2,2737331698,[5],existing,84.0,0
3,4868011733,"[1, 4, 2, 4]",existing,116.0,0
4,7622406570,"[2, 5, 3, 2, 3, 2, 0]",existing,378.0,0


## Feature Engineering

In [30]:
def create_features(data_frame):
    """Derive the model features from a raw transactions frame.

    The input frame is not modified; a new frame is returned.

    Steps:
      * drop the ``Unnamed: 0`` column if present -- it is the row index that
        was written into the CSV and carries no information about a
        transaction, so it must not reach the model as a feature;
      * one-hot encode ``customerType`` with ``drop_first=True`` (the dummy
        columns that appear depend on the categories present in this frame);
      * add ``orderedBooks``, the number of items in ``basket``.

    Args:
        data_frame: DataFrame with at least the columns ``customerType`` and
            ``basket``. ``basket`` is expected to hold the string form of a
            list of integers, e.g. ``"[4, 3, 4]"``.

    Returns:
        A new DataFrame with ``customerType`` replaced by its dummy column(s)
        and an additional integer column ``orderedBooks``.
    """
    # errors="ignore" keeps the function usable on frames that were read
    # without the stray index column.
    data_frame = data_frame.drop(columns=["Unnamed: 0"], errors="ignore")
    data_frame = pd.get_dummies(data_frame, columns=["customerType"], dtype=int, drop_first=True)
    # Count runs of digits rather than single digit characters, so that a
    # multi-digit entry such as 10 counts as one item instead of two.
    data_frame["orderedBooks"] = data_frame["basket"].str.count(r"\d+")
    return data_frame

# Run the feature engineering on the training rows and preview the result.
train_df = train_df.pipe(create_features)
train_df.head(5)

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks
0,9534310106,"[4, 3, 4]",252.0,1,1,3
1,7202594767,"[4, 2, 0, 2, 5]",70.0,0,0,5
2,2737331698,[5],84.0,0,0,1
3,4868011733,"[1, 4, 2, 4]",116.0,0,0,4
4,7622406570,"[2, 5, 3, 2, 3, 2, 0]",378.0,0,0,7


## Daten skalieren

In [31]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns. The scaler is fitted on the training data
# only and stays in `scaler` so the same transformation can be applied elsewhere.
features = ["totalAmount", "orderedBooks"]
scaler = StandardScaler()
scaler.fit(train_df[features])
train_df[features] = scaler.transform(train_df[features])
train_df.head(5)

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks
0,9534310106,"[4, 3, 4]",-0.032823,1,1,-0.607275
1,7202594767,"[4, 2, 0, 2, 5]",-0.879459,0,0,0.133204
2,2737331698,[5],-0.814334,0,0,-1.347754
3,4868011733,"[1, 4, 2, 4]",-0.665475,0,0,-0.237036
4,7622406570,"[2, 5, 3, 2, 3, 2, 0]",0.553309,0,0,0.873682


In [32]:
# Separate the target from the model inputs. Identifier, raw basket and the
# target itself are removed; every other column of train_df stays in X_train.
y_train = train_df["returnLabel"]
X_train = train_df.drop(["returnLabel", "transactionId", "basket"], axis=1)

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter search space for the random forest:
# 3 x 4 x 3 = 36 combinations in total.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20, 100],
    min_samples_split=[2, 5, 10],
)

In [34]:
from sklearn.model_selection import ParameterGrid

# Expand the search space into one dict per hyperparameter combination
# and show the first five of them.
grid = [combination for combination in ParameterGrid(param_grid)]
grid[:5]

[{'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50},
 {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100},
 {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200},
 {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 50},
 {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 100}]

## Modell auf Testdaten anwenden

In [35]:
# Read the held-out test split from disk; the trailing expression previews the first rows.
test_csv_path = "../data/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head(5)

Unnamed: 0,transactionId,basket,customerType,totalAmount,returnLabel
0,4132523932,"[4, 3, 4, 3, 2, 3]",existing,366.0,1
1,8998574539,"[3, 4, 4, 3, 5]",existing,85.0,0
2,9346688547,"[1, 1, 2, 2, 4, 4, 3, 1, 1, 0, 3]",existing,275.0,0
3,4533897707,"[3, 2, 2, 1, 5, 1, 1, 0]",existing,528.0,0
4,3334800500,"[4, 2, 3, 5, 2, 5, 1]",existing,287.0,0


In [36]:
# Run the same feature engineering on the test rows and preview the result.
test_df = test_df.pipe(create_features)
test_df.head(5)

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks
0,4132523932,"[4, 3, 4, 3, 2, 3]",366.0,1,0,6
1,8998574539,"[3, 4, 4, 3, 5]",85.0,0,0,5
2,9346688547,"[1, 1, 2, 2, 4, 4, 3, 1, 1, 0, 3]",275.0,0,0,11
3,4533897707,"[3, 2, 2, 1, 5, 1, 1, 0]",528.0,0,0,8
4,3334800500,"[4, 2, 3, 5, 2, 5, 1]",287.0,0,0,7


In [37]:
# Scale the test columns with the already fitted scaler (transform only, no
# re-fitting), so that no statistics of the test data enter the scaling.
features = ["totalAmount", "orderedBooks"]
scaled_test_values = scaler.transform(test_df[features])
test_df[features] = scaled_test_values
test_df.head(5)

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks
0,4132523932,"[4, 3, 4, 3, 2, 3]",0.497487,1,0,0.503443
1,8998574539,"[3, 4, 4, 3, 5]",-0.809682,0,0,0.133204
2,9346688547,"[1, 1, 2, 2, 4, 4, 3, 1, 1, 0, 3]",0.074169,0,0,2.35464
3,4533897707,"[3, 2, 2, 1, 5, 1, 1, 0]",1.251086,0,0,1.243922
4,3334800500,"[4, 2, 3, 5, 2, 5, 1]",0.129991,0,0,0.873682


In [38]:
# Separate the target from the model inputs for the test split, removing the
# same three columns as for the training split.
y_test = test_df["returnLabel"]
X_test = test_df.drop(columns=["transactionId", "returnLabel", "basket"])

## MLflow Part

In [39]:
import mlflow
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
from mlflow.models import infer_signature
from mlflow import MlflowClient

mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("MLflow Return Data HyperParamTuning")

client = MlflowClient()

# Fixed seed for every forest: without it each execution trains different
# trees, so the logged metrics could not be reproduced and differences between
# runs would mix seed noise with the effect of the hyperparameters.
RANDOM_STATE = 42

# One MLflow run per hyperparameter combination in `grid`.
# NOTE(review): the metrics below are computed on X_test / y_test. Picking the
# best run from these numbers tunes the model against the test split; a
# separate validation split or cross-validation on the training data would
# keep the test score unbiased.
for params in grid:
    with mlflow.start_run():

        mlflow.set_tag("Training Info", "Basic RF model for return data")

        rf = RandomForestClassifier(**params, random_state=RANDOM_STATE)
        rf.fit(X_train, y_train)

        predictions = rf.predict(X_test)

        accuracy = accuracy_score(y_test, predictions)
        # zero_division=0 yields the same value (0.0) that scikit-learn
        # reports by default when a forest never predicts the positive class,
        # but without emitting a warning for every such run.
        precision = precision_score(y_test, predictions, zero_division=0)
        recall = recall_score(y_test, predictions, zero_division=0)

        # Log the hyperparameters of this run together with the seed used.
        mlflow.log_params({**params, "random_state": RANDOM_STATE})

        # Log the evaluation metrics in a single call.
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
        })

print("done.")

2025/10/30 11:41:13 INFO mlflow.tracking.fluent: Experiment with name 'MLflow Return Data HyperParamTuning' does not exist. Creating a new experiment.


🏃 View run agreeable-croc-680 at: http://localhost:8080/#/experiments/271063554503179368/runs/66eaad751a984305a5bb6bc5fa18fd0d
🧪 View experiment at: http://localhost:8080/#/experiments/271063554503179368
🏃 View run sincere-mouse-857 at: http://localhost:8080/#/experiments/271063554503179368/runs/b9c00d7015e44ba897329861bad5de22
🧪 View experiment at: http://localhost:8080/#/experiments/271063554503179368
🏃 View run burly-conch-183 at: http://localhost:8080/#/experiments/271063554503179368/runs/16645200a40946c3a438c516532ed837
🧪 View experiment at: http://localhost:8080/#/experiments/271063554503179368
🏃 View run defiant-pug-627 at: http://localhost:8080/#/experiments/271063554503179368/runs/1423b69fbfa34f1db4a942b0330e03a6
🧪 View experiment at: http://localhost:8080/#/experiments/271063554503179368
🏃 View run invincible-steed-631 at: http://localhost:8080/#/experiments/271063554503179368/runs/bde4087e80764a6aa952d897dcbc23b2
üß™ View experiment at: http://loc