In [1]:
import pandas as pd

## Trainingsdaten laden

In [2]:
# Read the raw training transactions from disk and preview the first rows.
train_path = "../data/train.csv"
train_df = pd.read_csv(train_path)
train_df.head()

Unnamed: 0,transactionId,basket,customerType,totalAmount,returnLabel
0,9534310106,"[4, 3, 4]",new,252.0,1
1,7202594767,"[4, 2, 0, 2, 5]",existing,70.0,0
2,2737331698,[5],existing,84.0,0
3,4868011733,"[1, 4, 2, 4]",existing,116.0,0
4,7622406570,"[2, 5, 3, 2, 3, 2, 0]",existing,378.0,0


## Feature Engineering

In [3]:
def create_features(data_frame):
    """Derive the model features from a raw transactions frame.

    Args:
        data_frame: DataFrame with at least a ``customerType`` column and a
            ``basket`` column holding the basket as a string such as
            ``"[4, 2, 0]"``.

    Returns:
        A new DataFrame (the input is not modified) in which

        * columns named ``Unnamed: N`` are removed,
        * ``customerType`` is replaced by 0/1 dummy columns, and
        * ``orderedBooks`` holds the number of entries in ``basket``.
    """
    # An index that was written into the CSV is read back as "Unnamed: 0".
    # It is only a row number; leaving it in would turn it into a model
    # feature, because later cells drop only the label, id and basket columns.
    index_artifacts = [col for col in data_frame.columns if str(col).startswith("Unnamed:")]
    data_frame = data_frame.drop(columns=index_artifacts)

    # drop_first=True removes the first category level, so the resulting
    # dummy columns depend on which categories occur in data_frame.
    data_frame = pd.get_dummies(data_frame, columns=["customerType"], dtype=int, drop_first=True)

    # Count whole numbers rather than single digit characters, so an item id
    # such as 12 counts as one book instead of two.
    data_frame["orderedBooks"] = data_frame["basket"].str.count(r"\d+")
    # Task 1 - part 5 goes here
    return data_frame

# Add the engineered feature columns to the training frame and preview it.
train_df = train_df.pipe(create_features)
train_df.head()

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks
0,9534310106,"[4, 3, 4]",252.0,1,1,3
1,7202594767,"[4, 2, 0, 2, 5]",70.0,0,0,5
2,2737331698,[5],84.0,0,0,1
3,4868011733,"[1, 4, 2, 4]",116.0,0,0,4
4,7622406570,"[2, 5, 3, 2, 3, 2, 0]",378.0,0,0,7


## Daten skalieren

In [4]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns. The scaler is fitted on the training data
# so the same fitted scaler can later be applied to the test data.
features = ["totalAmount", "orderedBooks"]
scaler = StandardScaler().fit(train_df[features])
train_df[features] = scaler.transform(train_df[features])
train_df.head()

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks
0,9534310106,"[4, 3, 4]",-0.032823,1,1,-0.607275
1,7202594767,"[4, 2, 0, 2, 5]",-0.879459,0,0,0.133204
2,2737331698,[5],-0.814334,0,0,-1.347754
3,4868011733,"[1, 4, 2, 4]",-0.665475,0,0,-0.237036
4,7622406570,"[2, 5, 3, 2, 3, 2, 0]",0.553309,0,0,0.873682


In [5]:
# Target vector first, then the model inputs: everything except the label,
# the transaction id and the raw basket string.
y_train = train_df["returnLabel"]
x_train = train_df.drop(["returnLabel", "transactionId", "basket"], axis=1)

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters are kept in one dict; random_state is fixed so repeated
# runs build the same forest.
params = dict(n_estimators=100, max_features="sqrt", random_state=0)
rf = RandomForestClassifier(**params).fit(x_train, y_train)
rf

## Modell auf Testdaten anwenden

In [7]:
# Read the held-out test transactions from disk and preview the first rows.
test_path = "../data/test.csv"
test_df = pd.read_csv(test_path)
test_df.head()

Unnamed: 0,transactionId,basket,customerType,totalAmount,returnLabel
0,4132523932,"[4, 3, 4, 3, 2, 3]",existing,366.0,1
1,8998574539,"[3, 4, 4, 3, 5]",existing,85.0,0
2,9346688547,"[1, 1, 2, 2, 4, 4, 3, 1, 1, 0, 3]",existing,275.0,0
3,4533897707,"[3, 2, 2, 1, 5, 1, 1, 0]",existing,528.0,0
4,3334800500,"[4, 2, 3, 5, 2, 5, 1]",existing,287.0,0


In [8]:
# Run the test frame through the same feature function as the training frame.
test_df = test_df.pipe(create_features)
test_df.head()

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks
0,4132523932,"[4, 3, 4, 3, 2, 3]",366.0,1,0,6
1,8998574539,"[3, 4, 4, 3, 5]",85.0,0,0,5
2,9346688547,"[1, 1, 2, 2, 4, 4, 3, 1, 1, 0, 3]",275.0,0,0,11
3,4533897707,"[3, 2, 2, 1, 5, 1, 1, 0]",528.0,0,0,8
4,3334800500,"[4, 2, 3, 5, 2, 5, 1]",287.0,0,0,7


In [9]:
# Apply the already-fitted scaler (transform only, no re-fitting on test data).
features = ["totalAmount", "orderedBooks"]
test_scaled = scaler.transform(test_df[features])
test_df[features] = test_scaled
test_df.head()

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks
0,4132523932,"[4, 3, 4, 3, 2, 3]",0.462354,1,0,0.496732
1,8998574539,"[3, 4, 4, 3, 5]",-0.800293,0,0,0.128986
2,9346688547,"[1, 1, 2, 2, 4, 4, 3, 1, 1, 0, 3]",0.053454,0,0,2.335461
3,4533897707,"[3, 2, 2, 1, 5, 1, 1, 0]",1.190286,0,0,1.232224
4,3334800500,"[4, 2, 3, 5, 2, 5, 1]",0.107375,0,0,0.864478


In [10]:
# Split off the true labels, then predict on the remaining feature columns.
y_test = test_df["returnLabel"]
X_test = test_df.drop(columns=["transactionId", "returnLabel", "basket"])
predictions = rf.predict(X_test)

In [11]:
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score

# Report each metric for the test predictions, one line per metric.
for label, metric in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
):
    print(label, metric(y_test, predictions))

Random Forest:
Accuracy : 0.7285931064383264
Precision: 0.564366632337796
Recall   : 0.39796659404502543
