In [21]:
import re

import pandas as pd

## Trainingsdaten laden

In [22]:
# Read the labelled training transactions from the project's data folder.
train_df = pd.read_csv(filepath_or_buffer="../data/train.csv")
# Show the first five rows as a quick sanity check of the raw columns.
train_df.head(n=5)

Unnamed: 0,transactionId,basket,customerType,totalAmount,returnLabel
0,9534310106,"[4, 3, 4]",new,252.0,1
1,7202594767,"[4, 2, 0, 2, 5]",existing,70.0,0
2,2737331698,[5],existing,84.0,0
3,4868011733,"[1, 4, 2, 4]",existing,116.0,0
4,7622406570,"[2, 5, 3, 2, 3, 2, 0]",existing,378.0,0


## Feature Engineering

In [23]:
def create_features(data_frame, n_booktypes=6):
    """Derive model features from a raw transactions frame.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a ``customerType`` column and a ``basket`` column. Each
        basket is the textual form of a list of book-type ids, e.g.
        ``"[4, 3, 4]"``.
    n_booktypes : int, default 6
        Number of book types to count; one ``booktype_<k>`` column is created
        for every ``k`` in ``range(n_booktypes)``.

    Returns
    -------
    pandas.DataFrame
        The input columns with ``customerType`` replaced by its dummy
        column(s), plus ``orderedBooks`` (number of items in the basket) and
        ``booktype_0`` ... ``booktype_<n_booktypes - 1>`` (items per type).
    """
    # NOTE: drop_first=True removes the first category level, so a frame that
    # contains only a single customer type produces no dummy column at all.
    data_frame = pd.get_dummies(data_frame, columns=["customerType"], dtype=int, drop_first=True)

    # Split every basket into whole-number tokens. Counting single digit
    # characters would miscount any id with more than one digit (e.g. "10"
    # would be seen as one "1" and one "0").
    basket_items = data_frame["basket"].apply(lambda basket: re.findall(r"\d+", str(basket)))

    data_frame["orderedBooks"] = basket_items.apply(len)
    for booktype in range(n_booktypes):
        # Bind the current id as a default argument so the lambda does not
        # depend on the loop variable's later values.
        data_frame[f"booktype_{booktype}"] = basket_items.apply(
            lambda items, key=str(booktype): items.count(key)
        )
    return data_frame

# Replace the raw training frame with its feature-engineered version.
# NOTE: the customerType column is consumed by this step, so the cell cannot
# be re-run on the transformed frame without reloading the data first.
train_df = train_df.pipe(create_features)
train_df.head(n=5)

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks,booktype_0,booktype_1,booktype_2,booktype_3,booktype_4,booktype_5
0,9534310106,"[4, 3, 4]",252.0,1,1,3,0,0,0,1,2,0
1,7202594767,"[4, 2, 0, 2, 5]",70.0,0,0,5,1,0,2,0,1,1
2,2737331698,[5],84.0,0,0,1,0,0,0,0,0,1
3,4868011733,"[1, 4, 2, 4]",116.0,0,0,4,0,1,1,0,2,0
4,7622406570,"[2, 5, 3, 2, 3, 2, 0]",378.0,0,0,7,1,0,3,2,0,1


## Daten skalieren

In [24]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise: the order amount, the basket size and every
# per-type book count column.
features = ["totalAmount", "orderedBooks"]
features.extend(name for name in train_df.columns if name.startswith("booktype_"))

# Fit the scaler on the training data only and overwrite the columns in place.
# NOTE: because train_df is overwritten, re-running this cell would fit the
# scaler on values that are already scaled.
scaler = StandardScaler()
train_df[features] = scaler.fit_transform(train_df[features])
train_df.head(n=5)

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks,booktype_0,booktype_1,booktype_2,booktype_3,booktype_4,booktype_5
0,9534310106,"[4, 3, 4]",-0.032823,1,1,-0.607275,-0.771563,-0.767737,-0.775046,0.075189,0.634586,-0.644106
1,7202594767,"[4, 2, 0, 2, 5]",-0.879459,0,0,0.133204,0.383487,-0.767737,1.514422,-0.880132,-0.122402,0.52709
2,2737331698,[5],-0.814334,0,0,-1.347754,-0.771563,-0.767737,-0.775046,-0.880132,-0.879391,0.52709
3,4868011733,"[1, 4, 2, 4]",-0.665475,0,0,-0.237036,-0.771563,0.39158,0.369688,-0.880132,0.634586,-0.644106
4,7622406570,"[2, 5, 3, 2, 3, 2, 0]",0.553309,0,0,0.873682,0.383487,-0.767737,2.659156,1.03051,-0.879391,0.52709


In [25]:
# Model inputs: everything except the target, the transaction id and the raw basket text.
x_train = train_df.drop(["returnLabel", "transactionId", "basket"], axis=1)
# Target: the return label of each transaction.
y_train = train_df.loc[:, "returnLabel"]

In [26]:
from sklearn.ensemble import RandomForestClassifier
# Hyper-parameters for the forest; random_state is fixed so repeated runs
# build the same trees.
params = {
    "n_estimators": 100,
    "max_features": "sqrt",
    "random_state": 0,
}
# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(**params).fit(x_train, y_train)
rf

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## Modell auf Testdaten anwenden

In [27]:
# Read the held-out test transactions from the project's data folder.
test_df = pd.read_csv(filepath_or_buffer="../data/test.csv")
# Show the first five rows as a quick sanity check of the raw columns.
test_df.head(n=5)

Unnamed: 0,transactionId,basket,customerType,totalAmount,returnLabel
0,4132523932,"[4, 3, 4, 3, 2, 3]",existing,366.0,1
1,8998574539,"[3, 4, 4, 3, 5]",existing,85.0,0
2,9346688547,"[1, 1, 2, 2, 4, 4, 3, 1, 1, 0, 3]",existing,275.0,0
3,4533897707,"[3, 2, 2, 1, 5, 1, 1, 0]",existing,528.0,0
4,3334800500,"[4, 2, 3, 5, 2, 5, 1]",existing,287.0,0


In [28]:
# Apply the same feature engineering to the test frame as to the training frame.
# NOTE: the customerType column is consumed by this step, so the cell cannot
# be re-run on the transformed frame without reloading the data first.
test_df = test_df.pipe(create_features)
test_df.head(n=5)

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks,booktype_0,booktype_1,booktype_2,booktype_3,booktype_4,booktype_5
0,4132523932,"[4, 3, 4, 3, 2, 3]",366.0,1,0,6,0,0,1,3,2,0
1,8998574539,"[3, 4, 4, 3, 5]",85.0,0,0,5,0,0,0,2,2,1
2,9346688547,"[1, 1, 2, 2, 4, 4, 3, 1, 1, 0, 3]",275.0,0,0,11,1,4,2,2,2,0
3,4533897707,"[3, 2, 2, 1, 5, 1, 1, 0]",528.0,0,0,8,1,3,2,1,0,1
4,3334800500,"[4, 2, 3, 5, 2, 5, 1]",287.0,0,0,7,0,1,2,1,1,2


In [29]:
# Rebuild the feature list from the training frame's columns so the test set
# is scaled on exactly the columns the scaler was fitted on.
features = [
    "totalAmount",
    "orderedBooks",
    *(name for name in train_df.columns if name.startswith("booktype_")),
]
# Only transform here: the scaler keeps the statistics learned from the training data.
test_df[features] = scaler.transform(test_df[features])
test_df.head(n=5)

Unnamed: 0,transactionId,basket,totalAmount,returnLabel,customerType_new,orderedBooks,booktype_0,booktype_1,booktype_2,booktype_3,booktype_4,booktype_5
0,4132523932,"[4, 3, 4, 3, 2, 3]",0.497487,1,0,0.503443,-0.771563,-0.767737,0.369688,1.98583,0.634586,-0.644106
1,8998574539,"[3, 4, 4, 3, 5]",-0.809682,0,0,0.133204,-0.771563,-0.767737,-0.775046,1.03051,0.634586,0.52709
2,9346688547,"[1, 1, 2, 2, 4, 4, 3, 1, 1, 0, 3]",0.074169,0,0,2.35464,0.383487,3.869533,1.514422,1.03051,0.634586,-0.644106
3,4533897707,"[3, 2, 2, 1, 5, 1, 1, 0]",1.251086,0,0,1.243922,0.383487,2.710216,1.514422,0.075189,-0.879391,0.52709
4,3334800500,"[4, 2, 3, 5, 2, 5, 1]",0.129991,0,0,0.873682,-0.771563,0.39158,1.514422,0.075189,-0.122402,1.698287


In [30]:
# Ground-truth labels of the test transactions.
y_test = test_df["returnLabel"]
# Model inputs: drop the target, the transaction id and the raw basket text,
# mirroring the columns removed for training.
X_test = test_df.drop(columns=["transactionId", "returnLabel", "basket"])
predictions = rf.predict(X_test)

In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Report how well the predicted return labels match the true test labels.
print("Accuracy :", accuracy_score(y_true=y_test, y_pred=predictions))
print("Precision:", precision_score(y_true=y_test, y_pred=predictions))
print("Recall   :", recall_score(y_true=y_test, y_pred=predictions))

Accuracy : 0.8507478864079775
Precision: 0.7535911602209945
Recall   : 0.7429193899782135
