In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from features.transformers import ItemSelector
from sklearn.linear_model import LogisticRegression

In [2]:
# Location of the input CSV. DATA_PATH ends with a slash because the full
# path is built by joining it directly with DATASET_FILE_NAME as strings.
DATA_PATH = "../data/"
DATASET_FILE_NAME = "dataset.csv"

In [3]:
# Read the dataset located at DATA_PATH + DATASET_FILE_NAME into a DataFrame.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# looks like an index that was written into the CSV; consider `index_col=0`
# after confirming nothing relies on that column.
df = pd.read_csv(f"{DATA_PATH}{DATASET_FILE_NAME}")

In [4]:
# Class balance of the `cancelled` column, which is used as the target below.
df["cancelled"].value_counts()

0    99550
1    60692
Name: cancelled, dtype: int64

In [5]:
# Preview the first rows to inspect the available columns.
df.head()

Unnamed: 0,default,product_id,cancelled
0,0,IS0002,0
1,0,IS0001,1
2,0,CC0002,1
3,1,IS0001,1
4,1,IS0001,1


## Split Dataset

In [6]:
# Model inputs are the `product_id` and `default` columns; the target is the
# `cancelled` label.
X = df.loc[:, ["product_id", "default"]]
y = df.loc[:, "cancelled"]

# Hold out part of the rows for evaluation. The fixed seed keeps the split
# reproducible; no test size is passed, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Model

In [7]:
# Hyperparameters for the logistic regression, selected by an earlier grid
# search (the search itself is not part of the cells shown here).
params = dict(C=1e-05, class_weight="balanced")

In [8]:
# Feature branches combined side by side by the FeatureUnion:
#   * "pid": pick `product_id` and one-hot encode it; categories not seen
#     during fit are ignored at transform time instead of raising.
#   * "default": pick the `default` column as-is.
# ItemSelector comes from the project's features.transformers module;
# presumably it selects the column named by `key` - confirm there.
transformer_list = [
    (
        "pid",
        Pipeline(
            steps=[
                ("pid_selector", ItemSelector(key="product_id")),
                (
                    "pid_one_hot",
                    OneHotEncoder(categories="auto", handle_unknown="ignore"),
                ),
            ]
        ),
    ),
    ("default", ItemSelector(key="default")),
]

# Full model: feature extraction followed by the logistic regression
# configured with the `params` defined earlier.
model = Pipeline(
    steps=[
        ("features", FeatureUnion(transformer_list=transformer_list)),
        ("clf", LogisticRegression(**params)),
    ]
)

# Fit on the training split; the fitted pipeline is the cell's displayed value.
model.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('pid',
                                                 Pipeline(steps=[('pid_selector',
                                                                  ItemSelector(key='product_id')),
                                                                 ('pid_one_hot',
                                                                  OneHotEncoder(handle_unknown='ignore'))])),
                                                ('default',
                                                 ItemSelector(key='default'))])),
                ('clf', LogisticRegression(C=1e-05, class_weight='balanced'))])

In [9]:
# Predict labels for the held-out test rows with the fitted pipeline.
y_pred = model.predict(X_test)

In [10]:
# Report the hold-out metrics with exactly two decimals. The `f` presentation
# type matters: a bare `.2` means "two significant digits", so 0.70 would be
# printed as 0.7 and 0.0512 as 0.051.
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
print(f'Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred):.2f}')
print(f'F1: {f1_score(y_test, y_pred):.2f}')

Accuracy: 0.71
Balanced Accuracy: 0.68
F1: 0.59
