## Load libraries

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, TargetEncoder, SplineTransformer, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from pathlib import Path
import os
from sklearn import set_config
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# Enable pandas output globally
set_config(transform_output="pandas")

## Load data

In [2]:
path = Path.cwd().parent
train_file_path = os.path.join(path, "raw", "train.csv")
test_file_path = os.path.join(path, "raw", "test.csv")

In [3]:
original_train_df = pd.read_csv(train_file_path, index_col=0)
test_df = pd.read_csv(test_file_path, index_col=0)

In [4]:
y_var = "y"
categorical_variables = original_train_df.select_dtypes(include="object").columns.to_list()
numerical_variables = (
    original_train_df
    .select_dtypes(exclude="object")
    .columns
    .drop(
        labels=["pdays", "y"]
    )
    .to_list()
)

In [5]:
import sys

# Absolute path to your package
sys.path.append(os.path.join(path))

from utils import convert_months_to_categorical
convert_months_to_categorical(original_train_df)["month"]

id
0         aug
1         jun
2         may
3         may
4         feb
         ... 
749995    jul
749996    aug
749997    apr
749998    aug
749999    aug
Name: month, Length: 750000, dtype: category
Categories (12, object): ['jan' < 'feb' < 'mar' < 'apr' ... 'sep' < 'oct' < 'nov' < 'dec']

### train val split

In [6]:
train_df, val_df = train_test_split(original_train_df, test_size=0.3)

## create first pipeline without feature selection

### pipeline and fitting

In [7]:
categorical_pipeline = Pipeline(
    [
        ("onehotencoding", OneHotEncoder(drop="first", sparse_output=False)),
    ]
    
)

pipeline = Pipeline(
    [
        ("month_transform_to_categorical", FunctionTransformer(convert_months_to_categorical, validate=False)),
        ("column_transformer", ColumnTransformer(
            [
                ("numerical", "passthrough", numerical_variables),
                ("categorical", categorical_pipeline, categorical_variables),
            ]
            )
        ),
        ("logistic_classifier", LogisticRegression(solver="newton-cholesky"))
    ]
)

In [8]:
fitted_pipeline = pipeline.fit(train_df, train_df[y_var])

### score

In [9]:
train_predictions_0 = fitted_pipeline.predict(train_df)
train_predictions_0

array([0, 0, 0, ..., 0, 0, 0], shape=(525000,))

In [10]:
auc_score_0 = roc_auc_score(train_df[y_var], train_predictions_0)
accuracy_score_0 = accuracy_score(train_df[y_var], train_predictions_0)
f1_score_0 = f1_score(train_df[y_var], train_predictions_0)
print(f"On Train AUC: {auc_score_0:.4f}, Accuracy: {accuracy_score_0:.4f}, F1 score: {f1_score_0:.4f}")

On Train AUC: 0.7403, Accuracy: 0.9161, F1 score: 0.5943


In [11]:
val_predictions_0 = fitted_pipeline.predict(val_df)
val_predictions_0

array([0, 0, 0, ..., 1, 0, 0], shape=(225000,))

In [12]:
auc_score_0 = roc_auc_score(val_df[y_var], val_predictions_0)
accuracy_score_0 = accuracy_score(val_df[y_var], val_predictions_0)
f1_score_0 = f1_score(val_df[y_var], val_predictions_0)
print(f"On Val AUC: {auc_score_0:.4f}, Accuracy: {accuracy_score_0:.4f}, F1 score: {f1_score_0:.4f}")

On Val AUC: 0.7381, Accuracy: 0.9156, F1 score: 0.5894


## Submission for simple model

In [13]:
test_predictions = fitted_pipeline.predict(test_df)
test_predictions

array([0, 0, 0, ..., 0, 0, 0], shape=(250000,))

In [14]:
submission_0 = test_df[[]]
submission_0 = submission_0.copy()
submission_0.loc[:, "y"] = test_predictions
submission_0.to_csv("submission_0.csv")

## LogisticRegression with feature engineering

In [None]:
power_pipeline = Pipeline(
    [
        ("power_transform", PowerTransformer("yeo-johnson", standardize=True))
    ],
)

one_hot_pipeline = Pipeline(
    [
        ("one_hot", OneHotEncoder(drop="first", sparse_output=False))
    ]
)

target_encoder = TargetEncoder()

preprocessing_pipeline_0 = ColumnTransformer(
    [
        ("numerical", power_pipeline, ["age", "balance", "duration", "campaign", "previous"]),
        ("to_one_hot", one_hot_pipeline, ["default", "housing", "loan", "poutcome"]),
        ("target_encoder", target_encoder, ["month", "day", "job", "marital", "education"]),
    ],
    verbose_feature_names_out=False
)

preprocessing_pipeline_1 = ColumnTransformer(
    [
        ("spline", SplineTransformer(n_knots=3), ["age"]),
    ],
    remainder="passthrough"
)

pipeline = Pipeline(
    [
        ("month_transform_to_categorical", FunctionTransformer(
            convert_months_to_categorical, 
            validate=False,
        )),
        ("preprocess_0", preprocessing_pipeline_0),
        ("preprocess_1", preprocessing_pipeline_1),
        ("logistic", LogisticRegression(
            solver="newton-cholesky",
            class_weight="balanced"
        ))
    ]
)

In [16]:
fitted_pipeline = pipeline.fit(train_df, train_df[y_var])

### score

In [17]:
train_predictions_1 = fitted_pipeline.predict(train_df)
train_predictions_1

array([0, 0, 0, ..., 1, 1, 0], shape=(525000,))

In [18]:
auc_score_1 = roc_auc_score(train_df[y_var], train_predictions_1)
accuracy_score_1 = accuracy_score(train_df[y_var], train_predictions_1)
f1_score_1 = f1_score(train_df[y_var], train_predictions_1)
print(f"On Train AUC: {auc_score_1:.4f}, Accuracy: {accuracy_score_1:.4f}, F1 score: {f1_score_1:.4f}")

On Train AUC: 0.8744, Accuracy: 0.8523, F1 score: 0.5967


In [23]:
import numpy as np
np.unique(train_predictions_1, return_counts=True)

(array([0, 1]), array([396248, 128752]))

## Errors analysis

In [27]:
errors_train_df = train_df.copy()
errors_train_df["proba"] = fitted_pipeline.predict_proba(train_df)[:, 0]
errors_train_df["preds"] = train_predictions_1
errors_train_df.loc[587133]

age                 59
job            retired
marital        married
education    secondary
default             no
balance           3232
housing             no
loan                no
contact       cellular
day                 18
month              nov
duration           564
campaign             1
pdays               -1
previous             0
poutcome       unknown
y                    0
proba         0.131731
preds                1
Name: 587133, dtype: object

In [30]:
fitted_pipeline["logistic"].coef_

array([[-0.52888346, -2.05517994, -3.70675791, -3.24226163,  9.53308294,
         0.17625889,  2.32679572, -0.24608013,  0.27088326,  0.11493894,
        -0.9770349 , -0.63069084,  0.15584671,  2.83054179,  0.52780303,
         6.08052971,  6.22308446,  2.95367039,  3.69039711,  6.38888648]])

## Submission of more complex model

In [20]:
test_predictions = fitted_pipeline.predict(test_df)
test_predictions

array([0, 1, 0, ..., 1, 0, 0], shape=(250000,))

In [21]:
submission_lr_balanced = test_df[[]]
submission_lr_balanced = submission_lr_balanced.copy()
submission_lr_balanced.loc[:, "y"] = test_predictions
submission_lr_balanced.to_csv("submission_lr_balanced.csv")