## Load Libraries

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import TargetEncoder, FunctionTransformer, Binarizer, OrdinalEncoder
import imblearn.pipeline as imb
from sklearn.model_selection import train_test_split
from pathlib import Path
import os
from sklearn import set_config
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import SMOTE
import sys
import xgboost as xgb
path = Path.cwd().parent

# Absolute path to your package
sys.path.append(os.path.join(path))

from utils import convert_months_to_categorical, calculate_score, ClipValues, SinePreprocess

# Enable pandas output globally
set_config(transform_output="pandas")

## Load Data

In [2]:

train_file_path = os.path.join(path, "raw", "train.csv")
test_file_path = os.path.join(path, "raw", "test.csv")

In [3]:
original_train_df = pd.read_csv(train_file_path, index_col=0)
test_df = pd.read_csv(test_file_path, index_col=0)

In [4]:
y_var = "y"
categorical_variables = original_train_df.select_dtypes(include="object").columns.to_list()
numerical_variables = (
    original_train_df
    .select_dtypes(exclude="object")
    .columns
    .drop(
        labels=["y"]
    )
    .to_list()
)

## Train val split

In [5]:
train_df, val_df = train_test_split(original_train_df, test_size=0.3)

## Preprocessing and train Pipeline

In [269]:
def add_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["balance_negative_times_default_yes"] = (df["balance"] < 0) * (df['default'] == "yes")
    df["balance_negative_times_loan_yes"] = (df["balance"] < 0) * (df['loan'] == "yes")
    df["balance_negative"] = (df["balance"] < 0).astype(int)
    df["default_yes_times_loan_yes"] = (df['default'] == "yes") * (df['loan'] == "yes")
    df["contact_unknown_times_default_yes"] = (df['contact'] == "unknown") * (df['default'] == "yes")
    df["duration_times_loan_yes"] = (df["duration"]) * (df['loan'] == "yes")
    df["duration_times_housing_yes"] = (df["duration"]) * (df['housing'] == "yes")
    df["duration_times_contact_unknown"] = (df["duration"]) * (df['contact'] == "unknown")
    df["duration_times_age"] = (df["duration"] < 53) * (df["age"])
    df["balance_negative_times_poutcome_unknown"] = (df["balance"] < 0) * (df['poutcome'] == "unknown")
    df["duration_times_default_yes"] = (df["duration"]) * (df['default'] == "yes")
    df["duration_times_education_primary"] = (df["duration"]) * (df['education'] == "primary")
    df["pdays_by_campaign"] = (df["pdays"]) / (df['campaign'])
    df["previous_by_campaign"] = (df["previous"]) / (df['campaign'] + 1e-8)
    df["pdays_times_previous"] = (df["pdays"]) * (df['previous'])
    df["day_of_year"] = df["month"].cat.codes * 30 + df["day"]
    df["duration_sq"] = df["duration"] ** 2
    for col in ["default", "housing", "loan", "poutcome", "education", "contact", "job", "marital"]:
        df[col] = pd.Categorical(df[col])
    return df


power_pipeline = Pipeline(
    [
        ("passthrough", "passthrough"),
        # ("clip", ClipValues(quantile_range=(0.25, 0.75))),
        # ("power_transform", PowerTransformer("yeo-johnson", standardize=True)),
        # ("splines", SplineTransformer(n_knots=10, knots="quantile"))
    ],
)

cat_encoder = Pipeline(
    [
        ("encoder", OrdinalEncoder()),
        
    ]
)

target_encoder = TargetEncoder()
sin_preprocess = SinePreprocess(
    {
        "day": 31,
        "month": 12,
        "day_of_year": 360
    }
)

preprocessing_pipeline_0 = ColumnTransformer(
    [
        ("numerical", power_pipeline, [
            "balance", "duration", "campaign", "pdays", "age", 
        ]),
        ("binarizer", Binarizer(threshold=0.01), ["balance", "pdays"]),
        ("encode", "passthrough", ["default", "housing", "loan", "poutcome", "education", "contact", "job", "marital"]),
        ("target_encoder", target_encoder, ["day", "month", "default", "housing", "loan", "poutcome", "education", "contact", "job", "marital"]),
        ("sine_preproces", sin_preprocess, ["day", "month", "day_of_year"]),
        ("drop", "drop", ["previous"])
    ],
    remainder="passthrough"
    # verbose_feature_names_out=False
)

pipeline = imb.Pipeline(
    [
        ("month_transform_to_categorical", FunctionTransformer(
            convert_months_to_categorical, 
            validate=False,
        )),
        ("add_columns", FunctionTransformer(
            add_columns, 
            validate=False,
        )),
        ("preprocess_0", preprocessing_pipeline_0),
        ("drop_not_needed", 
         ColumnTransformer(transformers=[
            ("drop", "drop", [
                "binarizer__pdays",
                "remainder__balance_negative_times_default_yes",
                "remainder__duration_times_age",
            ]),
        ], remainder="passthrough", verbose_feature_names_out=False)
        ),
        # ("smote", SMOTE(random_state=42)),
        ("classifier", xgb.XGBClassifier(
            # 
            tree_method="hist", 
            enable_categorical=True,
            objective="binary:logistic",
            # learning_rate=0.1
            colsample_bytree=0.5,
            max_depth = 12,
            subsample = 0.5,
            reg_lambda = 0.001,
            reg_alpha = 0.01,
            **{"lambda": 0.001,
               "alpha": 0.01}
        ))
    ]
)

## Outlier detection

In [270]:
import numpy as np

In [271]:
iso_forest = IsolationForest(max_samples=10000)

In [272]:
outliers = iso_forest.fit_predict(
    train_df.select_dtypes(exclude="object").drop("y", axis=1),
    )

In [273]:
np.unique(outliers, return_counts=True)

(array([-1,  1]), array([ 15996, 509004]))

## Training

In [274]:
fitted_pipeline = pipeline.fit(train_df.iloc[outliers == 1].drop("y", axis=1), train_df.iloc[outliers == 1][y_var])

## Evaluation

In [275]:
# train_df_preprocessed = fitted_pipeline.transform(train_df)
train_pred_probs = fitted_pipeline.predict_proba(train_df)[:, 1]
calculate_score(train_df["y"], train_pred_probs)

{'auc': np.float64(0.9850131900396719),
 'accuracy': 0.9576647619047619,
 'f1': 0.8186757603446025}

In [276]:
# train_df_preprocessed = fitted_pipeline.transform(train_df)
val_pred_probs = fitted_pipeline.predict_proba(val_df)[:, 1]
calculate_score(val_df["y"], val_pred_probs)

{'auc': np.float64(0.9588542354467262),
 'accuracy': 0.9284577777777778,
 'f1': 0.690364898917037}

In [277]:
feature_importances = pd.DataFrame.from_dict(fitted_pipeline[-1].get_booster().get_score(importance_type='gain'), orient="index")
feature_importances.head()

Unnamed: 0,0
numerical__balance,3.365643
numerical__duration,6.494366
numerical__campaign,2.896157
numerical__pdays,6.423628
numerical__age,4.676598


In [278]:
feature_importances.sort_values(by=0)

Unnamed: 0,0
target_encoder__default,2.420257
target_encoder__housing,2.629286
numerical__campaign,2.896157
target_encoder__education,2.905272
target_encoder__contact,2.934781
target_encoder__marital,2.95949
remainder__balance_negative_times_loan_yes,3.033459
target_encoder__day,3.035996
remainder__default_yes_times_loan_yes,3.09021
target_encoder__job,3.104374
