## Load Libraries

In [1]:
import pandas as pd
# from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, TargetEncoder, SplineTransformer, FunctionTransformer, Binarizer, MinMaxScaler, OrdinalEncoder
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from pathlib import Path
import os
from sklearn import set_config
from imblearn.over_sampling import SMOTE
import sys
path = Path.cwd().parent

# Absolute path to your package
sys.path.append(os.path.join(path))

from utils import convert_months_to_categorical, calculate_score, ClipValues, SinePreprocess

# Enable pandas output globally
set_config(transform_output="pandas")

## Load Data

In [2]:

train_file_path = os.path.join(path, "raw", "train.csv")
test_file_path = os.path.join(path, "raw", "test.csv")

In [3]:
original_train_df = pd.read_csv(train_file_path, index_col=0)
test_df = pd.read_csv(test_file_path, index_col=0)

In [None]:
y_var = "y"
categorical_variables = original_train_df.select_dtypes(include="object").columns.to_list()
numerical_variables = (
    original_train_df
    .select_dtypes(exclude="object")
    .columns
    .drop(
        labels=["y"]
    )
    .to_list()
)

## Train val split

In [5]:
train_df, val_df = train_test_split(original_train_df, test_size=0.3)

## Preprocessing and train pipeline

In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
power_pipeline = Pipeline(
    [
        ("passthrough", "passthrough")
        # ("clip", ClipValues(quantile_range=(0.25, 0.75))),
        # ("power_transform", PowerTransformer("yeo-johnson", standardize=True)),
        # ("splines", SplineTransformer(n_knots=10, knots="quantile"))
    ],
)

cat_encoder = Pipeline(
    [
        ("encoder", OrdinalEncoder()),
        
    ]
)

target_encoder = TargetEncoder()
sin_preprocess = SinePreprocess(
    {
        "day": 31,
        "month": 12
    }
)

preprocessing_pipeline_0 = ColumnTransformer(
    [
        ("numerical", power_pipeline, ["balance", "duration", "campaign", "pdays", "previous", "age"]),
        ("encode", cat_encoder, ["default", "housing", "loan", "poutcome", "education", "contact", "job", "marital"]),
        ("target_encoder", target_encoder, ["month"]),
        ("sine_preproces", sin_preprocess, ["day", "month"]),
    ],
    verbose_feature_names_out=False
)

pipeline = Pipeline(
    [
        ("month_transform_to_categorical", FunctionTransformer(
            convert_months_to_categorical, 
            validate=False,
        )),
        # ("interactions", interactions_transformer),
        ("preprocess_0", preprocessing_pipeline_0),
        # ("smote", SMOTE(random_state=42)),
        ("logistic_regression", HistGradientBoostingClassifier(
            learning_rate = 0.1,
            class_weight = "balanced"
        ))
    ]
)

In [8]:
fitted_pipeline = pipeline.fit(train_df, train_df[y_var])

## Evaluation

In [9]:
# train_df_preprocessed = fitted_pipeline.transform(train_df)
train_pred_probs = fitted_pipeline.predict_proba(train_df)[:, 1]
calculate_score(train_df["y"], train_pred_probs)

{'auc': np.float64(0.9658902774249954),
 'accuracy': 0.882312380952381,
 'f1': 0.6580098081542737}

In [10]:
# train_df_preprocessed = fitted_pipeline.transform(train_df)
val_pred_probs = fitted_pipeline.predict_proba(val_df)[:, 1]
calculate_score(val_df["y"], val_pred_probs)

{'auc': np.float64(0.9647730857229863),
 'accuracy': 0.8813911111111111,
 'f1': 0.656851525632948}

## Model Review

In [11]:
sample = val_df.sample(10000, random_state=42)
sample.head()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
534278,26,services,single,secondary,no,102,no,no,cellular,28,jul,477,3,-1,0,unknown,0
468194,35,management,married,tertiary,no,0,no,no,cellular,31,jul,88,2,-1,0,unknown,0
695506,47,technician,married,secondary,no,-432,no,yes,cellular,12,aug,645,8,-1,0,unknown,0
2653,25,management,single,tertiary,no,1776,yes,no,cellular,24,sep,479,1,184,1,success,1
175452,27,blue-collar,single,secondary,no,496,yes,no,unknown,9,may,137,3,-1,0,unknown,0


In [13]:
from sklearn.inspection import permutation_importance

result = permutation_importance(
    fitted_pipeline, sample, sample["y"], n_repeats=10, random_state=42
)

importances_perm = result.importances_mean
# for name, score in zip(feature_names, importances_perm):
#     print(f"{name}: {score:.4f}")


In [None]:
for name, score in zip(sample.columns, importances_perm):
    print(f"{name}: {score:.4f}")

age: 0.0036
job: 0.0001
marital: 0.0001
education: -0.0002
default: 0.0000
balance: 0.0073
housing: 0.0066
loan: 0.0015
contact: 0.0211
day: 0.0172
month: 0.0415
duration: 0.1327
campaign: 0.0035
pdays: 0.0111
previous: 0.0002
poutcome: 0.0070
y: 0.0000


## Submission

In [None]:
test_pred_probs = fitted_pipeline.predict_proba(test_df)
test_pred_probs

In [None]:
submission_lr_balanced = test_df[[]]
submission_lr_balanced = submission_lr_balanced.copy()
submission_lr_balanced.loc[:, "y"] = test_pred_probs[:, 1]
submission_lr_balanced.to_csv("submission_boosting.csv")