## Load libraries

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, TargetEncoder, SplineTransformer, FunctionTransformer, Binarizer, MinMaxScaler
from sklearn.model_selection import train_test_split
from pathlib import Path
import os
from sklearn import set_config
from imblearn.over_sampling import SMOTE
import sys
path = Path.cwd().parent

# Absolute path to your package
sys.path.append(os.path.join(path))

from utils import convert_months_to_categorical, calculate_score, ClipValues

# Enable pandas output globally
set_config(transform_output="pandas")

## Load data

In [2]:

train_file_path = os.path.join(path, "raw", "train.csv")
test_file_path = os.path.join(path, "raw", "test.csv")

In [3]:
original_train_df = pd.read_csv(train_file_path, index_col=0)
test_df = pd.read_csv(test_file_path, index_col=0)

In [4]:
y_var = "y"
categorical_variables = original_train_df.select_dtypes(include="object").columns.to_list()
numerical_variables = (
    original_train_df
    .select_dtypes(exclude="object")
    .columns
    .drop(
        labels=["pdays", "y"]
    )
    .to_list()
)

### train val split

In [5]:
train_df, val_df = train_test_split(original_train_df, test_size=0.3)

## LogisticRegression with feature engineering

In [6]:
power_pipeline = Pipeline(
    [
        ("clip", ClipValues(quantile_range=(0.25, 0.75))),
        ("power_transform", PowerTransformer("yeo-johnson", standardize=True))
    ],
)

one_hot_pipeline = Pipeline(
    [
        ("one_hot", OneHotEncoder(drop="first", sparse_output=False))
    ]
)

previous_binarizer = Pipeline(
    [
        # ("clip", FunctionTransformer(clip_values_beyond_range, validate=False)),
        ("binarizer", Binarizer(threshold=0.01))
    ]
)

target_encoder = TargetEncoder()

age_scaler = MinMaxScaler()

preprocessing_pipeline_0 = ColumnTransformer(
    [
        ("numerical", power_pipeline, ["balance", "duration", "campaign"]),
        ("binarizer", previous_binarizer, ["previous"]),
        ("to_one_hot", one_hot_pipeline, ["default", "housing", "loan", "poutcome", "education", "contact", "job", "marital"]),
        ("target_encoder", target_encoder, ["month", "day"]),
        ("age", age_scaler, ["age"])
    ],
    verbose_feature_names_out=False
)

pipeline = Pipeline(
    [
        ("month_transform_to_categorical", FunctionTransformer(
            convert_months_to_categorical, 
            validate=False,
        )),
        ("preprocess_0", preprocessing_pipeline_0),
        # ("preprocess_1", preprocessing_pipeline_1),
    ]
)

In [7]:
fitted_pipeline = pipeline.fit(train_df, train_df[y_var])

In [8]:
train_df_preprocessed = fitted_pipeline.transform(train_df)

In [9]:
# SMOTE resampling
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(train_df_preprocessed, train_df["y"])

In [10]:
df_to_train = X_res.copy()
df_to_train["y"] = y_res

In [11]:
import statsmodels.formula.api as smf

In [12]:
# bs vs cr
# bs(age, df=4, degree=3, include_intercept=False)
predictors = df_to_train.columns.drop("y").to_list()
dfs = {
    "age": 3,
    "duration": 10,
    "balance": 3
}
predictors = [
    f"bs({col}, df={dfs[col]}, degree=3, include_intercept=False)" 
    # f"cr({col}, df=3)"
    if col in dfs else col
    for col in predictors
]
formula = "y ~ " + " + ".join(predictors)
formula

'y ~ bs(balance, df=3, degree=3, include_intercept=False) + bs(duration, df=10, degree=3, include_intercept=False) + campaign + previous + default_yes + housing_yes + loan_yes + poutcome_other + poutcome_success + poutcome_unknown + education_secondary + education_tertiary + education_unknown + contact_telephone + contact_unknown + job_blue_collar + job_entrepreneur + job_housemaid + job_management + job_retired + job_self_employed + job_services + job_student + job_technician + job_unemployed + job_unknown + marital_married + marital_single + month + day + bs(age, df=3, degree=3, include_intercept=False)'

In [13]:
# logit vs probit
logit_model = smf.logit(
    formula=formula,
    data=df_to_train,
    # drop_cols=["poutcome_other", "poutcome_unknown", "marital_married", "education_unknown"]
)

In [14]:
result = logit_model.fit(method="newton", maxiter=100)

Optimization terminated successfully.
         Current function value: 0.271441
         Iterations 10


In [15]:
result.summary2()

0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,y,Pseudo R-squared:,0.608
Date:,2025-08-10 19:41,AIC:,501328.8753
No. Observations:,923292,BIC:,501856.9819
Df Model:,44,Log-Likelihood:,-2.5062e+05
Df Residuals:,923247,LL-Null:,-6.3998e+05
Converged:,1.0000,LLR p-value:,0.0000
No. Iterations:,10.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-16.2922,0.9033,-18.0361,0.0000,-18.0627,-14.5218
"bs(balance, df=3, degree=3, include_intercept=False)[0]",-9.4972,0.7178,-13.2314,0.0000,-10.9040,-8.0904
"bs(balance, df=3, degree=3, include_intercept=False)[1]",11.4148,0.3850,29.6499,0.0000,10.6603,12.1694
"bs(balance, df=3, degree=3, include_intercept=False)[2]",-4.7172,0.5242,-8.9981,0.0000,-5.7447,-3.6897
"bs(duration, df=10, degree=3, include_intercept=False)[0]",32.1282,0.9557,33.6170,0.0000,30.2551,34.0014
"bs(duration, df=10, degree=3, include_intercept=False)[1]",8.0128,0.5371,14.9187,0.0000,6.9601,9.0655
"bs(duration, df=10, degree=3, include_intercept=False)[2]",17.7180,0.6046,29.3038,0.0000,16.5329,18.9030
"bs(duration, df=10, degree=3, include_intercept=False)[3]",19.7466,0.5943,33.2273,0.0000,18.5819,20.9114
"bs(duration, df=10, degree=3, include_intercept=False)[4]",21.6524,0.6011,36.0216,0.0000,20.4743,22.8306


In [16]:
# train_df_preprocessed = fitted_pipeline.transform(train_df)
train_pred_probs = result.predict(train_df_preprocessed)
calculate_score(train_df["y"], train_pred_probs)

{'auc': np.float64(0.948876607490529),
 'accuracy': 0.8690152380952381,
 'f1': 0.6257802254014726}

In [17]:
val_df_preprocessed = fitted_pipeline.transform(val_df)
pred_probs = result.predict(val_df_preprocessed)
calculate_score(val_df["y"], pred_probs)

{'auc': np.float64(0.9483068171323303),
 'accuracy': 0.8696222222222222,
 'f1': 0.6262152622927842}

## Error analysis

In [18]:

errors_df = val_df.copy()
errors_df["pred_probs"] = pred_probs
errors_df["preds"] = (pred_probs > 0.5)
errors_df["preds"] = errors_df["preds"].astype(int)
errors_df[errors_df["preds"] != errors_df["y"]].head()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,pred_probs,preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
617240,31,management,married,tertiary,no,652,no,no,cellular,20,aug,488,6,-1,0,unknown,0,0.559923,1
390884,41,blue-collar,married,primary,no,1872,yes,no,unknown,29,may,873,1,-1,0,unknown,0,0.584415,1
576116,34,technician,single,tertiary,no,21,no,no,cellular,27,aug,605,14,-1,0,unknown,0,0.830006,1
59967,43,blue-collar,married,primary,no,1499,no,no,cellular,22,may,787,2,-1,0,unknown,0,0.898198,1
601233,53,self-employed,married,secondary,no,631,no,yes,cellular,10,jul,811,1,-1,0,unknown,0,0.911156,1


In [19]:
pd.set_option('display.max_rows', 100)

In [20]:
sample_errors_df = errors_df[
    ( errors_df["job"] == "management" )
    & ( errors_df["education"] == "tertiary" )
    & ( errors_df["default"] == "no" )
    & ( errors_df["housing"] == "yes" )
    & ( errors_df["loan"] == "no" )
    & ( errors_df["contact"] == "unknown" )
    # & ( errors_df["y"] == 0 )
    # & ( errors_df["preds"] == 1 )
    & ( errors_df["marital"] == "single" )
    & ( errors_df["month"] == "jun" )
    & ( errors_df["duration"] > 600 )
]
sample_errors_df.head(10)

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,pred_probs,preds
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
51327,28,management,single,tertiary,no,676,yes,no,unknown,3,jun,882,3,-1,0,unknown,0,0.904557,1
35408,34,management,single,tertiary,no,53,yes,no,unknown,3,jun,1416,4,-1,0,unknown,1,0.97723,1
243468,25,management,single,tertiary,no,7103,yes,no,unknown,3,jun,661,1,-1,0,unknown,1,0.904032,1
145928,26,management,single,tertiary,no,853,yes,no,unknown,9,jun,605,3,-1,0,unknown,0,0.527279,1
577366,30,management,single,tertiary,no,431,yes,no,unknown,2,jun,847,1,-1,0,unknown,0,0.911736,1
498518,38,management,single,tertiary,no,572,yes,no,unknown,2,jun,767,4,-1,0,unknown,1,0.797808,1
577543,29,management,single,tertiary,no,16,yes,no,unknown,4,jun,1806,2,-1,0,unknown,1,0.987121,1
239896,43,management,single,tertiary,no,1646,yes,no,unknown,12,jun,658,2,-1,0,unknown,1,0.747655,1
628065,33,management,single,tertiary,no,2420,yes,no,unknown,3,jun,766,1,-1,0,unknown,0,0.916477,1
513692,28,management,single,tertiary,no,1741,yes,no,unknown,18,jun,705,10,-1,0,unknown,1,0.643518,1


In [22]:
sample_errors_df.pivot_table(index="y", columns="preds", values="age", aggfunc='count',)

preds,1
y,Unnamed: 1_level_1
0,28
1,25


## Submission of results

In [43]:
test_predictions = fitted_pipeline.transform(test_df)
test_pred_probs = result.predict(test_predictions)
test_pred_probs

id
750000    0.118858
750001    0.878584
750002    0.003427
750003    0.000903
750004    0.362252
            ...   
999995    0.001404
999996    0.393326
999997    0.940825
999998    0.043762
999999    0.351854
Length: 250000, dtype: float64

In [44]:
submission_lr_balanced = test_df[[]]
submission_lr_balanced = submission_lr_balanced.copy()
submission_lr_balanced.loc[:, "y"] = test_pred_probs
submission_lr_balanced.to_csv("submission_statsmodels.csv")