In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report, auc
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

In [2]:
# Load the train and test CSVs (read in that order).
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

# The original dataset is ';'-separated; binarize its target column:
# 1 when the label is "yes", 0 for anything else.
original = pd.read_csv("bank-full.csv", sep=";")
original['y'] = original['y'].map(lambda label: int(label == "yes"))

---

## Findings

- "day" can be encoded or transformed in some way, since it's not ratio data.
- "pdays" uses "-1" to mark missing data; otherwise it is useful ratio data. People often get called again after roughly a quarter, half, or full year.

---

In [6]:
## Cast 'day' to str in both frames so that it is treated as a categorical
## (object) column by the target-mean encoding applied later.
for _frame in (train, original):
    _frame['day'] = _frame['day'].astype(str)

In [None]:
## convert 

---

In [7]:
# Target-mean encode the categorical columns, using statistics from `original` only.
#
# NOTE(review): the rows of `original` are appended to `train` and then encoded
# with means that include their own label, so the "*_mean" features leak the
# target for those rows. Consider out-of-fold encoding if that matters.

# Step 1: Categorical columns = object-dtype columns of `original`
cat_cols = original.select_dtypes(include=['object']).columns

# Step 2: Concatenate original + synthetic.
# Guarded so that re-running this cell does not append `original` a second time:
# the "<col>_mean" columns exist on `train` only after this cell has already run.
if not any(f"{col}_mean" in train.columns for col in cat_cols):
    train = pd.concat([train, original], ignore_index=True)

# Step 3: Compute target means from original, and map to full train.
# Categories never seen in `original` (or missing values) would map to NaN;
# fall back to the overall positive rate of `original` instead.
global_mean = original['y'].mean()
for col in cat_cols:
    te_map = original.groupby(col)['y'].mean().to_dict()           # mean target per category
    train[col + "_mean"] = train[col].map(te_map).fillna(global_mean)  # apply to full train

In [8]:
# Display the combined frame to inspect the newly added "<col>_mean" columns.
train

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,...,job_mean,marital_mean,education_mean,default_mean,housing_mean,loan_mean,contact_mean,month_mean,poutcome_mean,day_mean
0,0.0,42,technician,married,secondary,no,7,no,no,cellular,...,0.110570,0.101235,0.105594,0.117961,0.167024,0.126557,0.149189,0.110133,0.091615,0.158333
1,1.0,38,blue-collar,married,secondary,no,514,no,no,unknown,...,0.072750,0.101235,0.105594,0.117961,0.167024,0.126557,0.040707,0.102228,0.091615,0.098787
2,2.0,36,blue-collar,married,secondary,no,602,yes,no,unknown,...,0.072750,0.101235,0.105594,0.117961,0.077000,0.126557,0.040707,0.067195,0.091615,0.113636
3,3.0,27,student,single,secondary,no,34,yes,no,unknown,...,0.286780,0.149492,0.105594,0.117961,0.077000,0.126557,0.040707,0.067195,0.091615,0.078142
4,4.0,26,technician,married,secondary,no,889,yes,no,cellular,...,0.110570,0.101235,0.105594,0.117961,0.077000,0.126557,0.149189,0.166478,0.091615,0.164968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
840417,,51,technician,married,tertiary,no,825,no,no,cellular,...,0.110570,0.101235,0.150064,0.117961,0.167024,0.126557,0.149189,0.101511,0.091615,0.090768
840418,,71,retired,divorced,primary,no,1729,no,no,cellular,...,0.227915,0.119455,0.086265,0.117961,0.167024,0.126557,0.149189,0.101511,0.091615,0.090768
840419,,72,retired,married,secondary,no,5715,no,no,cellular,...,0.227915,0.101235,0.105594,0.117961,0.167024,0.126557,0.149189,0.101511,0.647253,0.090768
840420,,57,blue-collar,married,secondary,no,668,no,no,telephone,...,0.072750,0.101235,0.105594,0.117961,0.167024,0.126557,0.134205,0.101511,0.091615,0.090768
