In [1]:
# import packages
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
# Read the cleaned bank dataset from the processed-data folder into `df`.
df = pd.read_csv(filepath_or_buffer=Path("../data/processed/bank_clean.csv"))
# Trailing expression: display (n_rows, n_columns) as the cell output.
df.shape

(41188, 22)

In [3]:
# Add the two analysis columns in a single step:
#   treatment -> 1 where `contact` equals "cellular", 0 otherwise
#   outcome   -> copy of the existing `y_bin` column
df = df.assign(
    treatment=df["contact"].eq("cellular").astype(int),
    outcome=df["y_bin"],
)

In [4]:
# Build the modeling frame without the columns that `treatment` / `outcome`
# were derived from (`contact`, `y_bin`) and without `y`
# (presumably the raw label behind `y_bin` — confirm against the cleaning step).
df_model = df.drop(["contact", "y", "y_bin"], axis="columns")

In [5]:
# Expand each listed categorical column into indicator columns.
# drop_first=True omits the first level of every variable, so that level
# acts as the implicit reference category.
df_model = df_model.pipe(
    pd.get_dummies,
    columns=[
        "job", "marital", "education",
        "default", "housing", "loan",
        "month", "day_of_week", "poutcome",
    ],
    drop_first=True,
)

In [6]:
# log transform skewed variables
# Replace `campaign` and `previous` with log(1 + x).
# The untransformed values are read from `df`, not from `df_model`: `df` is
# never modified for these columns, so re-running this cell yields the same
# result instead of stacking a second log1p on already-transformed values.
# `df_model` was derived from `df` by column drops and dummy encoding only,
# so both frames share the same row index and the assignment aligns row-for-row.
for col in ["campaign", "previous"]:
    df_model[col] = np.log1p(df[col])

In [None]:
# Feature columns: every column of df_model except the treatment flag
# and the outcome, in their existing column order.
X_cols = [
    name
    for name in df_model.columns
    if name != "treatment" and name != "outcome"
]
# Trailing expression: display (n_rows, n_columns) of the modeling frame.
df_model.shape

(41188, 54)

In [8]:
# Write the modeling frame to the processed-data folder as CSV.
# index=False keeps the row index out of the file.
processed_dir = Path("../data/processed")
df_model.to_csv(processed_dir.joinpath("bank_model_ready.csv"), index=False)