In [1]:
import pandas as pd
import numpy as np
import joblib
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import Pipeline as ImbPipeline

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and data-quality warnings raised by the libraries used
# below. Consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")


## Load Dataset

In [2]:
# Load the dataset exported by the EDA step and preview its first rows.
DATA_FILE = "data/df_EDA_data.csv"
df = pd.read_csv(DATA_FILE)
df.head()



Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscription
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,59,admin.,married,professional.course,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


#### Data cleaning: missing values, unknown entries, nulls, and NaNs were already removed in the EDA file

### Define Feature Groups

In [3]:
# Partition the columns by dtype in a single pass over df.dtypes:
#   - int64 / float64 columns are treated as numerical features
#   - object (string) columns are treated as categorical features
# The target column is still included in this listing at this point.
NUMERIC_DTYPES = ("int64", "float64")
column_dtypes = df.dtypes

NUMERICAL_FEATURES = [name for name, dtype in column_dtypes.items() if dtype in NUMERIC_DTYPES]
CATEGORICAL_FEATURES = [name for name, dtype in column_dtypes.items() if dtype == "object"]

# Show both groups so the partition can be checked by eye
print("Numerical Features:", NUMERICAL_FEATURES)
print("Categorical Features:", CATEGORICAL_FEATURES)


Numerical Features: ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
Categorical Features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'subscription']


In [4]:
# Exclude the target from the categorical feature list.
# Rebinding to a filtered copy (instead of list.remove) keeps this cell
# idempotent: list.remove raises ValueError when the cell is run a second time
# because the entry is already gone.
CATEGORICAL_FEATURES = [col for col in CATEGORICAL_FEATURES if col != 'subscription']

### Split the target/feature columns

In [5]:
TARGET = "subscription"
X = df.drop(columns=[TARGET])

# Encode the label as 1 ("yes") / 0 ("no").
# Series.map turns every value that is missing from the mapping into NaN, so
# check the labels first and fail loudly instead of letting NaN targets reach
# the stratified split and the resampler.
TARGET_MAPPING = {"yes": 1, "no": 0}
unexpected_labels = set(df[TARGET].unique()) - set(TARGET_MAPPING)
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in '{TARGET}': {sorted(unexpected_labels, key=str)}"
    )
y = df[TARGET].map(TARGET_MAPPING)

### Train / Validation / Test Split (Stratified) --> 60/20/20 ratio

In [6]:
# Stage 1: hold out 20% of all rows as the test set, stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stage 2: carve the validation set out of the remaining 80%.
# 0.25 of that remainder is 0.25 * 0.8 = 0.2 of the full dataset,
# which leaves 60% of the rows for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42
)

# Report the absolute size of each split.
total_samples = len(X)
n_train, n_val, n_test = len(X_train), len(X_val), len(X_test)
print(f"Train: {n_train} samples")
print(f"Validation: {n_val} samples")
print(f"Test: {n_test} samples")

# Report each split as a fraction of the full dataset.
train_ratio = n_train / total_samples
val_ratio = n_val / total_samples
test_ratio = n_test / total_samples

print(f"Train / Validation / Test ratios: {train_ratio:.2f} / {val_ratio:.2f} / {test_ratio:.2f}")


Train: 18286 samples
Validation: 6096 samples
Test: 6096 samples
Train / Validation / Test ratios: 0.60 / 0.20 / 0.20


### Build Preprocessing Pipelines

In [7]:
# Numerical pipeline: a single standardisation step.
num_pipeline = Pipeline([("scaler", StandardScaler())])

In [8]:
# Categorical pipeline: one-hot encode into a dense (non-sparse) array.
# handle_unknown="ignore" means a category that was not seen during fit does
# not raise at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
cat_pipeline = Pipeline([("encoder", one_hot_encoder)])

In [9]:
# Route each feature group through its own pipeline and concatenate the results:
# numerical columns are scaled, categorical columns are one-hot encoded.
column_routes = [
    ("num", num_pipeline, NUMERICAL_FEATURES),
    ("cat", cat_pipeline, CATEGORICAL_FEATURES),
]
preprocessor = ColumnTransformer(transformers=column_routes)

#### Fit Preprocessor (TRAIN ONLY)

In [10]:
# Learn the scaling statistics and one-hot categories from the training split
# only, then apply that same fitted transformation to every split.
preprocessor.fit(X_train)
X_train_processed, X_val_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_val, X_test)
)


#### 🚨 Never fit on validation or test data

### Handle Class Imbalance (Train Only)

#### SMOTE + Tomek Links

In [11]:
# Rebalance the TRAINING split only: oversample with SMOTE, then clean the
# result with Tomek links. Validation and test data are never resampled.
# NOTE(review): SMOTE interpolates between rows, so the one-hot columns of the
# synthetic samples can take fractional values -- confirm this is acceptable,
# or consider SMOTENC on the un-encoded features.
smote = SMOTE(random_state=42)

# TomekLinks' default strategy cleans every class except the minority one.
# After SMOTE the classes are balanced, so there is no meaningful minority
# left and the class that gets cleaned would be decided by a tie-break.
# sampling_strategy="all" removes both members of each Tomek link, which is
# also what imblearn's own SMOTETomek combination uses by default.
tomek = TomekLinks(sampling_strategy="all")

# Step 1: oversample. Kept under its own name so the intermediate result is
# not overwritten by the cleaning step.
X_train_oversampled, y_train_oversampled = smote.fit_resample(
    X_train_processed, y_train
)

# Step 2: remove Tomek links from the oversampled training data.
X_train_resampled, y_train_resampled = tomek.fit_resample(
    X_train_oversampled, y_train_oversampled
)


## Save Artifacts

In [12]:
# Make sure the output folder exists, then persist the preprocessor so the
# same transformation can be reloaded later.
# NOTE: joblib files are pickles; only load them from trusted sources.
ARTIFACTS_DIR = Path("artifacts")
ARTIFACTS_DIR.mkdir(exist_ok=True)

preprocessor_path = ARTIFACTS_DIR / "preprocessor.pkl"
joblib.dump(preprocessor, preprocessor_path)

['artifacts/preprocessor.pkl']

#### Saving data splits


In [13]:
# Bundle the model-ready data into one mapping and persist it.
# Only the training entries come from the resampling step; validation and
# test hold the transformed (but not resampled) splits.
# NOTE(review): y_train is stored exactly as the resampler returned it, while
# y_val / y_test are converted with .values -- confirm downstream consumers
# handle both container types.
data_splits = dict(
    X_train=X_train_resampled,
    y_train=y_train_resampled,
    X_val=X_val_processed,
    y_val=y_val.values,
    X_test=X_test_processed,
    y_test=y_test.values,
)

joblib.dump(data_splits, ARTIFACTS_DIR / "data_splits.pkl")


['artifacts/data_splits.pkl']

In [16]:
# Inspect the raw (pre-transformation) feature columns of the training split.
X_train.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')

In [15]:
# Record the raw input column names in the order of the training frame.
# These are the pre-transformation names, not the scaled / one-hot encoded
# output columns of the processed arrays.
feature_names = list(X_train.columns)

# Write next to the other artifacts via ARTIFACTS_DIR instead of repeating the
# folder name as a hard-coded string.
joblib.dump(feature_names, ARTIFACTS_DIR / "feature_names.pkl")

['artifacts/feature_names.pkl']

In [17]:
# Display the saved list of raw feature names as a final sanity check.
feature_names

['age',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'emp.var.rate',
 'cons.price.idx',
 'cons.conf.idx',
 'euribor3m',
 'nr.employed']

# THE END