In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, OrdinalEncoder, PowerTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Pipeline
- Encode the label variable (`y`) as numerical values so that machine-learning
models can be trained on it
- Encode the month, day_of_week, education, housing, loan and default attributes using the
OrdinalEncoder class
- Encode the marital, poutcome, contact and job attributes using the OneHotEncoder class
- Transform the duration attribute using the PowerTransformer and MinMaxScaler classes
- Scale the age, cons.price.idx, cons.conf.idx and nr.employed attributes using the MinMaxScaler class
- Use RandomForestClassifier() for modeling

In [2]:
class BankPipeline:
    """Preprocessing + RandomForest classification pipeline for the bank data.

    On construction the target column ``y`` is label-encoded and the data is
    split into train and test partitions. ``train`` fits the pipeline on the
    training partition, ``predict`` returns class predictions, and ``evaluate``
    reports accuracy on the held-out test partition.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. Must contain a ``y`` column (the target) plus the feature
        columns named in ``build_pipeline``. The frame is copied, so the
        caller's object is never modified.
    test_size : float, default 0.2
        Fraction of rows held out for testing (forwarded to
        ``train_test_split``).
    random_state : int, default 42
        Seed used for both the train/test split and the random forest.

    Attributes
    ----------
    df : private copy of ``data`` with an added ``y_encoded`` column.
    X, y : feature frame and encoded target.
    X_train, X_test, y_train, y_test : the train/test partitions.
    pipeline : the sklearn ``Pipeline``; ``None`` until built.
    label_encoder : the ``LabelEncoder`` fitted on ``y``.
    """

    def __init__(self, data: "pd.DataFrame", test_size: float = 0.2,
                 random_state: int = 42) -> None:
        # Work on a copy: _prepare_data adds a column, and doing that on the
        # caller's frame would silently change a DataFrame the notebook has
        # already displayed.
        self.df = data.copy()
        self.test_size = test_size
        self.random_state = random_state
        self.pipeline = None
        self.label_encoder = LabelEncoder()
        self._prepare_data()

    def _prepare_data(self) -> None:
        """Encode the target and create the train/test partitions.

        Raises
        ------
        KeyError
            If the input frame has no ``y`` column.
        """
        if 'y' not in self.df.columns:
            raise KeyError("input data must contain a 'y' target column")

        self.df['y_encoded'] = self.label_encoder.fit_transform(self.df['y'])
        # Both the raw and the encoded target are removed from the features.
        self.X = self.df.drop(columns=['y', 'y_encoded'])
        self.y = self.df['y_encoded']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=self.test_size,
            random_state=self.random_state)

    def build_pipeline(self) -> None:
        """Assemble the preprocessing steps and classifier into ``self.pipeline``."""
        # Encode the month, day_of_week, education, housing, loan and default
        # attributes using the OrdinalEncoder class. Categories that were not
        # present when fitting are mapped to -1 instead of raising, so that
        # rare levels landing only in the test split (or in new data) do not
        # break predict().
        ordinal_features = ['month', 'day_of_week', 'education', 'housing', 'loan', 'default']
        ordinal_transformer = OrdinalEncoder(
            handle_unknown='use_encoded_value', unknown_value=-1)

        # Encode the marital, poutcome, contact and job attributes using the
        # OneHotEncoder class. Unseen categories become an all-zero row.
        onehot_features = ['marital', 'poutcome', 'contact', 'job']
        onehot_transformer = OneHotEncoder(handle_unknown='ignore')

        # Transform the duration attribute using the PowerTransformer and
        # MinMaxScaler classes, applied in that order.
        duration_transformer = Pipeline(steps=[
            ('power', PowerTransformer(method='yeo-johnson')),
            ('scaler', MinMaxScaler())])

        # Scale the age, cons.price.idx, cons.conf.idx and nr.employed
        # attributes using the MinMaxScaler class.
        scale_features = ['age', 'cons.price.idx', 'cons.conf.idx', 'nr.employed']
        scale_transformer = MinMaxScaler()

        # Combine; columns not listed above are passed through unchanged.
        preprocessor = ColumnTransformer(
            transformers=[
                ('ord', ordinal_transformer, ordinal_features),
                ('onehot', onehot_transformer, onehot_features),
                ('duration', duration_transformer, ['duration']),
                ('scale', scale_transformer, scale_features)],
            remainder='passthrough'
        )

        # Create the pipeline
        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(random_state=self.random_state))])

    def train(self) -> None:
        """Fit the pipeline on the training partition, building it first if needed."""
        # Explicit None check: truthiness of a Pipeline object depends on its
        # length, which is not what is being asked here.
        if self.pipeline is None:
            self.build_pipeline()
        self.pipeline.fit(self.X_train, self.y_train)

    def predict(self, X=None):
        """Return predictions for ``X``, or for the held-out test set if ``X`` is None.

        Raises
        ------
        RuntimeError
            If the pipeline has not been built yet (``train`` was never called).
        """
        if self.pipeline is None:
            raise RuntimeError(
                "pipeline has not been built; call train() before predict()")
        if X is None:
            X = self.X_test
        return self.pipeline.predict(X)

    def evaluate(self):
        """Return the accuracy of the pipeline's predictions on the test partition."""
        y_pred = self.predict()
        return accuracy_score(self.y_test, y_pred)

# Load

In [3]:
# Read the bank CSV; the file uses ';' as its field delimiter, hence sep=';'.
data = pd.read_csv('../data/midterm1-data/bank-additional-full.csv', sep=';')
# Show the first rows as a quick sanity check of the parsed columns.
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
# Print each column's dtype and non-null count to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

# Train

In [5]:
# Constructing BankPipeline encodes the target and makes the train/test split.
bank_model = BankPipeline(data)
# Fit the preprocessing + RandomForest pipeline on the training partition.
bank_model.train()
# Accuracy is measured on the held-out test partition.
accuracy = bank_model.evaluate()
print(f"Accuracy: {accuracy:.6f}")

Accuracy: 0.912843
