In [1]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# from feature-engine
from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import preprocessing_utils as pp

import warnings
warnings.filterwarnings("ignore")

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Import data
df = pd.read_csv("bank-additional-full.csv", sep=";")

In [3]:
# Change target variable to binary
df.loc[:, "y"] = df["y"].map(lambda x: 1 if x=="yes" else 0)

In [4]:
# Prepare train and test set

X_train, X_test, y_train, y_test = train_test_split(
    df,
    df["y"], 
    test_size=0.1,
    random_state=2022
)

In [5]:
# Define the mapping for education feature
education_mappings = {"university.degree": 6, 
                      "professional.course": 5,
                      "high.school": 4,
                      "basic.9y": 3,
                      "basic.6y": 2,
                      "basic.4y": 1,
                      "illiterate": 0,
                      "unknown": np.NaN}

In [6]:
# Define categorical and numerical variables
categorical_variables = [col for col in df.columns if df[col].dtype=="O"]
numerical_variables = list(set(df.columns) - set(categorical_variables) - set("y"))
features_to_drop = ["duration", "emp.var.rate", "euribor3m"]

In [7]:
categorical_variables

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome']

In [8]:
numerical_variables

['age',
 'nr.employed',
 'previous',
 'cons.conf.idx',
 'cons.price.idx',
 'euribor3m',
 'duration',
 'campaign',
 'emp.var.rate',
 'pdays']

In [25]:
# Define preprocessing pipeline

marketing_pipe = Pipeline([

    # Change categorical variable "education" to ordinal
    ('mapper_education', pp.Mapper(
        variables=["education"], mappings=education_mappings)),

    # Replace 999 with np.NaN to fill missing values
    ("missing_adding", pp.Missing_Adding(
        variables = ["pdays"]
    )),

    # Impute numerical variables with the median
    ('median_imputation', MeanMedianImputer(
        imputation_method='median', variables=numerical_variables
    )),
    
    # Drop features as per data analysis
    ('drop_features', DropFeatures(features_to_drop=features_to_drop)),

    # Add to values 0.01 to enable logarithm calculation
    ('non_zero', pp.NonZero(variables=["previous"])),

    # Replace feature "previous" with it's logarithm
    ('log', LogTransformer(variables=["previous"])),  

    # one hot encode categorical features
    ('one_hot_encoder', OneHotEncoder(
        drop_last=True)),
        
    # scale
    ('scaler', StandardScaler()),
])

In [26]:
# train the pipeline
marketing_pipe.fit(X_train, y_train)

In [28]:
X_train = marketing_pipe.transform(X_train)
X_test = marketing_pipe.transform(X_test)

In [33]:
X_train.shape

(37069, 45)

In [None]:
# Fetures selection - Lasso