## Feature Engineering
The goal of this notebook is to perform the feature engineering identified as necessary during data exploration and to prepare the data for modelling.
I will perform the following steps:
1. Features Engineering
2. Features Selection
3. Balancing dataset

In [14]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

# from feature-engine
from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.transformation import LogTransformer

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import preprocessing_utils as pp

import warnings
warnings.filterwarnings("ignore")

### 1. Features Engineering

In [15]:
# Load the raw data; the CSV file uses ";" as the field separator
RAW_DATA_PATH = "bank-additional-full.csv"
df = pd.read_csv(RAW_DATA_PATH, sep=";")

In [16]:
# Encode the target as a binary integer: 1 for "yes", 0 for any other value.
# The whole column is assigned (df["y"] = ...) instead of df.loc[:, "y"] = ...:
# since pandas 2.0 the .loc form first tries to write into the existing column
# in place, which can leave "y" as an object-dtype column rather than int64.
df["y"] = df["y"].eq("yes").astype("int64")

In [17]:
# Separate the features from the target and hold out 10% of the rows as a test set.
# To keep things simple and quick, a single train/test split is used here
# (no separate validation set and no cross-validation).
X = df.drop(columns="y")
y = df["y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2022)

In [18]:
# Ordinal encoding for the "education" feature: a higher value means a higher level
# of education. "unknown" is mapped to NaN so that it is handled as a missing value.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
education_mappings = {
    "university.degree": 6,
    "professional.course": 5,
    "high.school": 4,
    "basic.9y": 3,
    "basic.6y": 2,
    "basic.4y": 1,
    "illiterate": 0,
    "unknown": np.nan,
}

In [19]:
# Split the columns into categorical and numerical groups.
# "education" is left out of the categorical list because it is mapped to ordinal
# numbers and therefore handled as a numerical feature. The target "y" belongs to
# neither group, whatever dtype it currently has.
categorical_variables = [
    col for col in df.columns
    if df[col].dtype == "O" and col not in ("education", "y")
]
# Built in column order so the list is deterministic between runs. The previous
# `set("y")` created a set of *characters* and only worked because the target
# name happens to be a single character.
numerical_variables = [
    col for col in df.columns
    if col not in categorical_variables and col != "y"
]
# Columns removed by the preprocessing pipeline
features_to_drop = ["duration", "emp.var.rate", "euribor3m"]

In [20]:
# Preprocessing pipeline; the steps run in the order listed below
preprocessing_steps = [
    # Turn the categorical "education" feature into an ordinal one
    ("mapper_education", pp.Mapper(variables=["education"], mappings=education_mappings)),
    # Replace the value 999 in "pdays" with NaN so it is treated as missing
    ("missing_adding", pp.Missing_Adding(variables=["pdays"])),
    # Fill missing values of the numerical variables with the median
    ("median_imputation", MeanMedianImputer(imputation_method="median", variables=numerical_variables)),
    # Remove the features that the data analysis flagged for dropping
    ("drop_features", DropFeatures(features_to_drop=features_to_drop)),
    # Add 0.01 to "previous" so that its logarithm can be computed
    ("non_zero", pp.NonZero(variables=["previous"])),
    # Replace "previous" with its logarithm
    ("log", LogTransformer(variables=["previous"])),
    # One-hot encode the categorical features
    ("one_hot_encoder", OneHotEncoder(drop_last=True)),
]

marketing_pipe = Pipeline(steps=preprocessing_steps)

In [21]:
# Fit every pipeline step on the training split only
marketing_pipe.fit(X=X_train, y=y_train)

In [22]:
# Apply the fitted pipeline to both splits.
# X_train / X_test are rebound to the transformed frames, so re-running this cell
# without re-running the split would push already-transformed data through the pipeline.
X_train, X_test = (marketing_pipe.transform(split) for split in (X_train, X_test))

In [23]:
# Show (rows, columns) of the transformed training features
X_train.shape

(37069, 44)

### 2. Features selection
I will use the Lasso technique to select the best features for the model.

In [24]:
# Lasso-based selector: SelectFromModel fits the estimator and decides which features to keep.
# NOTE(review): the features are not yet standardized here (scaling happens in a later
# cell), and an L1 penalty is sensitive to feature scale — confirm this is intended.
lasso = Lasso(alpha=0.0005, random_state=0)
selection = SelectFromModel(estimator=lasso)

# Fit the selector on the training data
selection.fit(X_train, y_train)

In [25]:
# Report how many features the Lasso-based selector kept
support_mask = selection.get_support()
selected_feats = X_train.columns[support_mask]
n_zero_coefficients = (selection.estimator_.coef_ == 0).sum()

print(f"total features: {X_train.shape[1]}")
print(f"selected features: {len(selected_feats)}")
print(f"features with coefficients shrank to zero: {n_zero_coefficients}")

total features: 44
selected features: 27
features with coefficients shrank to zero: 17


In [26]:
# Display the names of the features kept by the selector
selected_feats

Index(['age', 'education', 'campaign', 'pdays', 'previous', 'cons.price.idx',
       'cons.conf.idx', 'nr.employed', 'job_blue-collar', 'job_admin.',
       'job_services', 'job_student', 'job_retired', 'marital_single',
       'default_unknown', 'default_no', 'contact_telephone', 'month_may',
       'month_aug', 'month_jul', 'month_nov', 'month_mar', 'month_sep',
       'month_jun', 'day_of_week_mon', 'day_of_week_fri', 'poutcome_failure'],
      dtype='object')

In [27]:
# Preview five randomly sampled training rows, restricted to the selected columns
X_train.loc[:, selection.get_support()].sample(n=5)

Unnamed: 0,age,education,campaign,pdays,previous,cons.price.idx,cons.conf.idx,nr.employed,job_blue-collar,job_admin.,...,month_may,month_aug,month_jul,month_nov,month_mar,month_sep,month_jun,day_of_week_mon,day_of_week_fri,poutcome_failure
7612,37,4.0,2,6.0,-4.60517,93.994,-36.4,5191.0,0,1,...,1,0,0,0,0,0,0,0,1,0
18850,48,3.0,1,6.0,-4.60517,93.444,-36.1,5228.1,1,0,...,0,1,0,0,0,0,0,1,0,0
29994,65,1.0,1,6.0,-4.60517,93.075,-47.1,5099.1,0,0,...,0,0,0,0,0,0,0,0,0,0
11194,45,1.0,1,6.0,-4.60517,94.465,-41.8,5228.1,1,0,...,0,0,0,0,0,0,1,0,0,0
7983,33,6.0,2,6.0,-4.60517,94.465,-41.8,5228.1,0,0,...,0,0,0,0,0,0,1,1,0,0


In [28]:
# Keep only the columns retained by the selector, using the same mask for both splits
support_mask = selection.get_support()
X_train = X_train.loc[:, support_mask]
X_test = X_test.loc[:, support_mask]

In [29]:
# Side output for further analysis: the preprocessed rows of the customers
# who subscribed to the product (y == 1), taken from both splits.
df_train = pd.concat([X_train, y_train], axis="columns")
df_test = pd.concat([X_test, y_test], axis="columns")
data_preprocessed = pd.concat([df_train, df_test], axis="index")

df_sub = data_preprocessed.loc[data_preprocessed["y"] == 1]
df_sub.to_csv("df_subscribed.csv")
print(df_sub.shape)

(4640, 28)


In [30]:
# Standardize every feature so that distance-based models such as KNN can be used as well.
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
# Report the final dimensions of both feature matrices
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    sep="\n",
)

Shape of X_train: (37069, 27)
Shape of X_test: (4119, 27)


#### Summary
After using a Lasso model for feature selection, the following features were chosen as the most significant for modelling: 'age', 'education', 'campaign', 'pdays', 'previous', 'cons.price.idx', 'cons.conf.idx', 'nr.employed', 'job_blue-collar', 'job_admin.', 'job_services', 'job_student', 'job_retired', 'marital_single', 'default_unknown', 'default_no', 'contact_telephone', 'month_may', 'month_aug', 'month_jul', 'month_nov', 'month_mar', 'month_sep', 'month_jun', 'day_of_week_mon', 'day_of_week_fri', 'poutcome_failure'.

### 3. Dataset balancing
I will use the SMOTE algorithm to balance the dataset. This technique synthesizes new examples of the minority class. SMOTE works by selecting examples that are close to each other in the feature space, drawing a line between them, and creating a new sample at a point along that line.

In [32]:
# Oversample the minority class of the training split with SMOTE.
# Only the training data is resampled; the random state is fixed at 2022.
from imblearn.over_sampling import SMOTE

oversample = SMOTE(random_state=2022)
resampled = oversample.fit_resample(X_train, y_train)
X_train_oversample, y_train_oversample = resampled

In [33]:
# Class counts in the training target before oversampling
# (1 = customer subscribed to the product, 0 = did not subscribe)
y_train.value_counts()

0    32913
1     4156
Name: y, dtype: int64

In [34]:
# Class counts in the training target after oversampling
# (1 = customer subscribed to the product, 0 = did not subscribe)
y_train_oversample.value_counts()

0    32913
1    32913
Name: y, dtype: int64

In [35]:
# Save the training and testing data as CSV files without a header row, keeping the index.
# The files are written in the order listed.
outputs = {
    "X_train_oversample.csv": pd.DataFrame(X_train_oversample),
    "X_train.csv": pd.DataFrame(X_train),
    "X_test.csv": pd.DataFrame(X_test),
    "y_trainoversample.csv": pd.Series(y_train_oversample),
    "y_train.csv": pd.Series(y_train),
    "y_test.csv": pd.Series(y_test),
}
for filename, data in outputs.items():
    data.to_csv(filename, header=None, index=True)
