# 3. Preparing features in a form suitable for modeling

##### Importing necessary libraries:

In [1]:
import pandas as pd
#import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

##### Loading dataset:

In [2]:
# Read the semicolon-separated source file into a DataFrame.
data = pd.read_csv('./data/data.csv', sep=';')

##### Split this dataset into features and label sets:

In [3]:
# Features: every column except the target 'y'.
X = data.drop(columns=['y'])
# Label: the target column on its own.
y = data['y']

##### Converting all categorical features into dummy variables:

In [4]:
# Columns stored with object dtype are treated as the categorical features.
categorical_features = X.select_dtypes(include=['object']).columns
# One-hot encode them; drop_first=True omits the first level of each feature.
X_num = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [5]:
# Inspect the encoded frame: column names, non-null counts and dtypes.
X_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 47 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41176 non-null  int64  
 1   emp.var.rate                   41176 non-null  float64
 2   cons.price.idx                 41176 non-null  float64
 3   cons.conf.idx                  41176 non-null  float64
 4   euribor3m                      41176 non-null  float64
 5   nr.employed                    41176 non-null  float64
 6   age_band                       41176 non-null  int64  
 7   pdays_999                      41176 non-null  int64  
 8   previous_0                     41176 non-null  int64  
 9   campaign_band                  41176 non-null  int64  
 10  job_blue-collar                41176 non-null  uint8  
 11  job_entrepreneur               41176 non-null  uint8  
 12  job_housemaid                  41176 non-null 

##### Scale the features (all columns, including the dummy variables):

In [6]:
# Standardize with StandardScaler. fit_transform is applied to the whole of
# X_num, so the dummy columns are scaled along with the continuous ones.
# NOTE(review): the scaler is fitted on every row; if a train/test split
# happens later, consider fitting on the training rows only -- confirm.
scaler = StandardScaler()
# Carry over X_num's index as well as its column names so that a later
# index-aligned operation (e.g. pd.concat with y) pairs rows correctly
# even if the data ever has a non-default index.
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_num),
    columns=X_num.columns,
    index=X_num.index,
)
X_scaled.head()

Unnamed: 0,age,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,age_band,pdays_999,previous_0,campaign_band,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,1.533143,0.648101,0.722628,0.886568,0.712463,0.331695,1.439991,0.195445,0.397773,-0.873212,...,1.411,-0.332541,-0.133123,-0.118479,1.95893,-0.514487,-0.494332,-0.496157,0.397773,-0.185728
1,1.629107,0.648101,0.722628,0.886568,0.712463,0.331695,1.439991,0.195445,0.397773,-0.873212,...,1.411,-0.332541,-0.133123,-0.118479,1.95893,-0.514487,-0.494332,-0.496157,0.397773,-0.185728
2,-0.290177,0.648101,0.722628,0.886568,0.712463,0.331695,-0.529944,0.195445,0.397773,-0.873212,...,1.411,-0.332541,-0.133123,-0.118479,1.95893,-0.514487,-0.494332,-0.496157,0.397773,-0.185728
3,-0.002284,0.648101,0.722628,0.886568,0.712463,0.331695,0.455024,0.195445,0.397773,-0.873212,...,1.411,-0.332541,-0.133123,-0.118479,1.95893,-0.514487,-0.494332,-0.496157,0.397773,-0.185728
4,1.533143,0.648101,0.722628,0.886568,0.712463,0.331695,1.439991,0.195445,0.397773,-0.873212,...,1.411,-0.332541,-0.133123,-0.118479,1.95893,-0.514487,-0.494332,-0.496157,0.397773,-0.185728


In [7]:
# Re-inspect column dtypes and non-null counts after scaling.
X_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41176 entries, 0 to 41175
Data columns (total 47 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            41176 non-null  float64
 1   emp.var.rate                   41176 non-null  float64
 2   cons.price.idx                 41176 non-null  float64
 3   cons.conf.idx                  41176 non-null  float64
 4   euribor3m                      41176 non-null  float64
 5   nr.employed                    41176 non-null  float64
 6   age_band                       41176 non-null  float64
 7   pdays_999                      41176 non-null  float64
 8   previous_0                     41176 non-null  float64
 9   campaign_band                  41176 non-null  float64
 10  job_blue-collar                41176 non-null  float64
 11  job_entrepreneur               41176 non-null  float64
 12  job_housemaid                  41176 non-null 

##### Merge the X_scaled and y sets into one DataFrame:

In [8]:
# Place the label column next to the scaled features (column-wise concat,
# rows are matched on the index).
data_scaled = pd.concat(objs=[X_scaled, y], axis='columns')

##### Save preprocessed dataset

In [9]:
# Write the preprocessed frame as a semicolon-separated file, without the index.
data_scaled.to_csv(
    path_or_buf='./data/data_scaled.csv',
    sep=';',
    index=False,
)