## Problem : What should be my saving target to buy laptop?

* You - $1000$
* Sameer - $900$
* Nitika - $950$
* Roshan - $1050$
* Srikanth - $1500

Decide to average out - Averaging

In [1]:
(1000+900+950+1050+1500)/5

1080.0

In [2]:
import pandas as pd
import numpy as np
from statistics import mode
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('credit_preped.csv')
df.head()

Unnamed: 0,customer_id,loan_duration_mo,loan_amount,payment_pcnt_income,time_in_residence,age_yrs,number_loans,dependents,bad_credit,checking_account_status_0 - 200 DM,...,home_ownership_own,home_ownership_rent,job_category_highly skilled,job_category_skilled,job_category_unemployed-unskilled-non-resident,job_category_unskilled-resident,telephone_none,telephone_yes,foreign_worker_no,foreign_worker_yes
0,1122334,6,1169,4,4,67,2,1,0,0,...,1,0,0,1,0,0,0,1,0,1
1,6156361,48,5951,2,2,22,1,1,1,1,...,1,0,0,1,0,0,1,0,0,1
2,2051359,12,2096,2,3,49,1,2,0,0,...,1,0,0,0,0,1,1,0,0,1
3,8740590,42,7882,2,4,45,1,2,0,0,...,0,0,0,1,0,0,1,0,0,1
4,3924540,24,4870,3,4,53,2,2,1,0,...,0,0,0,1,0,0,1,0,0,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 63 columns):
customer_id                                               1000 non-null int64
loan_duration_mo                                          1000 non-null int64
loan_amount                                               1000 non-null int64
payment_pcnt_income                                       1000 non-null int64
time_in_residence                                         1000 non-null int64
age_yrs                                                   1000 non-null int64
number_loans                                              1000 non-null int64
dependents                                                1000 non-null int64
bad_credit                                                1000 non-null int64
checking_account_status_0 - 200 DM                        1000 non-null int64
checking_account_status_< 0 DM                            1000 non-null int64
checking_account_status_> 200 DM or 

## We'll look at Ensembling for both classification (bad_credit) and regression (age_yrs)

Dropping customer id 

In [5]:
df.drop('customer_id',axis=1, inplace=True)

In [6]:
from sklearn.model_selection import KFold,cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score
from sklearn.preprocessing import RobustScaler

### Let's start by training individual models

### For classification following models will be used:

* SVC - Support vector classifier
* Logistic Reg
* KNN classifier

### For regression following models will be used:

* SVR - Support vector regressor
* Linear Reg
* KNN regressor

In [7]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# Classification models

rand_seed = 42
np.random.seed=rand_seed

log_cf = LogisticRegression(solver='lbfgs',random_state=rand_seed)
svc_cf = SVC(gamma='scale',random_state=rand_seed)
knn_cf = KNeighborsClassifier()

clf = [log_cf,svc_cf,knn_cf]

# Regression models

linear_reg = LinearRegression()
svr_reg = SVR(gamma='scale')
knn_reg = KNeighborsRegressor()


reg = [linear_reg,svr_reg,knn_reg]


## Functions to Standardize data, Split data into training/testing sets, get accuracy metrices, and train the models

In [8]:
# 1 - Standardize data

def standdata(df):
    scaler=RobustScaler()
    data=scaler.fit_transform(df)
    return data

# 2 - Split data into training/testing sets
def split_data(features,target_name=None):
    # get target column
    target = features[target_name]
    # Drop target value from data
    temp_data = features.drop(target_name,axis=1)
    temp_data = standdata(temp_data)
    
    # split
    x_train,x_val,y_train,y_val = train_test_split(temp_data,target,test_size=0.1)
    
    return (x_train,x_val,y_train,y_val)

# 3 - get accuracy metrics 
def get_reg(pred,actual):
    mae = mean_absolute_error(actual,pred)
    mse = mean_squared_error(actual,pred)
    return mae#, mse

def get_acc(pred,actual):
    return accuracy_score(actual,pred)*100

# 4 - Training the model

def train_model(model,features=None,target_name=None,nfolds=10,task='class'):
    # Getting target column
    target = features[target_name]
    # Dropping target
    temp = features.drop(target_name,axis=1)
    temp = standdata(temp)
    
    if task=='reg':
        score = -1 * cross_val_score(model,temp,target,scoring='neg_mean_absolute_error')
        print(f'\n Mean absolute error of {model} : {round(score[0],4)}')
        print('______________________________________\n\n')
    else:
        score = cross_val_score(model,temp,target,cv=nfolds,scoring='accuracy')
        print(f'Accuracy of {model} is {score[0]*100}%')
        print('______________________________________\n\n')

### Training Classification models

In [9]:
for model in clf:
    train_model(model,features=df,target_name='bad_credit')

Accuracy of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) is 82.0%
______________________________________


Accuracy of SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False) is 83.0%
______________________________________


Accuracy of KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform') is 74.0%
______________________________________




## Training Regression Models

In [10]:
for model in reg:
    train_model(model,features=df,target_name='age_yrs',task='reg')


 Mean absolute error of LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) : 7.6008
______________________________________



 Mean absolute error of SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) : 7.9741
______________________________________



 Mean absolute error of KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform') : 8.0677
______________________________________




# Simple Ensemble Techniques

### Averaging

In [11]:
x_train,x_val,y_train,y_val = split_data(df,target_name='age_yrs')

# fit base models
linear_reg.fit(x_train,y_train)
knn_reg.fit(x_train,y_train)
svr_reg.fit(x_train,y_train)

# make predictions
pred1 = linear_reg.predict(x_val)
pred2 = knn_reg.predict(x_val)
pred3 = svr_reg.predict(x_val)

average_pred = (pred1+pred2+pred3)/3

In [12]:
print('Linear Regression model')
print(get_reg(pred1,y_val))
print()
print('KNN Regression model')
print(get_reg(pred2,y_val))
print()
print('SVR Regression model')
print(get_reg(pred3,y_val))

print()

print('Average Model')
print(get_reg(average_pred,y_val))




Linear Regression model
7.00939453125

KNN Regression model
6.438000000000001

SVR Regression model
6.587693389526012

Average Model
6.450679066094097


## Weighted Average 

**Intuition** : Models with higher predictive powers should be given higher weights as compared to those with lower accuracy.

Weights are numbers between 0 and 1 totalling to 1.

In [13]:
linear_reg.fit(x_train,y_train)  # 0.25
knn_reg.fit(x_train,y_train)     # 0.25
svr_reg.fit(x_train,y_train)     # 0.5

pred1 = linear_reg.predict(x_val)
pred2 = knn_reg.predict(x_val)
pred3 = svr_reg.predict(x_val)


# Performing weighted average
w_avg = (0.15*pred1 + 0.15*pred2 + 0.7*pred3)

print('Linear Regression model')
print(get_reg(pred1,y_val))
print()
print('KNN Regression model')
print(get_reg(pred2,y_val))
print()
print('SVR Regression model')
print(get_reg(pred3,y_val))

print()

print('Weighted Average Model')
print(get_reg(w_avg,y_val))



Linear Regression model
7.00939453125

KNN Regression model
6.438000000000001

SVR Regression model
6.587693389526012

Weighted Average Model
6.498042413788097


## Max / Majority Voting

Max voting is similar to averaging but it applies to classification problems

In [14]:
x_train,x_val,y_train,y_val = split_data(df,target_name='bad_credit')

# fitting classification models
log_cf.fit(x_train,y_train)
knn_cf.fit(x_train,y_train)
svc_cf.fit(x_train,y_train)

# Predictions
pred1=log_cf.predict(x_val)
pred2=knn_cf.predict(x_val)
pred3=svc_cf.predict(x_val)

# maxvoting list

maxvote_pred = []

for i in range(0,len(x_val)):
    maxvote_pred.append(mode([pred1[i],pred3[i],pred3[i]]))
    
print('Logistic Regression Model')
print(get_acc(pred1,y_val))

print('KNN Model')
print(get_acc(pred2,y_val))

print('Support Vector Classifier Model')
print(get_acc(pred3,y_val))

print('Max Voting Model')
print(get_acc(np.array(maxvote_pred),y_val))

Logistic Regression Model
74.0
KNN Model
79.0
Support Vector Classifier Model
78.0
Max Voting Model
78.0


### Sklearn implementation

In [15]:
from sklearn.ensemble import VotingClassifier

max_model=VotingClassifier(estimators=[('Logistic',log_cf),('knn',knn_cf),('svc',svc_cf)],voting='hard')

max_model.fit(x_train,y_train)

print('Max voting using sklearn.ensemble.VotingClassifier')
print(get_acc(max_model.predict(x_val),y_val))

Max voting using sklearn.ensemble.VotingClassifier
78.0


## Advanced Techniques such as Boosting, Stacking, Bagging

## Bootstrap Aggregating

In [16]:
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,BaggingRegressor
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,BaggingClassifier

# Reg Models
rf_reg = RandomForestRegressor(n_estimators=100,random_state=rand_seed)
ex_reg = ExtraTreesRegressor(n_estimators=100,random_state=rand_seed)
bag_reg = BaggingRegressor(svr_reg,n_estimators=10,random_state=rand_seed)

# Clf Models
rf_clf = RandomForestClassifier(n_estimators=100,random_state=rand_seed)
ex_clf = ExtraTreesClassifier(n_estimators=100,random_state=rand_seed)
bag_clf = BaggingClassifier(svc_cf,n_estimators=10,random_state=rand_seed)

In [17]:
x_train,x_val,y_train,y_val = split_data(df,target_name='age_yrs')

rf_reg.fit(x_train,y_train)
ex_reg.fit(x_train,y_train)
bag_reg.fit(x_train,y_train)

print('MAE Random Forest Regressor : ',get_reg(rf_reg.predict(x_val),y_val))
print('MAE Extra Trees Regressor : ',get_reg(ex_reg.predict(x_val),y_val))
print('MAE Bagging Meta Regressor : ',get_reg(bag_reg.predict(x_val),y_val))

MAE Random Forest Regressor :  6.8039
MAE Extra Trees Regressor :  7.713900000000001
MAE Bagging Meta Regressor :  6.39210422863734


## Classification

In [18]:
x_train,x_val,y_train,y_val = split_data(df,target_name='bad_credit')

rf_clf.fit(x_train,y_train)
ex_clf.fit(x_train,y_train)
bag_clf.fit(x_train,y_train)

# print accuracy score for each classifier

print('Accuracy Score for Random Forest Classifier : ',get_acc(rf_clf.predict(x_val),y_val))
print('Accuracy Score for Extra Tree Classifier : ',get_acc(ex_clf.predict(x_val),y_val))
print('Accuracy Score for Bagging Meta Classifier : ',get_acc(bag_clf.predict(x_val),y_val))

Accuracy Score for Random Forest Classifier :  73.0
Accuracy Score for Extra Tree Classifier :  72.0
Accuracy Score for Bagging Meta Classifier :  74.0


# Boosting

In [19]:
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor

from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier

ada_reg = AdaBoostRegressor(base_estimator=svr_reg,n_estimators=100,random_state=rand_seed)
gb_reg = GradientBoostingRegressor(n_estimators=100,random_state=rand_seed)

ada_clf = AdaBoostClassifier(base_estimator=log_cf,random_state=rand_seed)
gb_clf = GradientBoostingClassifier(random_state=rand_seed)

In [20]:
x_train,x_val,y_train,y_val = split_data(df,target_name='age_yrs')

ada_reg.fit(x_train,y_train)
gb_reg.fit(x_train,y_train)

print('MAE of Adaboost Regressor : ', get_reg(ada_reg.predict(x_val),y_val))
print('MAE of Gradient Boosting Regressor : ', get_reg(gb_reg.predict(x_val),y_val))

x_train,x_val,y_train,y_val = split_data(df,target_name='bad_credit')

ada_clf.fit(x_train,y_train)
gb_clf.fit(x_train,y_train)

print('\n\nAccuracy of Adaboost Classifier : ', get_acc(ada_clf.predict(x_val),y_val))
print('Accuracy of Gradient Boosting Classifier : ', get_acc(gb_clf.predict(x_val),y_val))

MAE of Adaboost Regressor :  8.095924790351798
MAE of Gradient Boosting Regressor :  7.933886043908278


Accuracy of Adaboost Classifier :  83.0
Accuracy of Gradient Boosting Classifier :  79.0


# Stacking

First Level Learners > Use predictions from base learners to create new training data > Feed the new training data into another algo called meta-learner.

## Procedure
1. Split total training set into two disjoint sets - Training / Test
2. Train several base models on first part 
3. Test base models on second part
4. Use predictions from step 3 as inputs and correct responses as o/p to train a higher lever learner which is also called meta-learner.


<img src='https://miro.medium.com/max/1318/1*9uCwjY5uRkRrX2VNST7R0w.gif'>

In [21]:
from sklearn.model_selection import KFold

def stackingModel(base_models,meta_model,features,target,nfolds=10):
    
    # Split data into folds
    kfold = KFold(n_splits=nfolds,shuffle=True,random_state=rand_seed)
    
    # Initialize an array to hold predictions
    test_pred = np.zeros((features.shape[0],len(base_models)))
    train_pred = np.zeros((features.shape[0],len(base_models)))

    # Train base model or 1st level learners
    for i, model in enumerate(base_models):
        for train_index, test_index in kfold.split(features, target):
            # Fit train data on model
            model.fit(np.array(features)[train_index],np.array(target)[train_index])
            
            # Make predictions on the holdout data
            y_pred = model.predict(np.array(features)[test_index])
            
            # Making predictions on the train data
            t_pred = model.predict(np.array(features)[train_index])
            
            # Appending the predictions
            test_pred[test_index,i] = y_pred
            train_pred[train_index,i] = t_pred
            
            
    # Now, let's train the meta-model
    meta_model.fit(train_pred, target)
    
    # Make final predictions
    final_pred = meta_model.predict(np.mean([test_pred],axis=0))
    
    return final_pred

In [22]:
# Regression

target = df['age_yrs']
data = df.drop('age_yrs',axis=1)
data = standdata(data)

# Base learners
base_learners = [linear_reg,svr_reg,knn_reg]

# Meta Learner
meta_learner = svr_reg

pred = stackingModel(base_learners,meta_learner, data, target)

print('MAE of Stacked Model : ', get_reg(pred,target))

MAE of Stacked Model :  7.109825798393478


In [23]:
# Classification

target = df['bad_credit']
data = df.drop('bad_credit',axis=1)
data = standdata(data)

# Base learners
base_learners = [log_cf,knn_cf,svc_cf]

# Meta Learner
meta_learner = svc_cf

pred = stackingModel(base_learners,meta_learner, data, target)

print('Accuracy Score of Stacked Model : ', get_acc(pred,target))

Accuracy Score of Stacked Model :  76.6


# Vectstack
https://github.com/vecxoz/vecstack