# Bagging, boosting and Stacking

## Imports

In [144]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [129]:
df = pd.read_csv('D:/data/csv/loanCleansed.csv')
df['Gender'].fillna('Male', inplace=True)
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128,360,1,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128,360,1,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66,360,1,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120,360,1,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141,360,1,Urban,Y


In [130]:
cols = ['Feature']
res = pd.DataFrame(df.dtypes, columns=cols)
nulls = pd.DataFrame(df.isnull().sum())
res['Nulls'] = nulls[0]
res

Unnamed: 0,Feature,Nulls
Loan_ID,object,0
Gender,object,0
Married,object,0
Dependents,int64,0
Education,object,0
Self_Employed,object,0
ApplicantIncome,int64,0
CoapplicantIncome,float64,0
LoanAmount,int64,0
Loan_Amount_Term,int64,0


In [131]:
df = pd.get_dummies(df)
df = df.drop('Loan_Status_N',axis=1)
df = df.rename(columns={'Loan_Status_Y': 'Loan_Status'})

train, test = train_test_split(df, test_size=0.3, random_state=0)
print('train : ', train.shape)
print('test : ', test.shape)

x_train = train.drop('Loan_Status',axis=1)
y_train = train['Loan_Status']

x_test = test.drop('Loan_Status',axis=1)
y_test = test['Loan_Status']

print('x_train : ', x_train.shape)
print('y_train : ', y_train.shape)
print('x_test  : ', x_test.shape)
print('y_test  : ', y_test.shape)
x_test.head()

train :  (429, 632)
test :  (185, 632)
x_train :  (429, 631)
y_train :  (429,)
x_test  :  (185, 631)
y_test  :  (185,)


Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_ID_LP001002,Loan_ID_LP001003,Loan_ID_LP001005,Loan_ID_LP001006,...,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
454,0,7085,0.0,84,360,1,0,0,0,0,...,1,1,0,1,0,0,1,0,1,0
52,0,4230,0.0,112,360,1,0,0,0,0,...,0,1,0,1,0,1,0,0,1,0
536,0,6133,3906.0,324,360,1,0,0,0,0,...,1,0,1,1,0,1,0,0,0,1
469,0,4333,2451.0,110,360,1,0,0,0,0,...,1,0,1,1,0,1,0,0,0,1
55,2,2708,1167.0,97,360,1,0,0,0,0,...,1,0,1,1,0,1,0,0,1,0


In [134]:
model1 = LogisticRegression(random_state=1, solver='liblinear')
model2 = DecisionTreeClassifier(random_state=1)
model = VotingClassifier(estimators=[('lr', model1), ('dt', model2)], voting='hard')

model.fit(x_train,y_train)
model.score(x_test,y_test)

0.7675675675675676

In [150]:
def Stacking(model, train, y, test, n_fold):
    folds = StratifiedKFold(n_splits=n_fold, random_state=1)
    test_pred = np.empty((test.shape[0],1),float)
    train_pred = np.empty((0,1),float)
    for train_indices,val_indices in folds.split(train,y.values):
        x_train,x_val = train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val = y.iloc[train_indices],y.iloc[val_indices]

        model.fit(X=x_train,y=y_train)
        train_pred = np.append(train_pred,model.predict(x_val))
        test_pred = np.append(test_pred,model.predict(test))
    return test_pred.reshape(-1,1),train_pred

# Now we’ll create two base models – decision tree and knn.

model1 = DecisionTreeClassifier(random_state=1)
test_pred1, train_pred1 = Stacking(model=model1, n_fold=10, train=x_train, test=x_test, y=y_train)
train_pred1 = pd.DataFrame(train_pred1)
test_pred1 = pd.DataFrame(test_pred1)

model2 = KNeighborsClassifier()
test_pred2 ,train_pred2=Stacking(model=model2, n_fold=10, train=x_train, test=x_test, y=y_train)
train_pred2 = pd.DataFrame(train_pred2)
test_pred2 = pd.DataFrame(test_pred2)

# Create a third model, logistic regression, on the predictions of the decision tree and knn models.

dfc = pd.concat([train_pred1, train_pred2], axis=1)
df_test = pd.concat([test_pred1, test_pred2], axis=1)

model = LogisticRegression(random_state=1, solver='liblinear')
#model.fit(dfc, y_train)
#model.score(df_test, y_test)


## Credits & Links

https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/