# Bagging Example

In [1]:
import pandas as pd

# Source data: "divorce.csv", downloaded from Kaggle and expected next to this notebook.
df = pd.read_csv("divorce.csv")
# Preview the first five rows to confirm the file was parsed as expected.
df.head(n=5)

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q46,Q47,Q48,Q49,Q50,Q51,Q52,Q53,Q54,Divorce
0,2,2,4,1,0,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,3,4,4,4,4,2,2,1
2,2,2,2,2,1,3,2,1,1,2,...,3,2,3,1,1,1,2,2,2,1
3,3,2,3,2,3,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,2,2,1,1,1,1,0,0,0,0,...,2,1,2,3,2,2,2,1,0,1


In [2]:
# Per-column summary statistics: used to look for outliers and to decide whether the features need scaling.
df.describe()
# In the columns displayed, every Q* feature has min 0 and max 4, so the features already
# share one scale and nothing stands out as an outlier; no preprocessing is applied.

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q46,Q47,Q48,Q49,Q50,Q51,Q52,Q53,Q54,Divorce
count,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,...,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0
mean,1.776471,1.652941,1.764706,1.482353,1.541176,0.747059,0.494118,1.452941,1.458824,1.576471,...,2.552941,2.270588,2.741176,2.382353,2.429412,2.476471,2.517647,2.241176,2.011765,0.494118
std,1.627257,1.468654,1.415444,1.504327,1.632169,0.904046,0.898698,1.546371,1.557976,1.421529,...,1.371786,1.586841,1.137348,1.511587,1.40509,1.260238,1.476537,1.505634,1.667611,0.501442
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0
50%,2.0,2.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,...,3.0,2.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,0.0
75%,3.0,3.0,3.0,3.0,3.0,1.0,1.0,3.0,3.0,3.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0
max,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0


In [3]:
# Class balance of the target column: the two outcomes should be close to an even split.
df["Divorce"].value_counts()

0    86
1    84
Name: Divorce, dtype: int64

In [4]:
# Target: the "Divorce" label column.
y = df["Divorce"]
# Features: every remaining column of the frame.
x = df.drop(columns=["Divorce"])

In [5]:
# Hold out a test set. Stratifying on y keeps the class ratio the same in both parts,
# and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=10,
)



In [6]:
y_train.value_counts() #sanity check: the stratified split should keep the near-even class ratio in the training labels

0    64
1    63
Name: Divorce, dtype: int64

In [7]:
from sklearn import tree as sktree
from sklearn.model_selection import cross_val_score

# Baseline: 5-fold cross-validated accuracy of a single decision tree on the full dataset.
# random_state is fixed because the tree's split search is randomized (features are
# permuted at each split), so an unseeded tree can give different scores on every run.
scores = cross_val_score(sktree.DecisionTreeClassifier(random_state = 0), x, y, cv = 5)
scores #per-fold validation accuracy (one value per fold), not training accuracy

array([0.91176471, 1.        , 1.        , 0.97058824, 0.91176471])

In [8]:
scores.mean()
#the mean cross-validation accuracy shows that a single decision tree already produces fairly accurate results on this data

0.9588235294117646

In [9]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 100 decision trees; each tree is fitted on a random draw
# sized at 80% of the training rows.
# The base tree is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` (scikit-learn 1.2; the old name was removed in 1.4).
# The positional form works on either side of that rename.
bag_model = BaggingClassifier(
    sktree.DecisionTreeClassifier(),
    n_estimators = 100,
    max_samples = 0.8,
    oob_score = True,  # also score each training row using only the trees that did not train on it
    random_state = 0
)

bag_model.fit(x_train, y_train)
bag_model.oob_score_
#out-of-bag accuracy: the fraction (not the count) of training rows predicted correctly
#by the trees whose sample did not contain them

0.9763779527559056

In [10]:
bag_model.score(x_test, y_test)
#the bagged model's accuracy on the held-out test split (0.977) is higher than the single tree's
#mean 5-fold cross-validation accuracy (0.959); note that 0.959 is a cross-validation score, not a
#training score, so the two numbers come from different evaluation protocols

0.9767441860465116

In [11]:
# A fresh, unfitted ensemble with the same settings as the fitted one, for cross-validation.
# NOTE: this rebinds `bag_model` (and `scores` below), so the previously fitted model and the
# single-tree fold scores are no longer reachable under those names after this cell runs.
# The base tree is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` (scikit-learn 1.2; the old name was removed in 1.4).
# The positional form works on either side of that rename.
bag_model = BaggingClassifier(
    sktree.DecisionTreeClassifier(),
    n_estimators = 100,
    max_samples = 0.8,
    oob_score = True,  # kept so the configuration matches the fitted model; the OOB score is not read here
    random_state = 0
)

scores = cross_val_score(bag_model, x, y, cv = 5)
scores.mean()
#the bagged model's mean 5-fold cross-validation accuracy (0.976) is comparable to its out-of-bag (0.976)
#and held-out test (0.977) accuracy, and higher than the single tree's cross-validation mean (0.959)

0.9764705882352942

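Overall, bagging raised the mean 5-fold cross-validation accuracy from about 0.959 for a single decision tree to about 0.976, which is consistent with the bagged model's out-of-bag (0.976) and held-out test (0.977) accuracy. With only 170 rows in the dataset, this should be read as a modest improvement rather than a conclusive one.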