In [None]:
# load required libraries
import pandas as pd
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Load the Pima Indians diabetes CSV and split it into predictors and target.
# Column labels are supplied explicitly via `names` rather than read from the file.
names = [
    "preg",
    "plas",
    "pres",
    "skin",
    "test",
    "mass",
    "pedi",
    "age",
    "class",
]
filename = "data/pima-indians-diabetes.data.csv"
data = pd.read_csv(filename, names=names)
array = data.values
# The first eight columns are the inputs; the ninth ("class") is the label.
X, y = array[:, :8], array[:, 8]

### **Bagging Algorithms**

### Bagged Decision Trees

In [4]:
# Bagged Decision Trees for Classification
# A bagging ensemble of `num_trees` decision trees, scored with shuffled
# 10-fold cross-validation; both the ensemble and the splitter are seeded with 7.
num_trees = 100
cart = DecisionTreeClassifier()
model = BaggingClassifier(
    estimator=cart,
    n_estimators=num_trees,
    random_state=7,
)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(estimator=model, X=X, y=y, cv=kfold)
# Report mean accuracy across folds with its standard deviation in parentheses.
print(f"Accuracy: {results.mean():.3f} ({results.std():.3f})")

Accuracy: 0.758 (0.039)


### Random Forest

In [6]:
# Random Forest Classification
# 100 trees, each split considering 3 features, scored with shuffled
# 10-fold cross-validation.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
# Seed the forest as well as the CV splitter: an unseeded forest draws different
# random samples/feature subsets on every run, so the reported accuracy could
# not be reproduced after a kernel restart. Seed 7 matches the other cells.
model = RandomForestClassifier(n_estimators=100, max_features=3, random_state=7)
results = cross_val_score(model, X, y, cv=kfold)
# Report mean accuracy across folds with its standard deviation in parentheses.
print(f"Accuracy: {results.mean():.3f} ({results.std():.3f})")

Accuracy: 0.776 (0.057)


### Extra Trees

In [9]:
# Extra Trees Classification
# 100 extremely-randomized trees, each split considering 7 features, scored
# with shuffled 10-fold cross-validation.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
# Seed the ensemble as well as the CV splitter: without random_state the
# randomized split selection differs on every run, so the reported accuracy
# could not be reproduced after a kernel restart. Seed 7 matches the other cells.
model = ExtraTreesClassifier(n_estimators=100, max_features=7, random_state=7)
results = cross_val_score(model, X, y, cv=kfold)
# Report mean accuracy across folds with its standard deviation in parentheses.
print(f"Accuracy: {results.mean():.3f} ({results.std():.3f})")

Accuracy: 0.762 (0.055)


### **Boosting Algorithms**

### AdaBoost

In [11]:
# AdaBoost Classification
# Boosted ensemble of 30 estimators, scored with shuffled 10-fold
# cross-validation; both the model and the splitter are seeded with 7.
model = AdaBoostClassifier(random_state=7, n_estimators=30)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(estimator=model, X=X, y=y, cv=kfold)
# Report mean accuracy across folds with its standard deviation in parentheses.
print(f"Accuracy: {results.mean():.3f} ({results.std():.3f})")

Accuracy: 0.755 (0.065)


### Stochastic Gradient Boosting

In [13]:
# Stochastic Gradient Boosting Classification
# Gradient-boosted ensemble with 100 boosting stages, scored with shuffled
# 10-fold cross-validation; both the model and the splitter are seeded with 7.
# NOTE(review): no `subsample` argument is passed, so whether each stage is fit
# on a random subset of rows depends on the library default — confirm that the
# "stochastic" variant is what is intended here.
model = GradientBoostingClassifier(random_state=7, n_estimators=100)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(estimator=model, X=X, y=y, cv=kfold)
# Report mean accuracy across folds with its standard deviation in parentheses.
print(f"Accuracy: {results.mean():.3f} ({results.std():.3f})")

Accuracy: 0.758 (0.056)


### **Voting Ensemble**

In [15]:
# Voting Ensemble for Classification
# Combines three different classifiers and scores the combined model with
# shuffled 10-fold cross-validation.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)

# create the sub models
model1 = LogisticRegression(solver="liblinear")
# Seed the tree: an unseeded decision tree can produce different splits from
# run to run, which made the ensemble's reported accuracy non-reproducible.
# Seed 7 matches the other cells in this notebook.
model2 = DecisionTreeClassifier(random_state=7)
model3 = SVC(gamma="auto")

# (name, estimator) pairs in the order they are handed to the ensemble
estimators = [
    ("logistic", model1),
    ("cart", model2),
    ("svm", model3),
]

# create the ensemble model (voting strategy is left at the library default)
ensemble = VotingClassifier(estimators=estimators)
results = cross_val_score(ensemble, X, y, cv=kfold)
# Report mean accuracy across folds with its standard deviation in parentheses.
print(f"Accuracy: {results.mean():.3f} ({results.std():.3f})")

Accuracy: 0.748 (0.050)
