In [5]:
########################################
# Combining Model Predictions into Ensemble Predictions
#
# http://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/
########################################

# The three most popular methods for combining 
#the predictions from different models are:

# Bagging: Building multiple models (typically 
#of the same type) from different subsamples of the training dataset.
#
# Boosting: Building multiple models (typically of 
#the same type) each of which learns to fix the prediction 
#errors of a prior model in the chain.
#
# Voting: Building multiple models (typically of 
#differing types) and simple statistics (like 
#calculating the mean) are used to combine predictions.


# Bagging Algorithms

# Bootstrap Aggregation or bagging 
#involves taking multiple samples from your 
#training dataset (with replacement) and 
#training a model for each sample

# 1. Bagged Decision Tree

# Bagging performs best with algorithms 
# that have high variance. A popular example 
# is the decision tree, often constructed without pruning.

# Bagged Decision Trees for Classification
import pandas
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Load the Pima Indians diabetes data; column names are supplied explicitly
# through `names` instead of being read from the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]  # the first eight columns are the input features
Y = array[:, 8]    # the ninth column ('class') is the target
seed = 7
# KFold does not shuffle by default, so a random_state never influenced the
# split; recent scikit-learn releases raise ValueError when random_state is
# given without shuffle=True. Omitting it keeps the same consecutive folds.
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator`; the positional form works with both.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
# Mean accuracy over the 10 folds.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("\nBagged Decision Trees for Classification\n")
print(results.mean())

# 2. Random Forest

# Random forest is an extension of bagged decision trees.

# Samples of the training dataset are taken with 
# replacement, but the trees are constructed in a 
# way that reduces the correlation between individual 
# classifiers. Specifically, rather than greedily 
# choosing the best split point in the construction 
# of the tree, only a random subset of features is 
# considered for each split.

# Random Forest Classification
import pandas
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
# Load the Pima Indians diabetes data; column names are supplied explicitly
# through `names` instead of being read from the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]  # the first eight columns are the input features
Y = array[:, 8]    # the ninth column ('class') is the target
seed = 7
num_trees = 100
max_features = 3  # number of features considered at each split
# KFold does not shuffle by default, so a random_state never influenced the
# split; recent scikit-learn releases raise ValueError when random_state is
# given without shuffle=True. Omitting it keeps the same consecutive folds.
kfold = model_selection.KFold(n_splits=10)
# `seed` was defined but never handed to the forest, which made the score
# change from run to run; seeding the model makes the result reproducible.
model = RandomForestClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=seed,
)
# Mean accuracy over the 10 folds.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("\nRandom Forest Classification\n")
print(results.mean())

# 3. Extra Trees

# Extra Trees are another modification of 
# bagging where random trees are constructed 
# from samples of the training dataset.

# Extra Trees Classification
import pandas
from sklearn import model_selection
from sklearn.ensemble import ExtraTreesClassifier
# Load the Pima Indians diabetes data; column names are supplied explicitly
# through `names` instead of being read from the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]  # the first eight columns are the input features
Y = array[:, 8]    # the ninth column ('class') is the target
seed = 7
num_trees = 100
max_features = 7  # number of features considered at each split
# KFold does not shuffle by default, so a random_state never influenced the
# split; recent scikit-learn releases raise ValueError when random_state is
# given without shuffle=True. Omitting it keeps the same consecutive folds.
kfold = model_selection.KFold(n_splits=10)
# `seed` was defined but never handed to the ensemble, which made the score
# change from run to run; seeding the model makes the result reproducible.
model = ExtraTreesClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=seed,
)
# Mean accuracy over the 10 folds.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("\nExtra Trees\n")
print(results.mean())

# Boosting Algorithms

# Boosting ensemble algorithms create a sequence 
# of models that attempt to correct the mistakes 
# of the models before them in the sequence.

# Once created, the models make predictions 
# which may be weighted by their demonstrated 
# accuracy and the results are combined to create a final output prediction.

# The two most common methods are:


# 1. AdaBoost

# AdaBoost was perhaps the first successful 
# boosting ensemble algorithm. It generally 
# works by weighting instances in the dataset 
# by how easy or difficult they are to classify, 
# allowing the algorithm to pay more or less attention 
# to them in the construction of subsequent models.

# AdaBoost Classification
import pandas
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
# Load the Pima Indians diabetes data; column names are supplied explicitly
# through `names` instead of being read from the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]  # the first eight columns are the input features
Y = array[:, 8]    # the ninth column ('class') is the target
seed = 7
num_trees = 30
# KFold does not shuffle by default, so a random_state never influenced the
# split; recent scikit-learn releases raise ValueError when random_state is
# given without shuffle=True. Omitting it keeps the same consecutive folds.
kfold = model_selection.KFold(n_splits=10)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
# Mean accuracy over the 10 folds.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("\nAdaBoost\n")
print(results.mean())


# 2. Stochastic Gradient Boosting

# Stochastic Gradient Boosting (also called Gradient 
# Boosting Machines) is one of the most sophisticated 
# ensemble techniques. It is also a technique that 
# is proving to be perhaps one of the best techniques 
# available for improving performance via ensembles.

# Stochastic Gradient Boosting Classification
import pandas
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier
# Load the Pima Indians diabetes data; column names are supplied explicitly
# through `names` instead of being read from the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]  # the first eight columns are the input features
Y = array[:, 8]    # the ninth column ('class') is the target
seed = 7
num_trees = 100
# KFold does not shuffle by default, so a random_state never influenced the
# split; recent scikit-learn releases raise ValueError when random_state is
# given without shuffle=True. Omitting it keeps the same consecutive folds.
kfold = model_selection.KFold(n_splits=10)
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
# Mean accuracy over the 10 folds.
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print("\nStochastic Gradient Boosting\n")
print(results.mean())

# Voting Ensemble

# Combining (optionally weighting) the predictions from several sub-models

# Voting Ensemble for Classification
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
# Load the Pima Indians diabetes data; column names are supplied explicitly
# through `names` instead of being read from the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]  # the first eight columns are the input features
Y = array[:, 8]    # the ninth column ('class') is the target
seed = 7
# KFold does not shuffle by default, so a random_state never influenced the
# split; recent scikit-learn releases raise ValueError when random_state is
# given without shuffle=True. Omitting it keeps the same consecutive folds.
kfold = model_selection.KFold(n_splits=10)
# create the sub models as (name, estimator) pairs
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
# Seed the tree so the ensemble's score is reproducible between runs; the
# original left it unseeded even though `seed` was available.
model2 = DecisionTreeClassifier(random_state=seed)
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
# Mean accuracy over the 10 folds.
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print("\nVoting Ensemble\n")
print(results.mean())


Bagged Decision Trees for Classification

0.770745044429

Random Forest Classification

0.768233082707

Extra Trees

0.76035543404

AdaBoost

0.76045796309

Stochastic Gradient Boosting

0.766900205058

Voting Ensemble

0.734295967191
