In [1]:
##### SCIKIT-LEARN or SKLEARN #####
#
#  - Python Machine Learning Module
#  - Simple and efficient tools for data mining and data analysis
#  - Accessible to everybody, and reusable in various contexts
#  - Built on NumPy, SciPy, and matplotlib
#  - Open source, commercially usable - BSD license
#
# https://scikit-learn.org/stable/index.html
# https://scikit-learn.org/stable/user_guide.html 
# https://scikit-learn.org/stable/modules/classes.html
# https://scikit-learn.org/stable/auto_examples/

In [2]:
# The sklearn ML API is very consistent:
# 0) read data
# 1) explore data
# 2) preprocess data
# 3) setup data for consumption by ML model 
#     4) choose the model by importing the appropriate estimator class from sklearn [from sklearn import model]
#     5) instantiate the model with desired parameter values [ml=model()]
#     6) fit the model to the training data [ml.fit(Xtrain, ytrain)]
#     7) apply the model to test data [ypred=ml.predict(Xtest) or ml.transform(Xtest)]
# 8) evaluate model
# 9) deploy/use model

In [3]:
import pandas as pd
import numpy as np

In [4]:
##### Ensemble Methods
#
# https://scikit-learn.org/stable/modules/ensemble.html
#
# - The goal of ensemble methods is to combine the predictions of several base estimators 
#    built with a given learning algorithm in order to improve generalizability / robustness over 
#    that of a single estimator.
#
# - In Averaging Methods, the driving principle is to build several estimators independently 
#   and then to average their predictions. On average, the combined estimator is usually better 
#   than any single base estimator because its variance is reduced.
#   Examples include: 
#    Bagging (Bootstrap Aggregation) Methods
#    Forests of Randomized Trees (Random Forest, and Extremely Randomized (Extra) Trees)
#
# - By contrast, in Boosting Methods, base estimators are built sequentially and one tries to 
#   reduce the bias of the combined estimator. The motivation is to combine several weak 
#   models to produce a powerful ensemble. 
#   Examples include: 
#    AdaBoost (Adaptive Boosting)
#    Gradient Tree Boosting
#
# - Bagging methods work best with strong and complex models (e.g., fully developed decision 
#   trees), in contrast with Boosting methods which usually work best with weak models 
#   (e.g., shallow decision trees).

In [5]:
##### Bagging (Bootstrap Aggregation) Methods
#
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
#

# 0) read data
# synthetic data set: 10,000 samples, 10 features, 100 blob centers;
# seeded so that re-running the cell gives the same data
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)

# 1) explore data
# not demonstrating for this example

# 2) preprocess data
# not demonstrating for this example

# 3) setup data for ml model
# hold out 20% of the samples for testing (seeded for a reproducible split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 4-7) import, instantiate, train, test model
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
# bagging ensemble of KNeighborsClassifier base estimators,
# each built on random subsets of 50% of the samples drawn with replacement,
# and 50% of the features drawn without replacement.
# NOTE: the base estimator is passed positionally on purpose. Its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4; the positional form works with both.
bc = BaggingClassifier(KNeighborsClassifier(),
                       max_samples=0.5, bootstrap=True,
                       max_features=0.5, bootstrap_features=False,
                       random_state=0)
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
# mean accuracy on the held-out test set
print(bc.score(X_test, y_test))

1.0


In [6]:
##### Forests of Randomized Trees
#
# - The sklearn.ensemble module includes two averaging algorithms based on randomized 
#   decision trees: the RandomForest algorithm and the Extra-Trees method.
#

In [7]:
##### Random Forest Classifier
#
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#

# 0) read data
# synthetic data set: 10,000 samples, 10 features, 100 blob centers;
# seeded so that re-running the cell gives the same data
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)

# 1) explore data
# not demonstrating for this example

# 2) preprocess data
# not demonstrating for this example

# 3) setup data for ml model
# hold out 20% of the samples for testing (seeded for a reproducible split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 4-7) import, instantiate, train, test model
from sklearn.ensemble import RandomForestClassifier
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, so 'sqrt' is spelled out.
rfc = RandomForestClassifier(n_estimators=10,
                             max_features='sqrt', bootstrap=True,
                             max_depth=None, min_samples_split=2, min_samples_leaf=1,
                             class_weight=None, random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# mean accuracy on the held-out test set
print(rfc.score(X_test, y_test))

0.9995


In [8]:
##### Extra-Trees Classifier
#
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
#

# 0) read data
# synthetic data set: 10,000 samples, 10 features, 100 blob centers (seeded)
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)

# 1) explore data
# not demonstrating for this example

# 2) preprocess data
# not demonstrating for this example

# 3) setup data for ml model
# hold out 20% of the samples for testing (seeded for a reproducible split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 4-7) import, instantiate, train, test model
from sklearn.ensemble import ExtraTreesClassifier
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, so 'sqrt' is spelled out.
# bootstrap=False: every tree is fit on the whole training set.
etc = ExtraTreesClassifier(n_estimators=10,
                           max_features='sqrt', bootstrap=False,
                           max_depth=None, min_samples_split=2, min_samples_leaf=1,
                           class_weight=None, random_state=0)
etc.fit(X_train, y_train)
y_pred = etc.predict(X_test)
# mean accuracy on the held-out test set
print(etc.score(X_test, y_test))

1.0


In [9]:
##### AdaBoost (Adaptive Boosting)
#
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
#

# 0) read data
# synthetic data set: 10,000 samples, 10 features, 100 blob centers;
# seeded so that re-running the cell gives the same data
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=10000, n_features=10, centers=100, random_state=0)

# 1) explore data
# not demonstrating for this example

# 2) preprocess data
# not demonstrating for this example

# 3) setup data for ml model
# hold out 20% of the samples for testing (seeded for a reproducible split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 4-7) import, instantiate, train, test model
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with the library's default base estimator
abc = AdaBoostClassifier(n_estimators=100, learning_rate=0.01, random_state=0)

# Alternative (disabled): boost a linear SVC instead of the default base estimator.
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4).
# NOTE(review): recent scikit-learn releases deprecate the `algorithm` argument;
# drop it if your version warns about it or rejects it.
#from sklearn.svm import SVC
#svc = SVC(kernel='linear', C=1.0)
#abc = AdaBoostClassifier(svc, n_estimators=100, learning_rate=0.01, algorithm='SAMME', random_state=0)

abc.fit(X_train, y_train)
y_pred = abc.predict(X_test)
# mean accuracy on the held-out test set
print(abc.score(X_test, y_test))

0.977


In [10]:
##### Gradient Tree Boosting
#
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
#

# 0) read data
# synthetic data set: 10,000 samples, 10 features, 10 blob centers;
# seeded so that re-running the cell gives the same data
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=10000, n_features=10, centers=10, random_state=0)

# 1) explore data
# not demonstrating for this example

# 2) preprocess data
# not demonstrating for this example

# 3) setup data for ml model
# hold out 20% of the samples for testing (seeded for a reproducible split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 4-7) import, instantiate, train, test model
from sklearn.ensemble import GradientBoostingClassifier
# 100 boosting stages of shallow (max_depth=3) trees with a small learning rate
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01,
                                 max_depth=3, min_samples_split=2, min_samples_leaf=1,
                                 random_state=0)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
# mean accuracy on the held-out test set
print(gbc.score(X_test, y_test))

0.9975


In [11]:
##### Voting Classifiers
#
# https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier
#

# 0) read data
# synthetic data set: 10,000 samples, 10 features, 10 blob centers (seeded)
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=10000, n_features=10, centers=10, random_state=0)

# 1) explore data
# not demonstrating for this example

# 2) preprocess data
# not demonstrating for this example

# 3) setup data for ml model
# hold out 20% of the samples for testing (seeded for a reproducible split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 4-7) import, instantiate, train, test model
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
# three different base classifiers; the stochastic ones are seeded
dtc = DecisionTreeClassifier(max_depth=4, random_state=0)
knn = KNeighborsClassifier(n_neighbors=7)
# probability=True enables predict_proba, which soft voting requires
svc = SVC(gamma='scale', kernel='rbf', probability=True, random_state=0)
# Alternative (disabled): hard voting, i.e. a majority vote on predicted labels
#eclf = VotingClassifier(estimators=[('dtc', dtc), ('knn', knn), ('svc', svc)], voting='hard')
# soft voting: average the predicted class probabilities, weighting the
# decision tree and the SVC twice as much as the k-NN classifier
eclf = VotingClassifier(estimators=[('dtc', dtc), ('knn', knn), ('svc', svc)],
                        voting='soft', weights=[2, 1, 2])
eclf.fit(X_train, y_train)
y_pred = eclf.predict(X_test)
# mean accuracy on the held-out test set
print(eclf.score(X_test, y_test))

1.0
