## Building an Ensemble Classifier

In [171]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris  = datasets.load_iris()
# X = iris["data"]
X = iris["data"][:, (0,1)] # otherwise too easy...
y = iris["target"]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=42)

In [172]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# Don't need to parameterise any of the following 
# but could do so to improve performance
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],voting='hard')
voting_clf.fit(Xtrain, ytrain)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [180]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(Xtrain, ytrain)
    ypred = clf.predict(Xtest)
    print(clf.__class__.__name__, accuracy_score(ytest, ypred))

LogisticRegression 0.8421052631578947
RandomForestClassifier 0.8157894736842105
SVC 0.8421052631578947
VotingClassifier 0.8421052631578947


In [303]:
# exploring the impact of scaling...
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
std_scaler = StandardScaler()
svm_S_clf = SVC(kernel="rbf", C=100, gamma = 0.0001)
X_scaled = std_scaler.fit_transform(X)
XStrain, XStest, ytrain, ytest = train_test_split(X_scaled, y, test_size=0.25, random_state=42)
svm_S_clf.fit(XStrain, ytrain)
from sklearn.metrics import accuracy_score
ypred = svm_S_clf.predict(XStest)
print(accuracy_score(ytest, ypred))

0.868421052631579


#### Soft voting

In [82]:
svm_clf = SVC(probability=True)
VotingClassifier(estimators=[('lr', LogisticRegression()),('rf', RandomForestClassifier()), ('svc', SVC())], voting='soft')

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())],
                 voting='soft')

In [83]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(Xtrain, ytrain)
    ypred = clf.predict(Xtest)
    print(clf.__class__.__name__, accuracy_score(ytest, ypred))

LogisticRegression 0.8421052631578947
RandomForestClassifier 0.7631578947368421
SVC 0.8421052631578947
VotingClassifier 0.8421052631578947


### Bagging

In [103]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Reduced number of estimators and dt size
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=25,
    max_samples=50, bootstrap=True, n_jobs=-1, random_state=42)

bag_clf.fit(Xtrain, ytrain)
ypred = bag_clf.predict(Xtest)
print(accuracy_score(ytest, ypred))

0.7368421052631579


In [105]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(Xtrain, ytrain)
ypred = dt_clf.predict(Xtest)
print(accuracy_score(ytest, ypred))

0.6578947368421053


#### OOB Score

In [107]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=25, max_samples=50,
    bootstrap=True, n_jobs=-1, oob_score=True, random_state=42)

bag_clf.fit(Xtrain, ytrain)
bag_clf.oob_score_

0.6607142857142857

### Random Forests

In [127]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=25, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(Xtrain, ytrain)
ypred = rnd_clf.predict(Xtest)
print(accuracy_score(ytest, ypred))

0.7894736842105263


### Feature Importance

In [128]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500, 
                      max_leaf_nodes=16, n_jobs=-1)
from sklearn import datasets
iris = datasets.load_iris()
from sklearn.model_selection import train_test_split
X, y = datasets.load_iris(return_X_y=True)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
rnd_clf.fit(Xtrain, ytrain)
y_pred_rf = rnd_clf.predict(Xtest)
rnd_clf.feature_importances_

array([0.10499478, 0.03255396, 0.40800274, 0.45444852])

In [129]:
for name, score in zip(iris["feature_names"], 
                       rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10499478132668698
sepal width (cm) 0.032553955710342974
petal length (cm) 0.4080027410705183
petal width (cm) 0.45444852189245166


### Re-Load Original iris data again...

In [130]:
iris  = datasets.load_iris()
X = iris["data"][:, (0,1)] # otherwise too easy...
# X = iris["data"]
y = iris["target"]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=42)

### Boosting

#### AdaBoost

In [153]:
from sklearn.ensemble import AdaBoostClassifier
# Can explore parameter settings, base models etc.
adb_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=50)
adb_clf.fit(Xtrain, ytrain)
ypred = adb_clf.predict(Xtest)
print(accuracy_score(ytest, ypred))

0.8157894736842105


#### Gradient Boosting

In [269]:
from sklearn.ensemble import GradientBoostingClassifier
# Experiment with parameters - esp n_estimators and learning_rate
gb_clf = GradientBoostingClassifier(n_estimators=3, learning_rate=0.5, max_depth=1, random_state=0)
gb_clf.fit(Xtrain, ytrain)
ypred = gb_clf.predict(Xtest)
print(accuracy_score(ytest, ypred))

0.8157894736842105


## Principal Components Analysis

In [161]:
from sklearn.decomposition import PCA
X, y = datasets.load_iris(return_X_y=True)
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)
X_reduced = pca.fit_transform(X)
# Take a look a X_reduced

In [162]:
print(pca.explained_variance_ratio_)

[0.92461872 0.05306648]


In [167]:
pca = PCA(n_components=0.50)
X_reduced = pca.fit_transform(X)
print(pca.explained_variance_ratio_)

[0.92461872]


## One-Hot Encoding

In [202]:
import pandas as pd
df = pd.DataFrame({'no_legs': [2, 4, 6, 0],
                   'no_wings': [2, 0, 0, 0],
                   'species': ['bird', 'mammal', 'insect', 'fish'],
                   'cuteness_factor': [4, 7, 1, 2]})

In [203]:
zoo_df = df.sample(frac=2.5, replace=True, random_state=1)
zoo_df

Unnamed: 0,no_legs,no_wings,species,cuteness_factor
1,4,0,mammal,7
3,0,0,fish,2
0,2,2,bird,4
0,2,2,bird,4
3,0,0,fish,2
1,4,0,mammal,7
3,0,0,fish,2
1,4,0,mammal,7
3,0,0,fish,2
0,2,2,bird,4


In [204]:
animal_cat = zoo_df[["species"]]
animal_cat

Unnamed: 0,species
1,mammal
3,fish
0,bird
0,bird
3,fish
1,mammal
3,fish
1,mammal
3,fish
0,bird


### Ordinal Encoder

In [188]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
animal_cat_encoded = ordinal_encoder.fit_transform(animal_cat)
animal_cat_encoded

array([[2.],
       [1.],
       [0.],
       [0.],
       [1.],
       [2.],
       [1.],
       [2.],
       [1.],
       [0.]])

In [205]:
animal_categories = ordinal_encoder.categories_
animal_categories

[array(['bird', 'fish', 'mammal'], dtype=object)]

### One-Hot Encoder

In [206]:
from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
animal_cat_1hot = cat_encoder.fit_transform(animal_cat)
animal_categories = ordinal_encoder.categories_
animal_cat_1hot

<10x3 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [207]:
animal_cat_1hot.toarray()

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [208]:
cat_encoder = OneHotEncoder(sparse=False)
animal_cat_1hot = cat_encoder.fit_transform(animal_cat)
animal_cat_1hot

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [209]:
enc_data = pd.DataFrame(animal_cat_1hot)
enc_data.columns = animal_categories[0]
enc_data.index = zoo_df.index

enc_data  #Have a look at the data

Unnamed: 0,bird,fish,mammal
1,0.0,0.0,1.0
3,0.0,1.0,0.0
0,1.0,0.0,0.0
0,1.0,0.0,0.0
3,0.0,1.0,0.0
1,0.0,0.0,1.0
3,0.0,1.0,0.0
1,0.0,0.0,1.0
3,0.0,1.0,0.0
0,1.0,0.0,0.0


In [210]:
zoo_encoded_df = pd.concat([zoo_df, enc_data], axis =1, ignore_index = False)
zoo_encoded_df

Unnamed: 0,no_legs,no_wings,species,cuteness_factor,bird,fish,mammal
1,4,0,mammal,7,0.0,0.0,1.0
3,0,0,fish,2,0.0,1.0,0.0
0,2,2,bird,4,1.0,0.0,0.0
0,2,2,bird,4,1.0,0.0,0.0
3,0,0,fish,2,0.0,1.0,0.0
1,4,0,mammal,7,0.0,0.0,1.0
3,0,0,fish,2,0.0,1.0,0.0
1,4,0,mammal,7,0.0,0.0,1.0
3,0,0,fish,2,0.0,1.0,0.0
0,2,2,bird,4,1.0,0.0,0.0


In [211]:
zoo_encoded_df.drop(columns=['species'], inplace = True)
zoo_encoded_df

Unnamed: 0,no_legs,no_wings,cuteness_factor,bird,fish,mammal
1,4,0,7,0.0,0.0,1.0
3,0,0,2,0.0,1.0,0.0
0,2,2,4,1.0,0.0,0.0
0,2,2,4,1.0,0.0,0.0
3,0,0,2,0.0,1.0,0.0
1,4,0,7,0.0,0.0,1.0
3,0,0,2,0.0,1.0,0.0
1,4,0,7,0.0,0.0,1.0
3,0,0,2,0.0,1.0,0.0
0,2,2,4,1.0,0.0,0.0


In [212]:
zoo_encoded_df.drop(columns=['mammal'], inplace = True)
zoo_encoded_df

Unnamed: 0,no_legs,no_wings,cuteness_factor,bird,fish
1,4,0,7,0.0,0.0
3,0,0,2,0.0,1.0
0,2,2,4,1.0,0.0
0,2,2,4,1.0,0.0
3,0,0,2,0.0,1.0
1,4,0,7,0.0,0.0
3,0,0,2,0.0,1.0
1,4,0,7,0.0,0.0
3,0,0,2,0.0,1.0
0,2,2,4,1.0,0.0


### Alternatively using get_dummies

In [213]:
import pandas as pd
df = pd.DataFrame({'no_legs': [2, 4, 6, 0],
                   'no_wings': [2, 0, 0, 0],
                   'species': ['bird', 'mammal', 'insect', 'fish'],
                   'cuteness_factor': [4, 7, 1, 2]})
zoo_df = df.sample(frac=2.5, replace=True, random_state=1)
animal_cat = zoo_df[["species"]]
pd.get_dummies(zoo_df[["species"]])

Unnamed: 0,species_bird,species_fish,species_mammal
1,0,0,1
3,0,1,0
0,1,0,0
0,1,0,0
3,0,1,0
1,0,0,1
3,0,1,0
1,0,0,1
3,0,1,0
0,1,0,0


#### Drop one of the columns

In [214]:
pd.get_dummies(zoo_df[["species"]], drop_first=True)

Unnamed: 0,species_fish,species_mammal
1,0,1
3,1,0
0,0,0
0,0,0
3,1,0
1,0,1
3,1,0
1,0,1
3,1,0
0,0,0


#### Change the data frame

In [215]:
pd.get_dummies(zoo_df, columns=["species"], drop_first=True)

Unnamed: 0,no_legs,no_wings,cuteness_factor,species_fish,species_mammal
1,4,0,7,0,1
3,0,0,2,1,0
0,2,2,4,0,0
0,2,2,4,0,0
3,0,0,2,1,0
1,4,0,7,0,1
3,0,0,2,1,0
1,4,0,7,0,1
3,0,0,2,1,0
0,2,2,4,0,0
