In [1]:
import pandas as pd 
import numpy as np 

In [3]:
df = pd.read_csv('data/ensemble_data.csv', sep='\t')
df.head()

Unnamed: 0,batches,force,thickness,life
0,4,188.17,1.28,normal
1,6,182.34,1.45,low
2,5,219.78,0.75,high
3,4,201.97,1.31,low
4,3,206.81,0.51,high


In [10]:
df['life'].value_counts()

life
low       184
normal    173
high      143
Name: count, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from matplotlib import pyplot as plt
import seaborn as sns

X = df.drop('life', axis=1)
y = df['life']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
print(cm)

0.69
[[12 10  0]
 [ 8 24  4]
 [ 6  3 33]]


In [13]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=300, min_samples_leaf=2, max_depth=5)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
print(cm)


0.58
[[ 1 19  2]
 [ 6 22  8]
 [ 0  7 35]]


In [20]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 2],
    'min_samples_split': [0.05, 0.10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

gs = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

print(gs.best_params_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=0.05, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=0.05, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=0.05, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=0.05, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=0.05, n_estimators=200; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=0.05, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=0.05, n_estimators=200; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=0.05

In [None]:
print(gs.best_score_)
gs.fit(X_train, y_train)

In [22]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

base = [
    ('rf', RandomForestClassifier(n_estimators=300, min_samples_leaf=2, max_depth=5)),
    ('dt', DecisionTreeClassifier()),
    ('svc', SVC())
]

meta_model = LogisticRegression()

stack = StackingClassifier(estimators=base, final_estimator=meta_model)

stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)


In [23]:
print(accuracy)
print(cm)

0.61
[[ 0 21  1]
 [ 3 28  5]
 [ 1  8 33]]
