<a href="https://colab.research.google.com/github/naikshrey2308/ML_Labs/blob/main/ML_Lab_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Stacking

### Manual Implementation

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

Load the `Breast Cancer` Dataset

In [None]:
from sklearn.datasets import load_breast_cancer

# Pull the bundled dataset and expose its feature matrix as a DataFrame whose
# columns carry the dataset's own feature names.
dataset = load_breast_cancer()
inputs = pd.DataFrame(dataset.data, columns=dataset.feature_names)
inputs.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
# Single-column label frame built from the dataset's target vector.
# NOTE(review): scikit-learn documents this dataset's classes as 0 = malignant,
# 1 = benign, so a 1 in "Malign" would mean benign -- confirm the column name.
targets = pd.DataFrame({"Malign": dataset.target})
targets.head(n=5)

Unnamed: 0,Malign
0,0
1,0
2,0
3,0
4,0


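We observe that the features are on very different scales (for example, `mean area` is in the hundreds while `mean smoothness` is around 0.1). Therefore, we need to normalize the given data to a common range.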

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max from the full feature frame, then rescale it.
# Note that the scaler is fitted on all rows, i.e. before any train/test split.
scaler = MinMaxScaler().fit(inputs)
X = pd.DataFrame(scaler.transform(inputs), columns=inputs.columns)
# The labels need no scaling; y is just another name for the targets frame.
y = targets

In [None]:
# Preview the first rows of the scaled feature frame.
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,0.605518,...,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,0.141323,...,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,0.211247,...,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,1.0,...,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,0.186816,...,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


In [None]:
# Hold out 40% of the rows for testing. A fixed random_state makes the split
# (and every metric derived from it) repeatable across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, test_size=0.4, random_state=42
)
# y is a single-column frame, so its label arrays come back as (n, 1);
# flatten them to 1-D before handing them to the estimators.
y_train = y_train.ravel()
y_test = y_test.ravel()

To perform stacking, we'll use three base models in the top layer and one meta-model in the bottom layer. The top-layer models are Logistic Regression, Naive Bayes, and a Decision Tree; the bottom-layer model is Logistic Regression.

In [None]:
# Creating and training the top layer models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

model_1_1 = LogisticRegression()
model_1_2 = GaussianNB()
# Seed the tree so it is grown identically on every run of the notebook.
model_1_3 = DecisionTreeClassifier(max_depth=20, random_state=42)

# All three base models are fitted on the same training split.
model_1_1.fit(X_train, y_train)
model_1_2.fit(X_train, y_train)
model_1_3.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=20)

In [None]:
# Score the held-out split with each top-layer model, in model order.
pred_1_1, pred_1_2, pred_1_3 = (
    clf.predict(X_test) for clf in (model_1_1, model_1_2, model_1_3)
)

In [None]:
# Observing the metrics for the top layer models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

top_layer_cols = ["model_1_1", "model_1_2", "model_1_3"]

# One metrics record per top-layer model, in the same order as top_layer_cols.
performance_top = [
    {
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds),
        "Recall": recall_score(y_test, preds),
        "F1-score": f1_score(y_test, preds),
    }
    for preds in (pred_1_1, pred_1_2, pred_1_3)
]

top_layer_metrics = pd.DataFrame(data=performance_top, index=top_layer_cols)
top_layer_metrics

Unnamed: 0,Accuracy,Precision,Recall,F1-score
model_1_1,0.97807,0.965035,1.0,0.982206
model_1_2,0.95614,0.957143,0.971014,0.964029
model_1_3,0.925439,0.941606,0.934783,0.938182


Now, we'll try to build the bottom layer.

In [None]:
# Create the bottom-layer (meta) model. It is only instantiated here; it is
# fitted in a later cell, once its inputs have been built.
model_2 = LogisticRegression()

We need to create the dataset for the bottom layer. The inputs to the bottom layer are the outputs of the top layer.

In [None]:
# Generate the predictions for the whole dataset to supply to the bottom layer.
# Each output must come from its own top-layer model: using model_1_1 for all
# three would hand the bottom layer three identical columns, so it would not
# be stacking anything.
out_1_1 = model_1_1.predict(X.values)
out_1_2 = model_1_2.predict(X.values)
out_1_3 = model_1_3.predict(X.values)

In [None]:
# Bottom-layer feature frame: one column per top-layer model, one row per sample.
X_2 = pd.DataFrame({
    "model_1_1": out_1_1,
    "model_1_2": out_1_2,
    "model_1_3": out_1_3,
})
X_2.head(n=5)

Unnamed: 0,model_1_1,model_1_2,model_1_3
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [None]:
# Split the top-layer outputs 80/20 for the bottom-layer model. The split is
# seeded so the bottom-layer metrics are repeatable between runs.
X_2_train, X_2_test, y_2_train, y_2_test = train_test_split(
    X_2.values, y.values, test_size=0.2, random_state=42
)
# Flatten the (n, 1) label arrays to 1-D for fit() and the metric functions.
y_2_train = y_2_train.ravel()
y_2_test = y_2_test.ravel()

In [None]:
# Fit the bottom-layer model on the top-layer outputs, then score its test split
# (fit() returns the fitted estimator, so the calls can be chained).
pred_2 = model_2.fit(X_2_train, y_2_train).predict(X_2_test)

In [None]:
# Evaluate the bottom-layer predictions with the same four metrics as the top layer.
performance_bottom = {
    label: scorer(y_2_test, pred_2)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )
}

performance_bottom

{'Accuracy': 0.9824561403508771,
 'Precision': 0.9846153846153847,
 'Recall': 0.9846153846153847,
 'F1-score': 0.9846153846153847}

### Using `sklearn`

In [None]:
from sklearn.ensemble import StackingClassifier

# Same three base learners as the manual implementation above, as (name, estimator)
# pairs. The tree is seeded so the stacked model is repeatable between runs.
models = [("lr", LogisticRegression()),
          ("nb", GaussianNB()),
          ("dt", DecisionTreeClassifier(max_depth=20, random_state=42))]

# Logistic regression combines the base learners' outputs.
model = StackingClassifier(estimators=models, final_estimator=LogisticRegression())

In [None]:
# 80/20 split for the sklearn stacking model, seeded so its metrics are
# repeatable between runs.
X_main_train, X_main_test, y_main_train, y_main_test = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)
# Flatten the (n, 1) label arrays to 1-D for fit() and the metric functions.
y_main_train = y_main_train.ravel()
y_main_test = y_main_test.ravel()

In [None]:
# Fit the stacking classifier and score its test split in one chained call
# (fit() returns the fitted estimator).
pred = model.fit(X_main_train, y_main_train).predict(X_main_test)

In [None]:
# Evaluate the sklearn stacking classifier on its test split.
performance = {
    label: scorer(y_main_test, pred)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )
}

performance

{'Accuracy': 0.9473684210526315,
 'Precision': 0.9285714285714286,
 'Recall': 0.9848484848484849,
 'F1-score': 0.9558823529411765}

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Both the estimator and the 80/20 split are seeded so the reported metrics
# are repeatable between runs.
model_adaboost = AdaBoostClassifier(n_estimators=50, learning_rate=0.2, random_state=42)
Xada_train, Xada_test, yada_train, yada_test = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)
# Flatten the (n, 1) label arrays to 1-D for fit() and the metric functions.
yada_train = yada_train.ravel()
yada_test = yada_test.ravel()
model_adaboost.fit(Xada_train, yada_train)

AdaBoostClassifier(learning_rate=0.2)

In [None]:
# Predict labels for the held-out AdaBoost test split.
pred_adaboost = model_adaboost.predict(Xada_test)

In [None]:
# Evaluate the AdaBoost classifier on its test split.
performance_ada = {
    label: scorer(yada_test, pred_adaboost)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )
}

performance_ada

{'Accuracy': 0.9649122807017544,
 'Precision': 0.95,
 'Recall': 1.0,
 'F1-score': 0.9743589743589743}

Diabetes Dataset

In [4]:
from sklearn.datasets import load_diabetes

# NOTE: this rebinds dataset, X and y, which previously held the breast-cancer
# data; cells below this point see the diabetes arrays under those names.
dataset = load_diabetes()
X, y = dataset.data, dataset.target

In [8]:
from sklearn.ensemble import AdaBoostRegressor

# Both the regressor and the 80/20 split are seeded so the reported error is
# repeatable between runs.
model_adaboost = AdaBoostRegressor(n_estimators=50, learning_rate=0.2, random_state=42)
Xada_train, Xada_test, yada_train, yada_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
yada_train = yada_train.ravel()
yada_test = yada_test.ravel()
model_adaboost.fit(Xada_train, yada_train)

AdaBoostRegressor(learning_rate=0.2)

In [9]:
# Predict targets for the held-out test split with the AdaBoost regressor.
pred_adaboost = model_adaboost.predict(Xada_test)

In [11]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the AdaBoost regressor on its test split.
mean_squared_error(y_true=yada_test, y_pred=pred_adaboost)

3469.797073927866

Concrete.csv Dataset

In [21]:
import pandas as pd
from sklearn.metrics import mean_squared_error

In [16]:
# Read the concrete data from a hard-coded path.
# NOTE(review): the /content/drive prefix presumably needs Google Drive mounted
# in Colab -- confirm before running elsewhere.
dataset = pd.read_csv("/content/drive/MyDrive/Contrete.csv")
# Show the column labels to see which fields are available.
dataset.columns

Index(['cement', 'slag', 'flyash', 'water', 'superplasticizer',
       'coarseaggregate', 'fineaggregate', 'age', 'csMPa'],
      dtype='object')

The target column is `csMPa`; we predict it from the remaining columns.

In [22]:
# Features are every column except the csMPa target.
X = dataset.drop(["csMPa"], axis=1)
y = dataset["csMPa"]
# Both the 80/20 split and the regressor are seeded so the reported error is
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, test_size=0.2, random_state=42
)
model = AdaBoostRegressor(n_estimators=100, learning_rate=0.05, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
mean_squared_error(y_test, pred)

73.10090796368603

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# X and y were rebound to regression data (diabetes, then concrete) in the
# cells above, so on a top-to-bottom run they no longer hold the breast-cancer
# classification data and the classifier would be fitted on a continuous target.
# Rebuild the scaled features and 1-D labels from `inputs` and `targets`,
# which are never rebound, using the scaler that was fitted on `inputs`.
X_rf = scaler.transform(inputs)
y_rf = targets.values.ravel()

# The forest and the 80/20 split are seeded so the metrics are repeatable.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
Xrf_train, Xrf_test, yrf_train, yrf_test = train_test_split(
    X_rf, y_rf, test_size=0.2, random_state=42
)
model_rf.fit(Xrf_train, yrf_train)

RandomForestClassifier()

In [None]:
# Predict labels for the held-out random-forest test split.
pred_rf = model_rf.predict(Xrf_test)

In [None]:
# Evaluate the random forest on its test split.
performance_rf = {
    label: scorer(yrf_test, pred_rf)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )
}

performance_rf

{'Accuracy': 0.9473684210526315,
 'Precision': 0.9324324324324325,
 'Recall': 0.9857142857142858,
 'F1-score': 0.9583333333333333}