# Gradient Boosting Classifier
* Gradient Boosting combines the strengths of multiple weak learners (typically Decision Trees) to create a stronger predictive model. It takes a sequential, additive approach to correct the errors of previous models and improves the accuracy. The following is how Gradient Boosting works:
    1. Take a shallow decision tree and train it on the data
    2. Make prediction using this weak learner
    3. Assign weights to data points based on the magnitude of pseudo-residuals ($r_i = y_i - p_i$ where $r_i$ is the ith residual, $y_i$ is the class label of the ith data point, and $p_i$ is the predicted probability of that class for the ith data point)
    4. Another weak learner is fitted on this weighted data, focusing more on points that were poorly predicted
    5. Repeat 3-4 until stopping criteria is met
    6. Final predictions is generated using a weighted average of all the weak learners - weights are usually determined by the performance of each weak learner on the training data and pseudo-residuals

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Generate data
#X, y = make_classification(n_samples=10000, n_features=2, n_informative=2, n_redundant=0, random_state=42)

## Split the Data into Training and Testing Sets

In [3]:
# Load the test data into a Pandas DataFrame
df_test = pd.read_csv("https://machine-failure-data-20230822-craiguo.s3.us-west-2.amazonaws.com/test.csv")
df_test.drop(columns=['id', 'Product ID'], inplace=True)

# Display sample data
df_test.head(10)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,L,302.3,311.5,1499,38.0,60,0,0,0,0,0
1,L,301.7,311.0,1713,28.8,17,0,0,0,0,0
2,L,301.3,310.4,1525,37.7,96,0,0,0,0,0
3,M,300.1,309.6,1479,47.6,5,0,0,0,0,0
4,M,303.4,312.3,1515,41.3,114,0,0,0,0,0
5,L,299.1,308.3,1489,38.2,139,0,0,0,0,0
6,L,299.8,309.1,1429,39.9,207,0,0,0,0,0
7,L,302.7,312.4,1540,46.2,17,0,0,0,0,0
8,H,300.7,311.9,1613,36.0,12,0,0,0,0,0
9,L,300.5,311.4,1708,32.2,57,0,0,0,0,0


In [4]:
# Load the test data into a Pandas DataFrame
df_train = pd.read_csv("https://machine-failure-data-20230822-craiguo.s3.us-west-2.amazonaws.com/train.csv")
df_train.drop(columns=['id', 'Product ID'], inplace=True)

# Display sample data
df_train.head(10)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
5,M,298.4,308.9,1429,42.1,65,0,0,0,0,0,0
6,L,299.6,311.0,1413,42.9,156,0,0,0,0,0,0
7,L,298.7,310.1,1609,38.1,67,0,0,0,0,0,0
8,L,297.7,308.8,1578,35.2,13,0,0,0,0,0,0
9,L,300.5,312.3,1447,53.3,98,0,0,0,0,0,0


In [5]:
# Look at nunique counts for train
df_train.nunique()

Type                         3
Air temperature [K]         95
Process temperature [K]     81
Rotational speed [rpm]     952
Torque [Nm]                611
Tool wear [min]            246
Machine failure              2
TWF                          2
HDF                          2
PWF                          2
OSF                          2
RNF                          2
dtype: int64

In [6]:
# Look at nunique counts for test
df_test.nunique()

Type                         3
Air temperature [K]         92
Process temperature [K]     84
Rotational speed [rpm]     946
Torque [Nm]                595
Tool wear [min]            246
TWF                          2
HDF                          2
PWF                          2
OSF                          2
RNF                          2
dtype: int64

In [7]:
# concatenate the dataframes to use pd.get_dummies() on 'Product ID' and 'Type'

combined_df = pd.concat([df_train, df_test], sort=False)
combined_df.tail()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
90949,L,302.3,311.4,1484,40.4,15,,0,0,0,0,0
90950,L,297.9,309.8,1542,33.8,31,,0,0,0,0,0
90951,L,295.6,306.2,1501,41.4,187,,0,0,0,0,0
90952,L,298.1,307.8,1534,40.3,69,,0,0,0,0,0
90953,L,303.5,312.8,1534,36.1,92,,0,0,0,0,0


In [8]:
#use pd.get_dummies() for 'Type' and split test and train sets again

dummies_df = pd.get_dummies(combined_df)

col = df_train.shape[0]

dummy_train = dummies_df.iloc[0:col]
dummy_test = dummies_df.iloc[col:]

### Step 2: Create features and target arrays

In [9]:
# Split our preprocessed data into our features and target arrays

y = dummy_train['Machine failure']
X = dummy_train.drop(columns = 'Machine failure')

In [10]:
oversample = RandomOverSampler(random_state=1, sampling_strategy=0.5)
X, y = oversample.fit_resample(X, y)

In [11]:
# Review the y variable Series
y[:5]

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Machine failure, dtype: float64

In [12]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF,Type_H,Type_L,Type_M
0,300.6,309.6,1596,36.1,140,0,0,0,0,0,0,1,0
1,302.6,312.1,1759,29.1,200,0,0,0,0,0,0,0,1
2,299.3,308.5,1805,26.5,25,0,0,0,0,0,0,1,0
3,301.0,310.9,1524,44.3,197,0,0,0,0,0,0,1,0
4,298.0,309.0,1641,35.4,34,0,0,0,0,0,0,0,1


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [13]:
# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [17]:
from warnings import filterwarnings
filterwarnings('ignore')

# Create a Gradient Boosting Regressor
model = GradientBoostingClassifier()

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_gb = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_gb.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Best Estimators: {best_gb.n_estimators}")
print(f"Best Learning Rate: {best_gb.learning_rate}")
print(f"Best Max Depth: {best_gb.max_depth}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.837 total time=   7.8s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.837 total time=   7.7s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.835 total time=   7.7s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.840 total time=   7.9s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=0.840 total time=   7.5s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.911 total time=  15.3s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.912 total time=  14.7s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.910 total time=  14.8s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.912 total time=  14.9s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=0.913 to

[CV 2/5] END learning_rate=0.1, max_depth=7, n_estimators=200;, score=0.994 total time= 1.1min
[CV 3/5] END learning_rate=0.1, max_depth=7, n_estimators=200;, score=0.993 total time= 1.1min
[CV 4/5] END learning_rate=0.1, max_depth=7, n_estimators=200;, score=0.993 total time= 1.1min
[CV 5/5] END learning_rate=0.1, max_depth=7, n_estimators=200;, score=0.994 total time= 1.1min
[CV 1/5] END learning_rate=0.2, max_depth=3, n_estimators=50;, score=0.940 total time=   7.1s
[CV 2/5] END learning_rate=0.2, max_depth=3, n_estimators=50;, score=0.940 total time=   7.2s
[CV 3/5] END learning_rate=0.2, max_depth=3, n_estimators=50;, score=0.937 total time=   7.0s
[CV 4/5] END learning_rate=0.2, max_depth=3, n_estimators=50;, score=0.938 total time=   7.0s
[CV 5/5] END learning_rate=0.2, max_depth=3, n_estimators=50;, score=0.940 total time=   7.1s
[CV 1/5] END learning_rate=0.2, max_depth=3, n_estimators=100;, score=0.944 total time=  14.7s
[CV 2/5] END learning_rate=0.2, max_depth=3, n_estimato

In [18]:
import pickle

In [19]:
model_gboost = "model_gboost_5.pkl"
with open (model_gboost, "wb") as filename: 
    pickle.dump(best_gb, filename)

In [20]:
model_gboost = "model_gboost_5.pkl"
with open(model_gboost, "rb") as filename:
    model = pickle.load(filename)

In [27]:
val_predict = model.predict(X_test)

val_predict_df = pd.DataFrame(val_predict, columns = ['predict'])

val_predict_df.head(10)

Unnamed: 0,predict
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,1.0
6,0.0
7,1.0
8,1.0
9,0.0


In [None]:
# Plot the decision boundary
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                     np.arange(y_min, y_max, 0.01))

Z = best_gb.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=y, marker='o', edgecolor='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title(f'Best Gradient Boosting Classifier, Accuracy: {accuracy:.2f}')
plt.show()