https://stackabuse.com/gradient-boosting-classifiers-in-python-with-scikit-learn/

In [4]:
import pandas as pd
import sklearn

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [7]:
#  Read the CSV and Perform Basic Data Cleaning
df1 = pd.read_csv('../data/winequality-red.csv', sep=';')
df2 = pd.read_csv('../data/winequality-white.csv', sep=';')
# Drop the null columns where all values are null
df1 = df1.dropna(axis='columns', how='all')
df2 = df2.dropna(axis='columns', how='all')
# Drop the null rows
df1 = df1.dropna()
df2 = df2.dropna()
# Merge two dataframes
df = df1.merge(df2, how='outer')

# pick white/red
# df = df2 # pick white
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6


In [8]:
X = df.drop(columns=["quality"])
y = df["quality"]
print(X.shape, y.shape)

(6495, 11) (6495,)


In [13]:
# Let's split the data into training and testing sets:
# X_train = df.values[0:5000]
# X_test = df.values[5000:]

In [16]:
# We'll now scale our data by creating an instance of the scaler and scaling it:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
# Now we can split the data into training and testing sets. Let's also set a seed (so you can replicate the results) and select the percentage of the data for testing on:
# state = 12  
# test_size = 0.30  
  
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1)

In [20]:
# Now we can try setting different learning rates, so that we can compare the performance of the classifier's performance at different learning rates.

lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_val, y_val)))

Learning rate:  0.05
Accuracy score (training): 0.542
Accuracy score (validation): 0.523
Learning rate:  0.075
Accuracy score (training): 0.544
Accuracy score (validation): 0.520
Learning rate:  0.1
Accuracy score (training): 0.548
Accuracy score (validation): 0.522
Learning rate:  0.25
Accuracy score (training): 0.583
Accuracy score (validation): 0.549
Learning rate:  0.5
Accuracy score (training): 0.607
Accuracy score (validation): 0.554
Learning rate:  0.75
Accuracy score (training): 0.357
Accuracy score (validation): 0.334
Learning rate:  1
Accuracy score (training): 0.280
Accuracy score (validation): 0.265


In [22]:
# Now we can evaluate the classifier by checking its accuracy and creating a confusion matrix. Let's create a new classifier and specify the best learning rate we discovered.

gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_val)

print("Confusion Matrix:")
print(confusion_matrix(y_val, predictions))

print("Classification Report")
print(classification_report(y_val, predictions))

Confusion Matrix:
[[  0   1   3   1   0   0   0]
 [  0   5  19  22   1   1   0]
 [  1   5 335 185   2   1   0]
 [  2   7 155 507  35   4   1]
 [  0   0  16 187  80   4   1]
 [  0   0   1  26  10   4   1]
 [  0   0   0   0   1   0   0]]
Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.28      0.10      0.15        48
           5       0.63      0.63      0.63       529
           6       0.55      0.71      0.62       711
           7       0.62      0.28      0.38       288
           8       0.29      0.10      0.14        42
           9       0.00      0.00      0.00         1

    accuracy                           0.57      1624
   macro avg       0.34      0.26      0.28      1624
weighted avg       0.57      0.57      0.55      1624

