In [31]:
# import important libraries

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
import xgboost as XGB

In [4]:
# read wine data from url

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
wine_data = pd.read_csv(url,sep=";")
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
# if the quality > 5 then it is good wine

wine_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [6]:
# lambda function to convert quality into binary class

wine_data['quality'] = wine_data['quality'].apply(lambda x : 1 if x > 7 else 0)

In [7]:
# check the final data

wine_data.head(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0


In [8]:
# Let's create x and y

X = wine_data.drop('quality',axis = 1)
y = wine_data['quality']

In [10]:
# Let's standrarize the data

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
# let's check the scaled data

X_scaled

array([[-0.52835961,  0.96187667, -1.39147228, ...,  1.28864292,
        -0.57920652, -0.96024611],
       [-0.29854743,  1.96744245, -1.39147228, ..., -0.7199333 ,
         0.1289504 , -0.58477711],
       [-0.29854743,  1.29706527, -1.18607043, ..., -0.33117661,
        -0.04808883, -0.58477711],
       ...,
       [-1.1603431 , -0.09955388, -0.72391627, ...,  0.70550789,
         0.54204194,  0.54162988],
       [-1.39015528,  0.65462046, -0.77526673, ...,  1.6773996 ,
         0.30598963, -0.20930812],
       [-1.33270223, -1.21684919,  1.02199944, ...,  0.51112954,
         0.01092425,  0.54162988]])

In [13]:
# Let's split the train and test split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state = 35)

In [21]:
xgb_classifier = XGB.XGBClassifier(objective="binary:logistic",eval_metric = 'logloss')
xgb_classifier

In [22]:
# model fit

xgb_classifier.fit(x_train,y_train)

In [24]:
# do the prediction

xgb_classifier.predict(x_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [25]:
# accuracy score

accuracy_score(y_test,xgb_classifier.predict(x_test))

0.984375

In [27]:
# classification report

print(f"Classification report :{classification_report(y_test,xgb_classifier.predict(x_test))}")

Classification report :              precision    recall  f1-score   support

           0       0.98      1.00      0.99       314
           1       1.00      0.17      0.29         6

    accuracy                           0.98       320
   macro avg       0.99      0.58      0.64       320
weighted avg       0.98      0.98      0.98       320



In [29]:
# confusion matrix

confusion_matrix(y_test,xgb_classifier.predict(x_test))

array([[314,   0],
       [  5,   1]])

In [30]:
# hyperparamanter tuning

param_grid = {
    "n_estimator" : [50, 100,150 ,200],
    "learning_rate" : [0.01, 0.1, 0.2],
    "max_depth" : [3,7, 10, 15, 20],
    "subsample" : [0.7, 0.8, 1],
    "colsample_bytree" : [0.7, 0.8, 1]
}

In [34]:
# now start hyperparameter tuning

grid_search = GridSearchCV(XGB.XGBClassifier(objective="binary:logistic",eval_metric = 'logloss'), param_grid, cv = 3, n_jobs = 1, verbose = 2 )
grid_search.fit(x_train,y_train)

Fitting 3 folds for each of 540 candidates, totalling 1620 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimator=50, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimator=50, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimator=50, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimator=50, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimator=50, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimator=50, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimator=50, subsample=1; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=3, n_estimator=50, subsample=1; total time=   0.1s
[CV] END colsample_bytree=0.

In [36]:
# get the best paramaneter to tune the model

grid_search.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.1,
 'max_depth': 3,
 'n_estimator': 50,
 'subsample': 0.8}

In [38]:
# retrain with tuned model

xgb_classifier_post_hyperparameter = XGB.XGBClassifier(**grid_search.best_params_,objective="binary:logistic",eval_metric = 'logloss')
xgb_classifier_post_hyperparameter.fit(x_train,y_train)

In [39]:
# accuracy score

accuracy_score(y_test,xgb_classifier_post_hyperparameter.predict(x_test))

0.98125

In [40]:
# classification report

print(f"Classification report :{classification_report(y_test,xgb_classifier_post_hyperparameter.predict(x_test))}")

Classification report :              precision    recall  f1-score   support

           0       0.98      1.00      0.99       314
           1       0.00      0.00      0.00         6

    accuracy                           0.98       320
   macro avg       0.49      0.50      0.50       320
weighted avg       0.96      0.98      0.97       320



In [41]:
# confusion matrix

confusion_matrix(y_test,xgb_classifier_post_hyperparameter.predict(x_test))

array([[314,   0],
       [  6,   0]])