In [9]:
# Imports
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.preprocessing import LabelEncoder

# Download related imports
import os
import tarfile
import urllib.request

# Matplotlib settings
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# to make this notebook's output identical at every run
np.random.seed(42)

# Figure destination
PROJECT_ROOT_DIR = "."
ASSIGNMENT_ID = "wine"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", ASSIGNMENT_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)


def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=50):
    """
    This function saves the matplotlib figures.
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [10]:
def load_csv_data(file_path: str):
    """
    This function loads CSV (comma separated values) 
    data using the pandas library.

    ARGUMENTS
    ---------
        file_path (str):
            The `.csv` file destination path. 

    RETURNS
    -------
        Pandas `DataFrame` object containing all the data.
    """
    return pd.read_csv(file_path)

## Data Structure Analysis

In [11]:
wine = load_csv_data('datasets/winequality-red.csv')
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [12]:
# Check dataset dimensions
wine.shape

(1599, 12)

In [13]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


# Cleaning the Data

Discover and Visualize the Data to Gain Insights

In [14]:
wine.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


# Model Building

In [38]:
import sklearn
import sklearn.model_selection
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

In [15]:
# Making binary classificaion for the response variable.
# Dividing wine as good and bad by giving the limit for the quality

bins = (2, 6.5, 8)
group_names = ['bad', 'good']
wine['quality'] = pd.cut(wine['quality'], bins = bins, labels = group_names)

In [17]:
label_quality = LabelEncoder()

# binary category transformation
wine['quality'] = label_quality.fit_transform(wine['quality'])

In [18]:
wine['quality'].value_counts()

0    1382
1     217
Name: quality, dtype: int64

In [20]:
X = wine.drop('quality', axis = 1)
y = wine['quality']

In [36]:
# creating the training and testing split
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.2, random_state = 42)

In [43]:
# Building pipelines of standard scalers and various classification models.


pipeline_sgd=Pipeline([("sgd_scaler", StandardScaler()),
                     ("lr_classifier", SGDClassifier(penalty=None))])

pipeline_svc=Pipeline([("svc_scaler", StandardScaler()),
                     ("dt_classifier", SVC(C=1.2, gamma=0.9, kernel='rbf'))])

pipeline_rf=Pipeline([("rf_scaler", StandardScaler()),
                     ("rf_classifier", RandomForestClassifier(n_estimators=200))])

pipeline_xgb=Pipeline([("xgb_scaler", StandardScaler()),
                     ("rf_classifier", XGBClassifier())])

In [44]:
# Creating a list of all the pipelines
pipelines = [pipeline_sgd, pipeline_svc, pipeline_rf, pipeline_xgb]

# Dictionary of pipelines and model types for ease of reference
pipe_dict = {0: "SGDClassifier", 1: "SVC", 2: "RandomForestClassifier", 3: "XGBRegressor"}

In [45]:
for model in pipelines:
    model.fit(X_train, y_train)

In [42]:
for pipeline in pipelines:
    y_pred = pipeline.predict(X_test)
    print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.92      0.87      0.90       273
           1       0.44      0.57      0.50        47

    accuracy                           0.83       320
   macro avg       0.68      0.72      0.70       320
weighted avg       0.85      0.83      0.84       320

              precision    recall  f1-score   support

           0       0.88      0.98      0.93       273
           1       0.71      0.26      0.37        47

    accuracy                           0.88       320
   macro avg       0.80      0.62      0.65       320
weighted avg       0.86      0.88      0.85       320

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       273
           1       0.71      0.53      0.61        47

    accuracy                           0.90       320
   macro avg       0.82      0.75      0.78       320
weighted avg       0.89      0.90      0.89       320

              preci

**Final thoughts:** XGB Regressor appears to be the best performing model for this dataset also.