# Get initial data

In [40]:
# imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *

from sklearn.linear_model import *
from sklearn.neighbors import *
from sklearn.svm import *
from sklearn.naive_bayes import *
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.discriminant_analysis import *

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, cohen_kappa_score
from sklearn.model_selection import *

In [41]:
# read the data
red_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
white_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')

# Remove duplicates

In [42]:
# remove lines that have all values duplicated
red_wine.drop_duplicates(inplace=True, ignore_index=True)
white_wine.drop_duplicates(inplace=True, ignore_index=True)

# Add missing information and combine data

In [43]:
# add quality_label column
for wine_type in [red_wine, white_wine]:
    # we are creating a new column called "quality_label", we define a range and associate that range with a label
    wine_type['quality_label'] = wine_type['quality'].apply(lambda value: 'low'
    if value <= 5 else 'medium'
    if value <= 7 else 'high')

    # here we are transforming these labels into categrical data type (specific to pandas) instead of simple string
    wine_type['quality_label'] = pd.Categorical(wine_type['quality_label'],
    categories=['low', 'medium', 'high'])

In [44]:
# create a df with all wines

# add color of wine as parameter
red_wine['color'] = 'red'
white_wine['color'] = 'white'

# combine the wine dfs
wine = pd.concat([red_wine, white_wine], ignore_index=True)

# here we are transforming these labels into categrical data type (specific to pandas) instead of simple string
wine['color'] = pd.Categorical(wine['color'],
categories=['red', 'white'])

In [45]:
# use the rename method to change all columns names lowercase and add an underscore if they are made of 2 words

def adjust_column_names(df):
    df.rename(str.lower, axis='columns', inplace=True)  # make the names lowercase
    df.columns = df.columns.str.replace(' ', '_')     # replace space with underscore in column names
    return df

wine = adjust_column_names(wine)
red_wine = adjust_column_names(red_wine)
white_wine = adjust_column_names(white_wine)

red_wine

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality,quality_label,color
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,low,red
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,low,red
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,low,red
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,medium,red
4,7.4,0.660,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,low,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,medium,red
1355,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,low,red
1356,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,medium,red
1357,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,low,red


# Define needed functions (data splitting, model training and evaluating)

In [46]:
# split the data into features (X) and target variable (y)
def split_wine_data(wine_data_ml):
    X = wine_data_ml.drop(['quality_label'], axis=1) # features
    y = wine_data_ml['quality_label']  # target variable

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    return X_train, X_test, y_train, y_test

# Prepare data for model training

In [47]:
# prepare wine data for model training

# drop color and quality column
wine_ml = wine.drop(['quality'], axis=1)

# replace the quality labels with numerical values
quality_codes = {'low' : 0, 'medium' : 1, 'high' : 2}
wine_ml['quality_label'].replace(quality_codes, inplace=True)

In [48]:
# One Hot Encoding for colors
# create an instance of OneHotEncoder
oh_enc = OneHotEncoder(categories=[['red','white']])

# fit and transform the 'color' column
encoded_color = oh_enc.fit_transform(wine_ml[['color']])

# convert the one-hot encoded data to a DataFrame
encoded_color_df = pd.DataFrame(encoded_color.toarray(), columns=oh_enc.get_feature_names_out(['color']))

# concatenate the one-hot encoded DataFrame with 'wine_ml'
wine_ml = pd.concat([wine_ml, encoded_color_df], axis=1)

# drop the original 'color' column from 'wine_ml'
wine_ml = wine_ml.drop('color', axis=1)

wine_ml

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality_label,color_red,color_white
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0,1.0,0.0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0,1.0,0.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0,1.0,0.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,1,1.0,0.0
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,1,0.0,1.0
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,0,0.0,1.0
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,1,0.0,1.0
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1,0.0,1.0


# Tuning Hyperparameters of Machine Learning Model

### Baseline

In [49]:
X_train, X_test, y_train, y_test = split_wine_data(wine_ml)

# initialize the model
forest = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
forest.fit(X_train, y_train.ravel())

# make predictions on the test set
y_pred = forest.predict(X_test)

# evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Accuracy: 0.7190
Confusion Matrix:
 [[272 115   0]
 [153 492   0]
 [  1  30   1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.70      0.67       387
           1       0.77      0.76      0.77       645
           2       1.00      0.03      0.06        32

    accuracy                           0.72      1064
   macro avg       0.80      0.50      0.50      1064
weighted avg       0.73      0.72      0.71      1064



### Parameters for the testing

In [50]:
# define the parameters for the hyperparameter grid
n_estimators_range = np.arange(10,210,10)
criterion_choice = ['gini', 'entropy', 'log_loss']
max_features_choice = ['sqrt', 'log2', None]
class_weight_choice = ['balanced', 'balanced_subsample']

# define the hyperparameter grid
param_grid = dict(criterion=criterion_choice, n_estimators=n_estimators_range, max_features=max_features_choice,
                  class_weight=class_weight_choice)

# define the model
rf = RandomForestClassifier()

### RandomizedSearchCV tests n combinations of parameters provided

In [51]:
# create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=20, scoring='f1', cv=5, random_state=0)

# get the split data
X_train, X_test, y_train, y_test = split_wine_data(wine_ml)

# fit the object to the data
random_search.fit(X_train, y_train)

# Get the best hyperparameter combination
best_params = random_search.best_params_

Traceback (most recent call last):
  File "/Users/michalpasternak/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/michalpasternak/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/michalpasternak/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/michalpasternak/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^

In [52]:
best_params = random_search.best_params_
best_params

{'n_estimators': 70,
 'max_features': 'sqrt',
 'criterion': 'gini',
 'class_weight': 'balanced'}

In [53]:
print("The best parameters are %s with a score of %0.4f"
      % (random_search.best_params_, random_search.best_score_))

The best parameters are {'n_estimators': 70, 'max_features': 'sqrt', 'criterion': 'gini', 'class_weight': 'balanced'} with a score of nan


In [54]:
# export the grid search parameters and their resulting scores into a dataframe
random_search_results = pd.concat([pd.DataFrame(random_search.cv_results_["params"]),pd.DataFrame(random_search.cv_results_["mean_test_score"],
                                                                                columns=["accuracy"])],axis=1)

In [55]:
random_search_results

Unnamed: 0,n_estimators,max_features,criterion,class_weight,accuracy
0,70,sqrt,gini,balanced,
1,40,,gini,balanced_subsample,
2,70,,entropy,balanced,
3,170,log2,gini,balanced_subsample,
4,110,log2,entropy,balanced,
5,170,,entropy,balanced,
6,130,,gini,balanced_subsample,
7,10,sqrt,entropy,balanced,
8,180,log2,gini,balanced,
9,10,sqrt,log_loss,balanced,


In [56]:
random_search_results_df = pd.DataFrame(random_search.cv_results_)
random_search_results_df.sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_criterion,param_class_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.261847,0.002181,0.010024,0.001355,70,sqrt,gini,balanced,"{'n_estimators': 70, 'max_features': 'sqrt', '...",,,,,,,,1
17,0.823978,0.001776,0.021467,0.000213,190,log2,log_loss,balanced,"{'n_estimators': 190, 'max_features': 'log2', ...",,,,,,,,1
16,0.739103,0.000476,0.019691,0.000218,170,sqrt,log_loss,balanced,"{'n_estimators': 170, 'max_features': 'sqrt', ...",,,,,,,,1
15,0.376957,0.001349,0.010999,0.000127,90,log2,gini,balanced_subsample,"{'n_estimators': 90, 'max_features': 'log2', '...",,,,,,,,1
14,0.21972,0.001595,0.006757,0.000147,50,log2,log_loss,balanced,"{'n_estimators': 50, 'max_features': 'log2', '...",,,,,,,,1
13,1.601463,0.006758,0.012222,7.7e-05,110,,log_loss,balanced_subsample,"{'n_estimators': 110, 'max_features': None, 'c...",,,,,,,,1
12,1.552896,0.009797,0.012138,0.000138,110,,log_loss,balanced,"{'n_estimators': 110, 'max_features': None, 'c...",,,,,,,,1
11,2.319228,0.013612,0.017239,0.000151,160,,log_loss,balanced_subsample,"{'n_estimators': 160, 'max_features': None, 'c...",,,,,,,,1
10,0.262328,0.001837,0.00874,6e-05,70,log2,gini,balanced,"{'n_estimators': 70, 'max_features': 'log2', '...",,,,,,,,1
9,0.044501,0.000685,0.00216,0.000107,10,sqrt,log_loss,balanced,"{'n_estimators': 10, 'max_features': 'sqrt', '...",,,,,,,,1


In [57]:
best_params

{'n_estimators': 70,
 'max_features': 'sqrt',
 'criterion': 'gini',
 'class_weight': 'balanced'}

In [58]:
X_train, X_test, y_train, y_test = split_wine_data(wine_ml)

# initialize the model
forest = RandomForestClassifier(n_estimators=best_params['n_estimators'],
                                max_features=best_params['max_features'],
                                criterion=best_params['criterion'],
                                class_weight=best_params['class_weight'],
                                random_state=0)

forest.fit(X_train, y_train.ravel())

# make predictions on the test set
y_pred = forest.predict(X_test)

# evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Accuracy: 0.7566
Confusion Matrix:
 [[259 128   0]
 [ 99 546   0]
 [  1  31   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.67      0.69       387
           1       0.77      0.85      0.81       645
           2       0.00      0.00      0.00        32

    accuracy                           0.76      1064
   macro avg       0.50      0.51      0.50      1064
weighted avg       0.73      0.76      0.74      1064



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
kappa = cohen_kappa_score(y_pred, y_test)
print("Cohen-Kappa score:", kappa)

Cohen-Kappa score: 0.48819362674996947
