# Model Selection

In [10]:
# data processing
import pandas as pd

# data splitting
from sklearn.model_selection import train_test_split

# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier, AdaBoostClassifier

# model evaluation
from sklearn.metrics import *


Read the data

In [11]:
# read the data
w = 'white_wine'
wine = pd.read_csv('data/' + w + '.csv')

In [12]:
wine

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality_label
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,1
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,1
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,1
4,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.99490,3.18,0.47,9.6,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3956,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,1
3957,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,0
3958,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,1
3959,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1


Split data

In [13]:
X = wine.drop(['quality_label'], axis=1) # features
y = wine['quality_label']  # target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Choose models


The tested here are:
- K-Nearest Neighbours (KNeighborsClassifier)
- Naive Bayes (CategoricalNB)
- Decision trees (DecisionTreeClassifier)
- Gradient Boosting (GradientBoostingClassifier)
- Random Forest (RandomForestClassifier)


In [14]:
# check metrics for multiple models
models = [KNeighborsClassifier,
          # CategoricalNB, - here there are problems in the training already
          DecisionTreeClassifier,
          GradientBoostingClassifier,
          RandomForestClassifier,
          AdaBoostClassifier]

Evaluate models

In [15]:
def evaluate_model(model, X_test, y_test):
# just runs a model and outputs it's evaluation metrics

    try: # if there is no errors

        # make predictions on the test set
        y_pred = model.predict(X_test)

        # evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
        kappa = cohen_kappa_score(y_pred, y_test)
        conf_matrix = confusion_matrix(y_test, y_pred)
        classification_rep = classification_report(y_test, y_pred, zero_division=0)
        
    except: # if there are errors, return 0 for evaluation
        accuracy = 0
        precision = 0
        recall = 0
        f1 = 0
        kappa = 0
        conf_matrix = 0
        classification_rep = 0
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1' : f1,
        'kappa': kappa
        }, conf_matrix, classification_rep



In [16]:
eval_metrics = []

for model in models:
    
    print(model)

    model = model()

    # fit model
    model.fit(X_train, y_train.ravel())

    eval_dict, conf_matrix, classification_rep  = evaluate_model(model, X_test, y_test)

    # save evaluation metrics for all models
    eval_metrics.append({
        'model' : str(model).split('.')[-1].strip("'>"),
        'accuracy': eval_dict['accuracy'],
        'precision': eval_dict['precision'],
        'recall': eval_dict['recall'],
        'f1': eval_dict['f1'],
        'kappa': eval_dict['kappa']
        })

    print(classification_rep)
    print(conf_matrix)
    print()

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
              precision    recall  f1-score   support

           0       0.53      0.48      0.50       278
           1       0.69      0.76      0.73       492
           2       0.00      0.00      0.00        23

    accuracy                           0.64       793
   macro avg       0.41      0.41      0.41       793
weighted avg       0.62      0.64      0.63       793

[[133 145   0]
 [117 375   0]
 [  2  21   0]]

<class 'sklearn.tree._classes.DecisionTreeClassifier'>
              precision    recall  f1-score   support

           0       0.59      0.60      0.59       278
           1       0.73      0.71      0.72       492
           2       0.03      0.04      0.04        23

    accuracy                           0.65       793
   macro avg       0.45      0.45      0.45       793
weighted avg       0.66      0.65      0.65       793

[[166 108   4]
 [116 348  28]
 [  1  21   1]]

<class 'sklearn.ensemble

In [17]:
models_baseline = pd.DataFrame.from_dict(eval_metrics)
models_baseline

Unnamed: 0,model,accuracy,precision,recall,f1,kappa
0,KNeighborsClassifier(),0.640605,0.40698,0.413537,0.409309,0.227653
1,DecisionTreeClassifier(),0.649433,0.448812,0.449306,0.448594,0.29955
2,GradientBoostingClassifier(),0.722573,0.474545,0.4727,0.471703,0.402389
3,RandomForestClassifier(),0.732661,0.644784,0.499237,0.507601,0.431874
4,AdaBoostClassifier(),0.663304,0.47861,0.454672,0.457758,0.301139


Save results

In [18]:
dir = 'results/model_selection/'
models_baseline.to_csv(dir + w + '.csv', index=False)