# Model Selection

In [93]:
# standard library
import os
import warnings

# data processing
import pandas as pd

# data splitting
from sklearn.model_selection import train_test_split

# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier, AdaBoostClassifier

# model evaluation
from sklearn.metrics import *


Read the data

In [94]:
# name of the dataset; it is reused further down to name the results file
w = 'wine'
# load data/<dataset name>.csv into a DataFrame
wine = pd.read_csv(f'data/{w}.csv')

In [95]:
# show the loaded table as the cell output (long frames are abbreviated with "..." rows)
wine

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality_label,color_red,color_white
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0,1.0,0.0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0,1.0,0.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0,1.0,0.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,1,1.0,0.0
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,1,0.0,1.0
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,0,0.0,1.0
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,1,0.0,1.0
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1,0.0,1.0


Split data

In [96]:
# target variable: the quality label column
y = wine['quality_label']
# features: every column except the target
X = wine.drop(columns=['quality_label'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Choose models


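The models tested here are:
- K-Nearest Neighbours (KNeighborsClassifier)
- Naive Bayes (CategoricalNB) - imported, but currently left out of the comparison because it already fails during training
- Decision trees (DecisionTreeClassifier)
- Gradient Boosting (GradientBoostingClassifier)
- Random Forest (RandomForestClassifier)
- AdaBoost (AdaBoostClassifier)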


In [97]:
# Candidate classifiers whose metrics are compared.
# Each entry is an estimator class (not an instance); it gets instantiated
# when the models are fitted.
# CategoricalNB is left out: it already runs into problems during training.
models = [
    KNeighborsClassifier,
    DecisionTreeClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    AdaBoostClassifier,
]

Evaluate models

In [98]:
def evaluate_model(model, X_test, y_test):
    """Score an already fitted classifier on the test set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Test features, passed to ``model.predict``.
    y_test
        True labels belonging to ``X_test``.

    Returns
    -------
    tuple
        ``(metrics, conf_matrix, classification_rep)``. ``metrics`` is a dict
        with the keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``kappa``; precision, recall and F1 are weighted averages over the
        classes. If prediction or scoring raises an ``Exception``, a
        ``RuntimeWarning`` is issued and all three return values carry 0
        (every entry of ``metrics`` is 0).
    """
    try:
        # make predictions on the test set
        y_pred = model.predict(X_test)

        # evaluate the model
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
            # zero_division=0 like the other metrics, so an undefined per-class
            # recall is scored as 0 without emitting a separate sklearn warning
            'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
            'f1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
            'kappa': cohen_kappa_score(y_pred, y_test),
        }
        conf_matrix = confusion_matrix(y_test, y_pred)
        classification_rep = classification_report(y_test, y_pred, zero_division=0)

    except Exception as exc:
        # Best-effort fallback: keep the comparison running, but say what went
        # wrong instead of silently reporting zeros. Catching Exception (not a
        # bare except) lets KeyboardInterrupt / SystemExit propagate.
        warnings.warn(
            f'Evaluation of {type(model).__name__} failed ({exc!r}); '
            'returning 0 for all metrics',
            RuntimeWarning,
        )
        metrics = dict.fromkeys(('accuracy', 'precision', 'recall', 'f1', 'kappa'), 0)
        conf_matrix = 0
        classification_rep = 0

    return metrics, conf_matrix, classification_rep



In [99]:
# seed for estimators with a random component, so reruns give the same metrics
SEED = 42

eval_metrics = []

for model_cls in models:

    print(model_cls)

    # instantiate with default hyperparameters; keep the class and the instance
    # under different names instead of rebinding the loop variable
    estimator = model_cls()

    # only some estimators accept random_state (e.g. KNeighborsClassifier does not)
    if 'random_state' in estimator.get_params():
        estimator.set_params(random_state=SEED)

    # fit model; y_train is already a 1-D Series, so no ravel() is needed
    # (Series.ravel is deprecated in recent pandas)
    estimator.fit(X_train, y_train)

    eval_dict, conf_matrix, classification_rep = evaluate_model(estimator, X_test, y_test)

    # save evaluation metrics for all models
    eval_metrics.append({
        # label built from the class name rather than by parsing the repr;
        # the 'ClassName()' form of the previously saved results is kept
        'model': f'{type(estimator).__name__}()',
        'accuracy': eval_dict['accuracy'],
        'precision': eval_dict['precision'],
        'recall': eval_dict['recall'],
        'f1': eval_dict['f1'],
        'kappa': eval_dict['kappa'],
    })

    print(classification_rep)
    print(conf_matrix)
    print()

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
              precision    recall  f1-score   support

           0       0.49      0.42      0.45       390
           1       0.66      0.74      0.70       648
           2       0.00      0.00      0.00        26

    accuracy                           0.60      1064
   macro avg       0.38      0.38      0.38      1064
weighted avg       0.58      0.60      0.59      1064

[[162 227   1]
 [169 479   0]
 [  2  24   0]]

<class 'sklearn.tree._classes.DecisionTreeClassifier'>
              precision    recall  f1-score   support

           0       0.56      0.61      0.58       390
           1       0.72      0.67      0.69       648
           2       0.13      0.15      0.14        26

    accuracy                           0.64      1064
   macro avg       0.47      0.48      0.47      1064
weighted avg       0.64      0.64      0.64      1064

[[237 150   3]
 [189 435  24]
 [  1  21   4]]

<class 'sklearn.ensemble

In [100]:
# one row per evaluated model, one column per recorded field
models_baseline = pd.DataFrame(eval_metrics)
models_baseline

Unnamed: 0,model,accuracy,precision,recall,f1,kappa
0,KNeighborsClassifier(),0.602444,0.577936,0.602444,0.587658,0.14946
1,DecisionTreeClassifier(),0.635338,0.643766,0.635338,0.638614,0.278357
2,GradientBoostingClassifier(),0.734023,0.715698,0.734023,0.723979,0.440998
3,RandomForestClassifier(),0.75188,0.731306,0.75188,0.74046,0.47596
4,AdaBoostClassifier(),0.68797,0.682245,0.68797,0.684992,0.365757


Save results

In [101]:
# output folder for the baseline metrics (named results_dir so the builtin
# dir() is not shadowed)
results_dir = 'results/model_selection/'

# create the folder on a fresh checkout; to_csv does not create missing directories
os.makedirs(results_dir, exist_ok=True)

# one CSV per dataset, named after the dataset; the row index is not written
models_baseline.to_csv(results_dir + w + '.csv', index=False)