In [33]:
import pandas as pd

# Load the white-wine quality CSV; its fields are separated by ';' rather than ','.
df = pd.read_csv('../data/winequality-white.csv', sep=";")

# Collapse the quality score into five classes: below 5, 5, 6, 7 and above 7 (encoded as 1-5).
def label_map(label):
    """Collapse a raw quality score into one of five class ids.

    The input is converted with int() first. Scores below 5 become 1, the
    scores 5, 6 and 7 become 2, 3 and 4, and scores above 7 become 5.
    """
    score = int(label)
    # Clamp to [4, 8] so that everything under 5 and everything over 7 lands
    # in a shared end bucket, then shift so the buckets are numbered from 1.
    clamped = min(max(score, 4), 8)
    return clamped - 3

# Swap the raw quality scores for their class ids.
new_labels = list(map(label_map, df['quality'].values))
df['quality'] = new_labels

# Target vector is the relabelled column; the feature matrix is every other column.
y = df['quality'].values
X = df.drop(columns=['quality']).values

# Show how many samples fall into each class after relabelling.
df['quality'].value_counts()

3    2198
2    1457
4     880
1     183
5     180
Name: quality, dtype: int64

In [23]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from statistics import mean, stdev


def ML_pipeline_kfold_GridSearchCV(X,y,random_state,n_folds, param_grid, model, print_results = True,
                                   test_size = 0.2, scoring = 'accuracy'):
    """Grid-search `model` with stratified k-fold CV and score it on a held-out test set.

    The data is split once, stratified on `y`, into a part used for tuning and
    a test part. A pipeline of StandardScaler followed by `model` is tuned on
    the first part with GridSearchCV, and the fitted search is then scored on
    the test part.

    Parameters
    ----------
    X, y : feature matrix and target labels; `y` also drives the stratification
        of the test split and of the CV folds.
    random_state : seed shared by the test split and the fold shuffling.
    n_folds : number of stratified CV folds.
    param_grid : grid handed to GridSearchCV. Keys address pipeline steps, so
        they carry the step prefix, e.g. 'svc__C'.
    model : unfitted estimator placed after the scaler in the pipeline.
    print_results : when True, print the best CV score, the best parameters
        and the test score.
    test_size : passed to train_test_split; share of the data held out for
        testing (default 0.2).
    scoring : metric used for model selection and for the test score
        (default 'accuracy').

    Returns
    -------
    (grid, test_score) : the fitted GridSearchCV object and its score on the
        test split.
    """
    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    # Scaling lives inside the pipeline so it is re-fitted on the training
    # portion of every fold instead of seeing the validation data.
    pipe = make_pipeline(StandardScaler(), model)

    # `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
    # and removed in 0.24, where supplying it raises a TypeError.
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring=scoring, cv=kf, return_train_score=True)
    grid.fit(X_other, y_other)

    test_score = grid.score(X_test, y_test)

    if print_results:
        print("Results: \n ")
        print(f"Best score: {grid.best_score_}")
        print(f"Best params: {grid.best_params_}")
        print(f"Test score: {test_score}")
        print("\n \n")

    return grid, test_score

    

In [35]:
# Logistic regression L1

# C is the inverse of the regularisation strength, so sweep alpha on a log
# scale and invert each value.
C_range = [1/alpha for alpha in np.logspace(-2,4,num=8)]
param_grid = {'logisticregression__C': C_range}
# saga shuffles the data while fitting; random_state pins that shuffling so the
# search gives the same result on every run.
# NOTE(review): multi_class is deprecated in recent scikit-learn releases -
# confirm against the installed version before dropping it.
model = LogisticRegression(penalty = 'l1', solver = 'saga', max_iter = 10000, multi_class = 'auto', random_state = 3)

ML_pipeline_kfold_GridSearchCV(X, y, 3, 5, param_grid, model)
print("done")

Results: 
 
Best score: 0.5352220520673813
Best params: {'logisticregression__C': 1.9306977288832508}
Test score: 0.5428571428571428

 

done


In [40]:
# Logistic regression elastic net

# Sweep the L1/L2 mixing ratio over [0, 1] together with C = 1/alpha for a
# log-spaced set of alpha values.
l1_ratio_range = np.linspace(0, 1, num=5)
C_range = [1/alpha for alpha in np.logspace(-2,4,num=5)]
param_grid = {
    'logisticregression__C': C_range,
    'logisticregression__l1_ratio': l1_ratio_range
}
# saga shuffles the data while fitting; random_state pins that shuffling so the
# search gives the same result on every run.
# NOTE(review): multi_class is deprecated in recent scikit-learn releases -
# confirm against the installed version before dropping it.
model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=4000, multi_class='multinomial', random_state=3)
ML_pipeline_kfold_GridSearchCV(X, y, 3, 5, param_grid, model)
print("done")

Results: 
 
Best score: 0.535732516590097
Best params: {'logisticregression__C': 0.1, 'logisticregression__l1_ratio': 0.25}
Test score: 0.5448979591836735

 

done


In [25]:
# Random forest

from sklearn.ensemble import RandomForestClassifier

# Search grid: six ensemble sizes crossed with three maximum tree depths.
n_estimators_range = [10, 50, 100, 200, 300, 400]
max_depth_range = [8, 10, 12]
param_grid = dict(
    randomforestclassifier__n_estimators=n_estimators_range,
    randomforestclassifier__max_depth=max_depth_range,
)
model = RandomForestClassifier(random_state=1)

# Seed 3 for the split and fold shuffling, 5 CV folds.
ML_pipeline_kfold_GridSearchCV(
    X, y, random_state=3, n_folds=5, param_grid=param_grid, model=model
)
print("done")

Results: 
 
Best score: 0.6620724859622257
Best params: {'randomforestclassifier__max_depth': 12, 'randomforestclassifier__n_estimators': 300}
Test score: 0.6428571428571429

 

done


In [34]:
# SVM
from sklearn.svm import SVC

# Search grid: four log-spaced gamma values crossed with four C values.
gamma_range = np.logspace(-4, 2, num=4)
c_range = [0.1, 1, 5, 50]
param_grid = dict(svc__gamma=gamma_range, svc__C=c_range)

model = SVC()
# Seed 3 for the split and fold shuffling, 5 CV folds.
ML_pipeline_kfold_GridSearchCV(
    X, y, random_state=3, n_folds=5, param_grid=param_grid, model=model
)
print("done")

Results: 
 
Best score: 0.6322103113833588
Best params: {'svc__C': 1, 'svc__gamma': 1.0}
Test score: 0.639795918367347

 

done


In [31]:
# KNN

from sklearn.neighbors import KNeighborsClassifier

# Search grid: candidate neighbourhood sizes.
neighbors_range = [1, 3, 5, 10, 30]
param_grid = dict(kneighborsclassifier__n_neighbors=neighbors_range)
model = KNeighborsClassifier()
# Seed 3 for the split and fold shuffling, 5 CV folds.
ML_pipeline_kfold_GridSearchCV(
    X, y, random_state=3, n_folds=5, param_grid=param_grid, model=model
)
print("done")

Results: 
 
Best score: 0.6166411434405309
Best params: {'kneighborsclassifier__n_neighbors': 1}
Test score: 0.6173469387755102

 

done
