In [87]:
import altair as alt
import pandas as pd
import numpy as np
import sklearn
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier


1. loading and filtering of data

In [140]:
data_red = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=";").assign(type = "red")
data_white = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep=";").assign(type = "white")
data_wine = pd.concat([data_red, data_white])
#data_wine = data_wine[(data_wine['quality'] > 3) & (data_wine['quality'] < 8)] 
data_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


2. split the data

In [159]:
X = data_wine.drop('quality', axis=1)
y = data_wine['quality']
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

X_red = data_red.drop(['quality', 'type'], axis=1)
y_red = data_red['quality']
X_train_red_raw, X_test_red_raw, y_train_red, y_test_red = train_test_split(X_red, y_red, test_size=0.25, random_state=123)

X_white = data_white.drop(['quality', 'type'], axis=1)
y_white = data_white['quality']
X_train_white_raw, X_test_white_raw, y_train_white, y_test_white = train_test_split(X_white, y_white, test_size=0.25, random_state=123)

3. preprocessing of data

In [164]:
X_train_raw.fillna(X_train_raw.mean(), inplace=True)
X_train_red_raw.fillna(X_train_red_raw.mean(), inplace=True)
X_train_white_raw.fillna(X_train_white_raw.mean(), inplace=True)

  X_train_raw.fillna(X_train_raw.mean(), inplace=True)


In [176]:
features_categorical = [
    "type"
]
features_numerical = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol'
]
ct = make_column_transformer(
    (StandardScaler(), features_numerical),
    (OneHotEncoder(drop="first"), features_categorical)
)
X_train = ct.fit_transform(X_train_raw)
column_names = (
    features_numerical   
    + ct.named_transformers_["onehotencoder"].get_feature_names_out().tolist()
)
pd.DataFrame(X_train, columns = column_names)

ct = make_column_transformer(
    (StandardScaler(), features_numerical),
)
X_train_red = ct.fit_transform(X_train_red_raw)

ct = make_column_transformer(
    (StandardScaler(), features_numerical),
)
X_train_white = ct.fit_transform(X_train_white_raw)

TODO: performs a summary of the data set that is relevant for exploratory data analysis related to the planned analysis 

TODO: creates a visualization of the dataset that is relevant for exploratory data analysis related to the planned analysis

4. model

In [170]:
classifiers = {
        "knn":KNeighborsClassifier(weights="distance"),
        "rfc":RandomForestClassifier(),
        "svc":SVC(),
        "dc":DecisionTreeClassifier(),
        "etc":ExtraTreesClassifier(),
}
param_grids = {
        "knn": {"n_neighbors": range(1, 25), "algorithm": ["ball_tree", "kd_tree", "brute"]},
        "rfc": {"max_depth": range(1, 25), "n_estimators": [1, 10, 100]},
        "svc": {"C": [0.01, 0.1, 1, 10], "gamma": [0.01, 0.1, 1, 10, 100]},
        "dc": {"max_depth": range(1, 200, 1), "criterion": ["gini", "entropy"]},
        "etc": {"n_estimators": [100, 300, 500, 800, 1200], "class_weight": ["balanced", "balanced_subsample"]},
} 
def train_all_classifiers(X_train, y_train):
    models = {}
    for classifier in classifiers.keys():
        tune = GridSearchCV(
            classifiers[classifier],
            param_grids[classifier],
            cv = 5,
            n_jobs = -1,
            return_train_score=True,
        )
        models[classifier] = tune.fit(X_train, y_train)
    return models

def get_plot(models):
    plots = {}
    for classifier in classifiers.keys():
        accuracies = pd.DataFrame(models[classifier].cv_results_)
        plots[classifier] = (alt.Chart(accuracies, title="Accuracy vs hyperparameter for Model "+classifier)
                .transform_fold(["mean_test_score", "mean_train_score"])
                .mark_line()
                .encode(
                        x=alt.X(
                            "param_"+list(param_grids[classifier].keys())[0], 
                            title=list(param_grids[classifier].keys())[0],
                            scale=alt.Scale(zero=False),
                        ),
                        y=alt.Y(
                            "value:Q", 
                            title="accuracy",
                            scale=alt.Scale(zero=False)
                        ),
                        strokeDash='key:N',
                        color="param_"+list(param_grids[classifier].keys())[1]+":N",
                    )
        )
    plot = (
        alt.vconcat(plots["knn"],plots["dc"]).resolve_scale(color='independent')|
        alt.vconcat(plots["rfc"],plots["svc"]).resolve_scale(color='independent')|
        plots["etc"]
    ).resolve_scale(color='independent')
    return plot

In [171]:
models_all = train_all_classifiers(X_train, y_train)
plot = get_plots(models_all)
plot

In [177]:
models_all = train_all_classifiers(X_train_red, y_train_red)
plot = get_plots(models_all)
plot

In [178]:
models_all = train_all_classifiers(X_train_white, y_train_white)
plot = get_plots(models_all)
plot

