In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier

! pip install altair==5.0.0rc1  #I was getting the same schema error for hconcating the sample plots, this should fix it
import altair as alt

Collecting altair==5.0.0rc1
  Using cached altair-5.0.0rc1-py3-none-any.whl (709 kB)
Installing collected packages: altair
  Attempting uninstall: altair
    Found existing installation: altair 4.2.0
    Uninstalling altair-4.2.0:
      Successfully uninstalled altair-4.2.0
Successfully installed altair-5.0.0rc1


1. loading and filtering of data

In [2]:
data_red = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=";").assign(type = "red")
data_white = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep=";").assign(type = "white")
data_wine = pd.concat([data_red, data_white])
#data_wine = data_wine[(data_wine['quality'] > 3) & (data_wine['quality'] < 8)] 
data_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


Summary of the data:

In [3]:
data_wine.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,7.215307,0.339666,0.318633,5.443235,0.056034,30.525319,115.744574,0.994697,3.218501,0.531268,10.491801,5.818378
std,1.296434,0.164636,0.145318,4.757804,0.035034,17.7494,56.521855,0.002999,0.160787,0.148806,1.192712,0.873255
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


In [4]:
data_wine.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
type                    0
dtype: int64

In [5]:
data_wine.nunique()

fixed acidity           106
volatile acidity        187
citric acid              89
residual sugar          316
chlorides               214
free sulfur dioxide     135
total sulfur dioxide    276
density                 998
pH                      108
sulphates               111
alcohol                 111
quality                   7
type                      2
dtype: int64

In [6]:
scatter_plots = []
sample_size = 450
sample_opacity = 0.7

data_sample = data_wine.sample(sample_size, random_state=123)

red_count   = data_sample[data_sample["type"] == "red"].shape[0]      #for subtitle
white_count = data_sample[data_sample["type"] == "white"].shape[0]    #for subtitle

for property_name in data_sample.columns[:-2]:
    scatter_plot = (alt.Chart(data_sample)
                    .mark_circle(opacity=sample_opacity)
                    .encode(
                        x=alt.X(f"quality:Q", scale=alt.Scale(domain=(1, 10))),
                        y=alt.Y(f"{property_name}:Q", scale=alt.Scale(zero=False)),
                        color="type:N")
                    .properties(
                        width=250,
                        height=250,
                        title=f"{property_name.title()} vs Quality, by Wine Type"))
    scatter_plots.append(scatter_plot)
    
scatter_plots.append(alt.Chart(data_sample)
                     .mark_circle(opacity=sample_opacity)
                     .encode(
                         x=alt.X(f"quality:Q", scale=alt.Scale(domain=[1,10])),
                         y=alt.Y("type"),
                         color="type:N")
                     .properties(
                         width=250,
                         height=250,
                         title=f"Type vs Quality, by Wine Type"))
 
    
wine_plot = alt.VConcatChart(vconcat=(alt.hconcat(*scatter_plots[0:4]), 
                                      alt.hconcat(*scatter_plots[4:8]), 
                                      alt.hconcat(*scatter_plots[8:])), 
                             title=alt.TitleParams(
                                 "Plot of All Wine Properties vs Quality", 
                                 fontSize=30,
                                 subtitle="(sample size "+str(sample_size)+": "+str(red_count)+" red / "+str(white_count)+" white)", anchor="middle", dy=-10))

wine_plot

2. split the data

In [7]:
# splits for concatenated wine data
X = data_wine.drop('quality', axis=1)
y = data_wine['quality']
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

# splits for red wine data
X_red = data_red.drop(['quality', 'type'], axis=1)
y_red = data_red['quality']
X_train_red_raw, X_test_red_raw, y_train_red, y_test_red = train_test_split(X_red, y_red, test_size=0.25, random_state=123)

#splits for white wine data
X_white = data_white.drop(['quality', 'type'], axis=1)
y_white = data_white['quality']
X_train_white_raw, X_test_white_raw, y_train_white, y_test_white = train_test_split(X_white, y_white, test_size=0.25, random_state=123)

3. preprocessing of data

In [8]:
# X_train_raw.fillna(X_train_raw.mean(), inplace=True)
# X_train_red_raw.fillna(X_train_red_raw.mean(), inplace=True)
# X_train_white_raw.fillna(X_train_white_raw.mean(), inplace=True) 

#we dont need this since there are no observations with NA values

In [9]:
features_categorical = [
    "type"
]
features_numerical = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol'
]

#wine data
ct = make_column_transformer(
    (StandardScaler(), features_numerical),
    (OneHotEncoder(drop="first"), features_categorical)
)
X_train = ct.fit_transform(X_train_raw)

# column_names = (
#     features_numerical   
#     + ct.named_transformers_["onehotencoder"].get_feature_names_out().tolist()
# )
# test = pd.DataFrame(X_train, columns = column_names)

# looks like these aren't needed


#red and white wine data
ct = make_column_transformer(
    (StandardScaler(), features_numerical),
)
X_train_red = ct.fit_transform(X_train_red_raw)

X_train_white = ct.fit_transform(X_train_white_raw)

4. model

In [12]:
classifiers = {
        "knn":KNeighborsClassifier(weights="distance")#,
        # "rfc":RandomForestClassifier(),
        # "svc":SVC(),
        # "dc":DecisionTreeClassifier(),
        # "etc":ExtraTreesClassifier(),
}
param_grids = {
        "knn": {"n_neighbors": range(1, 25), "algorithm": ["ball_tree", "kd_tree", "brute"]}#,
        # "rfc": {"max_depth": range(1, 25), "n_estimators": [1, 10, 100]},
        # "svc": {"C": [0.01, 0.1, 1, 10], "gamma": [0.01, 0.1, 1, 10, 100]},
        # "dc": {"max_depth": range(1, 200, 1), "criterion": ["gini", "entropy"]},
        # "etc": {"n_estimators": [100, 300, 500, 800, 1200], "class_weight": ["balanced", "balanced_subsample"]},
} 
def train_all_classifiers(X_train, y_train):
    models = {}
    for classifier in classifiers.keys():
        tune = GridSearchCV(
            classifiers[classifier],
            param_grids[classifier],
            cv = 5,
            n_jobs = -1,
            return_train_score=True,
        )
        models[classifier] = tune.fit(X_train, y_train)
    return models

def get_plots(models):
    plots = {}
    for classifier in classifiers.keys():
        accuracies = pd.DataFrame(models[classifier].cv_results_)
        plots[classifier] = (alt.Chart(accuracies, title="Accuracy vs Hyperparameter for Model "+classifier)
                .transform_fold(["mean_test_score", "mean_train_score"])
                .mark_line(opacity=0.8)
                .encode(
                        x=alt.X(
                            "param_"+list(param_grids[classifier].keys())[0], 
                            title=list(param_grids[classifier].keys())[0],
                            scale=alt.Scale(zero=False),
                        ),
                        y=alt.Y(
                            "value:Q", 
                            title="Accuracy",
                            scale=alt.Scale(zero=False)
                        ),
                        strokeDash='key:N',
                        color="param_"+list(param_grids[classifier].keys())[1]+":N")
                .properties(
                    width=400,
                    height=400))
    # plot = (
    #     alt.vconcat(plots["knn"],plots["dc"]).resolve_scale(color='independent')|
    #     alt.vconcat(plots["rfc"],plots["svc"]).resolve_scale(color='independent')|
    #     plots["etc"]
    # ).resolve_scale(color='independent')
    plot = plot["knn"]
    return plot

In [13]:
models_all_wine = train_all_classifiers(X_train, y_train)
plot_wine = get_plots(models_all_wine)
plot_wine

UnboundLocalError: local variable 'plot' referenced before assignment

In [None]:
models_all_red = train_all_classifiers(X_train_red, y_train_red)
plot_red = get_plots(models_all_red)
plot_red

In [None]:
models_all_white = train_all_classifiers(X_train_white, y_train_white)
plot_white = get_plots(models_all_white)
plot_white