In [21]:
import altair as alt
import pandas as pd
import numpy as np
import sklearn
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

1. loading and filtering of data

In [22]:
data_red = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", sep=";").assign(type = "red")
data_white = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep=";").assign(type = "white")
data_wine = pd.concat([data_red, data_white])
#data_wine = data_wine[(data_wine['quality'] > 3) & (data_wine['quality'] < 8)] 

2. split the data

In [23]:
wine_train, wine_test = train_test_split(data_wine, test_size=0.25, random_state=123)
X_train_raw = wine_train.drop('quality', axis=1)
y_train = wine_train['quality']
X_test_raw = wine_test.drop('quality', axis=1)
y_test = wine_test['quality']

3. preprocessing of data

In [24]:
features_categorical = [
    "type"
]
features_numerical = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol'
]
ct = make_column_transformer(
    (StandardScaler(), features_numerical),
    (OneHotEncoder(drop="first"), features_categorical)
)
X_train = ct.fit_transform(X_train_raw)
column_names = (
    features_numerical   
    + ct.named_transformers_["onehotencoder"].get_feature_names_out().tolist()
)
pd.DataFrame(X_train, columns = column_names)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type_white
0,0.453635,0.319504,-0.408391,0.923677,0.094709,0.122813,1.132680,0.270866,-1.112001,-0.737375,0.333868,1.0
1,-0.469247,2.364443,-1.578105,0.005206,1.761534,-1.212652,-1.739088,0.618050,0.820365,-0.260049,0.250535,0.0
2,-1.084502,0.136376,1.242969,-0.934140,2.048917,1.792144,1.080148,-0.846736,-0.737995,-0.532807,-0.666130,1.0
3,1.607238,0.136376,0.486095,-0.829768,0.698215,0.567968,-1.073678,0.608130,1.568378,1.376496,0.333868,0.0
4,0.222915,-0.962397,-0.064358,1.299415,-0.135198,-0.377987,-0.390758,0.766843,-0.176985,-0.600996,-0.832796,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4867,-0.315433,0.685762,-0.270778,4.294883,-0.709965,-0.211053,0.099544,2.734219,-0.987333,-0.737375,0.083868,1.0
4868,0.761263,-0.474054,0.073255,-0.600150,-0.077721,-1.101364,-0.793506,-0.218498,-2.047018,-0.260049,-0.832796,1.0
4869,-0.853781,1.540363,-2.128558,-0.704522,0.008494,-1.435230,-1.809131,0.012958,1.880050,0.217277,0.750533,0.0
4870,0.607449,-0.718226,-0.270778,1.779525,-0.077721,2.905032,1.412853,1.130559,-0.488657,-0.532807,-0.582796,1.0


TODO: performs a summary of the data set that is relevant for exploratory data analysis related to the planned analysis 

TODO: creates a visualization of the dataset that is relevant for exploratory data analysis related to the planned analysis

4. model

In [53]:
classifiers = {
    "knn":KNeighborsClassifier(),
    "lr":LogisticRegression(),
    "rfc":RandomForestClassifier(),
    "svc":SVC(),
}
param_grids = {
    "knn": {"n_neighbors": range(1, 50), "weights": ["uniform", "distance"]},
    "lr": {"C": [0.001, 0.01, 0.1, 1, 10], "penalty": ["l1", "l2"]},
    "rfc": {"max_depth": range(1, 25), "n_estimators": [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
    "svc": {"C": [0.01, 0.1, 1, 10], "gamma": [0.01, 0.1, 1, 10]}
}
models = {}
for classifier in ["knn", "lr", "rfc", "svc"]:
    tune = GridSearchCV(
        classifiers[classifier],
        param_grids[classifier],
        cv = 5,
        n_jobs = -1,
        return_train_score=True,
    )
    models[classifier] = tune.fit(X_train, y_train)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\haoqi\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\haoqi\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\haoqi\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.538999        

In [72]:
params = {
    "knn": ["param_n_neighbors", "param_weights:N"],
    "lr": ["param_C:N", "param_penalty:N"],
    "rfc": ["param_max_depth", "param_n_estimators:N"],
    "svc": ["param_C:N", "param_gamma:N"]
}
plots = {}
for classifier in ["knn", "lr", "rfc", "svc"]:
    accuracies = pd.DataFrame(models[classifier].cv_results_)
    plots[classifier] = (alt.Chart(accuracies, title="Accuracy vs Neighbors for Model Including Both Wine Types, model:"+classifier)
            .transform_fold(["mean_test_score", "mean_train_score"])
            .mark_line()
            .encode(
                    x=alt.X(
                        params[classifier][0], 
                        title=params[classifier][0],
                        scale=alt.Scale(zero=False),
                    ),
                    y=alt.Y(
                        "value:Q", 
                        scale=alt.Scale(zero=False)
                    ),
                    strokeDash='key:N',
                    color=params[classifier][1],
                )
    )



In [73]:
(alt.vconcat(plots["knn"],plots["lr"]).resolve_scale(color='independent')|alt.vconcat(plots["rfc"],plots["svc"]).resolve_scale(color='independent')).resolve_scale(color='independent')

In [18]:
np.random.seed(123)
param_grid = {
    "n_neighbors": range(1, 50, 1),
}
knn_tune = GridSearchCV(
    KNeighborsClassifier(n_jobs=-1), 
    param_grid,  
    cv=5, 
    n_jobs=-1,
    return_train_score=True,
)

knn_model = knn_tune.fit(X_train, y_train)

In [19]:
accuracies = pd.DataFrame(knn_model.cv_results_)

#this one takes a long time to run bc the param grid from above is set to 50. just wait like 4 min and it'll come thorugh!

cv_plot = (alt.Chart(accuracies, title="Accuracy vs Neighbors for Model Including Both Wine Types")
           .transform_fold(["mean_test_score", "mean_train_score"])
           .mark_line()
           .encode(
                x=alt.X(
                    "param_n_neighbors", 
                    title="Neighbors",
                    scale=alt.Scale(zero=False),
                ),
                y=alt.Y(
                    "value:Q", 
                    scale=alt.Scale(zero=False)
                ),
                strokeDash='key:N',
                color="param_metric:N"
            )
)

cv_plot

In [7]:
plots = []

#this one takes even longer to run, just wait and it'll work eventually!

for data in [data_red, data_white]:
    
    wine_train, wine_test = train_test_split(data, test_size=0.25, random_state=123)
    X_train_raw = wine_train.drop(['quality', "type"], axis=1)
    y_train = wine_train['quality']
    X_test_raw = wine_test.drop(['quality', "type"], axis=1)
    y_test = wine_test['quality']

    ct = make_column_transformer(
        (StandardScaler(), features_numerical),
    )
    X_train = ct.fit_transform(X_train_raw)

    np.random.seed(123)

    knn_model = knn_tune.fit(X_train, y_train)

    accuracies = pd.DataFrame(knn_model.cv_results_)

    cv_plot = (alt.Chart(accuracies, title="Accuracy vs Neighbors for Model of :going to add name here: Wine")
               .transform_fold(["mean_test_score", "mean_train_score"])
               .mark_line()
               .encode(
                    x=alt.X(
                        "param_n_neighbors", 
                        title="Neighbors",
                        scale=alt.Scale(zero=False),
                    ),
                    y=alt.Y(
                        "value:Q", 
                        scale=alt.Scale(zero=False)
                    ),
                    color='key:N'
                )
    )

    plots.append(cv_plot)



In [8]:
plots[0]

  for col_name, dtype in df.dtypes.iteritems():


In [9]:
plots[1]