In [1]:
!pip install --quiet umap-learn

In [2]:
import pandas as pd

WINE = '/kaggle/input/wine-quality-dataset/WineQT.csv'

df = pd.read_csv(filepath_or_buffer=WINE, index_col=['Id'])
df['high quality'] = df['quality'] > 5
df.head()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,high quality
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,False
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,False
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,False
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,True
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,False


How is our target variable distributed?

In [3]:
from plotly import express

express.histogram(data_frame=df, x='quality')

Our target variable has a Gaussian-looking distribution. Because 3s and 8s are so rare we will probably not be able to distinguish them.

In [4]:
COLUMNS = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'high quality']

In [5]:
import arrow
from umap import UMAP

time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=False, n_jobs=1, low_memory=False, n_epochs=201)
df[['x', 'y']] = umap.fit_transform(X=df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

done with UMAP in 0:00:10.156229


In [6]:
from plotly import express

express.scatter(data_frame=df, x='x', y='y', color='quality')

As expected our numerical qualities look very much mixed; how about our somewhat arbitary separation between high and low quality?

In [7]:
from plotly import express

express.scatter(data_frame=df, x='x', y='y', color='high quality')

If we squint we may see more low quality wines with low quality neighbors clustered to the left; otherwise we see a lot of local clustering but not a lot of separation between high/low quality clusters.

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[COLUMNS], df['quality'], test_size=0.2, random_state=2024, stratify=df['quality'])

logreg = LogisticRegression(max_iter=10000, tol=1e-4).fit(X_train, y_train)
print('model fit in {} iterations'.format(logreg.n_iter_[0]))
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=logreg.predict(X=X_test))))
print('f1: {:5.4f}'.format(f1_score(average='weighted', y_true=y_test, y_pred=logreg.predict(X=X_test))))
print(classification_report(zero_division=0.0, y_true=y_test, y_pred=logreg.predict(X=X_test)))

model fit in 4948 iterations
accuracy: 0.9738
f1: 0.9682
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.80      0.57      0.67         7
           5       0.97      1.00      0.98        97
           6       1.00      1.00      1.00        92
           7       0.94      1.00      0.97        29
           8       1.00      0.33      0.50         3

    accuracy                           0.97       229
   macro avg       0.78      0.65      0.69       229
weighted avg       0.97      0.97      0.97       229



As expected our model doesn't get the extreme values, which are poorly represented, and it draws most of its accuracy and f1 from the well-represented classes.

In [9]:
probability_df = pd.DataFrame(data=logreg.predict_proba(X=X_test).max(axis=1), columns=['probability'])
probability_df['true'] = y_test.tolist()
probability_df['pred'] = logreg.predict(X=X_test)
probability_df['correct'] = probability_df['true'] == probability_df['pred']
probability_df[['x', 'y']] = umap.transform(X=X_test)

probability_df.head()


Unnamed: 0,probability,true,pred,correct,x,y
0,0.942962,7,7,True,3.748603,-1.714986
1,0.959984,5,5,True,5.950453,-2.404728
2,0.564596,4,4,True,11.20158,9.918726
3,0.725918,7,7,True,10.815564,9.896008
4,0.980212,5,5,True,-2.059483,5.299526


In [10]:
express.scatter(data_frame=probability_df, x='x', y='y', facet_col='correct', color='probability')

What do we see? We see that in one strange case our model produces a high probability for an incorrect prediction, and it occasionally produces low probabilities for correct predictions.

In [11]:
express.histogram(data_frame=probability_df, x='true', facet_col='correct')

Not surprisingly our model is mostly correct for well-represented classes and misses all the 3s and 8s.

In [12]:
express.strip(data_frame=probability_df, x='true', y='pred', )