In [1]:
!pip install --quiet umap-learn

In [2]:
import pandas as pd

# Location of the Wine Quality CSV inside the Kaggle input directory.
WINE = '/kaggle/input/wine-quality-dataset/WineQT.csv'

# Feature columns used as model inputs; the 'quality' column is deliberately
# not listed here.
COLUMNS = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Load the data with the 'Id' column as the row index and preview the first rows.
df = pd.read_csv(WINE, index_col=['Id'])
df.head()

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


How is our target variable distributed?

In [3]:
from plotly import express

# Count how many rows fall on each value of the 'quality' column.
express.histogram(
    x='quality',
    data_frame=df,
)

Our target variable has a roughly Gaussian-looking distribution. Because 3s and 8s are so rare, a model will probably not see enough examples to learn to predict them.

In [4]:
import arrow
from umap import UMAP

# Record the start time so the cost of this cell can be reported.
time_start = arrow.now()
umap = UMAP(
    n_epochs=201,
    low_memory=False,
    n_jobs=1,
    random_state=2024,  # fixed seed so the embedding is repeatable
    verbose=False,
)
# Fit on the feature columns and store the embedding on df as 'x' and 'y'
# so later cells can plot it.
embedding = umap.fit_transform(X=df[COLUMNS])
df[['x', 'y']] = embedding
elapsed = arrow.now() - time_start
print('done with UMAP in {}'.format(elapsed))

done with UMAP in 0:00:11.002360


In [5]:
from plotly import express

# Plot the stored 'x'/'y' embedding coordinates, colored by quality score.
express.scatter(
    data_frame=df,
    color='quality',
    x='x',
    y='y',
)

As expected our numerical qualities look very much mixed; we see some local clustering but not a lot. This is a tough problem.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratifying on 'quality'; the fixed
# random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df['quality'],
    test_size=0.2,
    random_state=2024,
    stratify=df['quality'],
)

logreg = LogisticRegression(max_iter=10000, tol=1e-4).fit(X_train, y_train)

# Predict once and reuse the labels for every metric below instead of
# re-running the model for each print.
y_pred = logreg.predict(X=X_test)

print('model fit in {} iterations'.format(logreg.n_iter_[0]))
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=y_pred)))
print('f1: {:5.4f}'.format(f1_score(average='weighted', y_true=y_test, y_pred=y_pred)))
# zero_division=0.0 reports 0 for classes that are never predicted.
print(classification_report(zero_division=0.0, y_true=y_test, y_pred=y_pred))

model fit in 5568 iterations
accuracy: 0.6201
f1: 0.5929
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         7
           5       0.69      0.76      0.72        97
           6       0.56      0.66      0.61        92
           7       0.58      0.24      0.34        29
           8       0.00      0.00      0.00         3

    accuracy                           0.62       229
   macro avg       0.30      0.28      0.28       229
weighted avg       0.59      0.62      0.59       229



As expected our model doesn't get the extreme values, which are poorly represented, and it draws most of its accuracy and f1 from the well-represented classes.

In [7]:
# One row per test sample: the highest class probability the model assigned,
# the true label, and the predicted label.
test_proba = logreg.predict_proba(X=X_test)
probability_df = pd.DataFrame(
    data={
        'probability': test_proba.max(axis=1),
        # .tolist() drops the original index so rows line up by position.
        'true': y_test.tolist(),
        'pred': logreg.predict(X=X_test),
    }
)
probability_df['correct'] = probability_df['true'] == probability_df['pred']
# Project the test rows with the already-fitted UMAP model for plotting.
probability_df[['x', 'y']] = umap.transform(X=X_test)

probability_df.head()


Unnamed: 0,probability,true,pred,correct,x,y
0,0.480125,7,7,True,3.839138,-0.767492
1,0.482512,5,6,False,5.577554,-1.671057
2,0.413728,4,6,False,9.521704,11.713301
3,0.410045,7,6,False,9.07113,11.362028
4,0.761162,5,5,True,-1.780562,5.76687


How are the model probabilities distributed when the model is correct vs. when it is incorrect?

In [8]:
# Mean top-class probability, split by whether the prediction was correct.
mean_by_outcome = probability_df.groupby(by=['correct'])[['probability']].mean()
mean_by_outcome.to_dict()

{'probability': {False: 0.5581914618367122, True: 0.6112605613286948}}

The mean probability is about the same in either case (roughly 0.56 when the model is incorrect vs. 0.61 when it is correct). Let's use a scatter plot to take a look.

In [9]:
# Embedding of the test rows, one panel per value of 'correct',
# colored by the model's top-class probability.
express.scatter(
    data_frame=probability_df,
    facet_col='correct',
    color='probability',
    x='x',
    y='y',
)

What do we see? We have some cases where the model is correct with high model probability, and a handful where it is incorrect with high model probability, but most of the time the model probabilities are relatively low, whether the prediction is correct or not.

In [10]:
# Counts of the true labels, one panel per value of 'correct'.
express.histogram(
    data_frame=probability_df,
    facet_col='correct',
    x='true',
)

Not surprisingly, when our model is correct it is predicting well-represented classes, and it misses all the 3s and 8s (and, per the classification report above, all the 4s as well).

In [11]:
# Predicted label against true label, colored by whether the two agree.
express.strip(
    data_frame=probability_df,
    color='correct',
    x='true',
    y='pred',
)