In [1]:
import pandas as pd

RED = '/kaggle/input/wine-quality/winequality-red.csv'
WHITE = '/kaggle/input/wine-quality/winequality-white.csv'

red_df = pd.read_csv(filepath_or_buffer=RED, sep=';')
white_df = pd.read_csv(filepath_or_buffer=WHITE, sep=';')
print('done reading data.')

done reading data.


In [2]:
red_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
white_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


Let's see if we can distinguish red wine from white based just on the available data.

In [4]:
df = pd.concat(axis=0, ignore_index=True,
               objs=[pd.concat(axis=1, objs=[red_df, pd.DataFrame(data={'target': ['red'] * len(red_df)})]), 
                     pd.concat(axis=1, objs=[white_df, pd.DataFrame(data={'target': ['white'] * len(white_df)})])])
df.sample(frac=1, random_state=2024).head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,target
6387,6.4,0.31,0.31,12.9,0.045,55.0,161.0,0.99546,3.02,0.59,10.2,5,white
177,7.5,0.52,0.42,2.3,0.087,8.0,38.0,0.9972,3.58,0.61,10.5,6,red
1726,6.5,0.24,0.32,7.6,0.038,48.0,203.0,0.9958,3.45,0.54,9.7,7,white
3394,6.7,0.37,0.41,6.3,0.061,22.0,149.0,0.9953,3.16,0.47,9.6,6,white
1814,5.8,0.26,0.24,9.2,0.044,55.0,152.0,0.9961,3.31,0.38,9.4,5,white


Is our target class balanced?

In [5]:
print(df['target'].value_counts(normalize=True).to_dict())

{'white': 0.7538864091118977, 'red': 0.2461135908881022}


No; we have about three times as many white wines as red wines.

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
 12  target                6497 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB


Let's use dimension reduction to see if our data clusters according to the target variable.

In [7]:
import arrow
from umap import UMAP

time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=201)
df[['x', 'y']] = umap.fit_transform(X=df.drop(columns=['target']))
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-07-16 15:52:41.068740: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-16 15:52:41.068926: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-16 15:52:41.221584: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Tue Jul 16 15:52:51 2024 Construct fuzzy simplicial set
Tue Jul 16 15:52:51 2024 Finding Nearest Neighbors
Tue Jul 16 15:52:51 2024 Building RP forest with 9 trees
Tue Jul 16 15:52:57 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	Stopping threshold met -- exiting after 3 iterations
Tue Jul 16 15:53:14 2024 Finished Nearest Neighbor Search
Tue Jul 16 15:53:18 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Tue Jul 16 15:53:23 2024 Finished embedding
done with UMAP in 0:00:32.217291


In [8]:
import warnings
from plotly import express

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.scatter(data_frame=df, x='x', y='y', color='target')

This simple analysis and visualization suggests we may be able to classify wines according to whether they are red or white using a relatively simple model. Let's build a model.

In [9]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], test_size=0.2, random_state=2024, stratify=df['target'])

time_start = arrow.now()
model = LogisticRegression(max_iter=100000, tol=1e-4).fit(X=X_train, y=y_train)
print('model fit in {} iterations took {}'.format(model.n_iter_[0], arrow.now() - time_start))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))
print('model done in {}'.format(arrow.now() - time_start))

model fit in 1260 iterations took 0:00:01.167748
accuracy: 0.9862
model done in 0:00:01.181118


In [10]:
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

         red       0.98      0.96      0.97       320
       white       0.99      0.99      0.99       980

    accuracy                           0.99      1300
   macro avg       0.99      0.98      0.98      1300
weighted avg       0.99      0.99      0.99      1300

