In [1]:
from warnings import filterwarnings
# Silence FutureWarning messages for the rest of the notebook.
filterwarnings('ignore', category=FutureWarning)
import pandas as pd

# Location of the wine CSV inside the Kaggle input directory.
WINE = '/kaggle/input/wine-classification-dataset/wine_classification.csv'
# Load the whole file into a DataFrame and preview the first five rows.
df = pd.read_csv(WINE)
df.head(5)

Unnamed: 0,Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [2]:
from plotly import express
# Draw one histogram (with a marginal box plot) per feature, using one facet
# per wine class so the class-wise distributions can be compared side by side.
#
# Features are selected by name rather than by position: the target column
# 'Class' must not be plotted against itself, and the 'Unnamed: 0' row-number
# column (if present) is not a feature either.
NON_FEATURE_COLUMNS = ('Unnamed: 0', 'Class')
for feature in [column for column in df.columns if column not in NON_FEATURE_COLUMNS]:
    express.histogram(data_frame=df, x=feature, facet_col='Class', marginal='box').show()

The feature distributions differ noticeably from class to class. Let's apply dimensionality reduction to see how well the classes separate in two dimensions.

In [3]:
from arrow import now
from umap import UMAP
from plotly import express

# Start the wall-clock timer for the whole cell.
time_start = now()

# The thirteen feature columns fed to the embedding (and reused by later cells).
# NOTE(review): 'Alcalinity of ash ' carries a trailing space; presumably this
# matches the CSV header exactly -- confirm before "fixing" it.
columns = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash ',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315',
    'Proline',
]

# Fixed random_state so that repeated runs use the same seed.
umap = UMAP(
    random_state=2024,
    verbose=True,
    n_jobs=1,
    low_memory=False,
    n_epochs=100,
)

# Fit on the feature columns and store the two embedding coordinates on df.
embedding = umap.fit_transform(X=df[columns])
df[['x', 'y']] = embedding

# Scatter the embedding, colored by wine class.
express.scatter(data_frame=df, x='x', y='y', color='Class').show()
print('done with UMAP in {}'.format(now() - time_start))

2024-03-05 17:59:56.005338: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-05 17:59:56.005511: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-05 17:59:56.175697: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=100, n_jobs=1, random_state=2024, verbose=True)
Tue Mar  5 18:00:10 2024 Construct fuzzy simplicial set
Tue Mar  5 18:00:10 2024 Finding Nearest Neighbors
Tue Mar  5 18:00:14 2024 Finished Nearest Neighbor Search
Tue Mar  5 18:00:18 2024 Construct embedding


Epochs completed:   0%|            0/100 [00:00]

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
Tue Mar  5 18:00:19 2024 Finished embedding


done with UMAP in 0:00:09.807761


Our data isn't random, so we should be able to classify reasonably well. Let's build a model.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable, and the same split is reused by the later model cells.
X_train, X_test, y_train, y_test = train_test_split(df[columns], df['Class'], test_size=0.2, random_state=2024)

# NOTE(review): the very large max_iter is presumably there so the solver can
# converge on the unscaled features -- confirm.
regression = LogisticRegression(max_iter=100000)

regression.fit(X=X_train, y=y_train)

# coef_ holds one row of coefficients per class. Plot all of them as a grouped
# bar chart, instead of only the first row, so each class's feature weights are
# visible and labelled.
coefficients = (
    pd.DataFrame(data=regression.coef_, columns=columns)
    # Cast the labels to str so plotly treats 'Class' as a discrete color.
    .assign(Class=regression.classes_.astype(str))
    .melt(id_vars='Class', var_name='feature', value_name='coefficient')
)
express.bar(data_frame=coefficients, x='feature', y='coefficient', color='Class', barmode='group').show(validate=True)
print('accuracy: {:5.4f} '.format(regression.score(X=X_test, y=y_test)))

accuracy: 0.9722 


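An accuracy of 0.9722 corresponds to a single misclassified wine out of the 36 in the test set, so it will be hard to improve. Let's see whether standardizing the features and trying other classifiers can do it.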

In [5]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report


# Candidate classifiers, keyed by the label printed in the report.
MODELS = {
    'Linear SVM': SVC(kernel='linear', C=0.025, random_state=2024),
    'QDA': QuadraticDiscriminantAnalysis(),
}

for label in MODELS:
    estimator = MODELS[label]
    # Standardize the features before handing them to the estimator.
    classifier = make_pipeline(StandardScaler(), estimator)
    classifier.fit(X_train, y_train)

    # Overall accuracy on the held-out test set.
    score = classifier.score(X_test, y_test)
    print(label, score)

    # Per-class precision / recall / F1 on the same test set.
    predictions = classifier.predict(X=X_test)
    print(classification_report(y_true=y_test, y_pred=predictions))

Linear SVM 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00        13
           3       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

QDA 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00        13
           3       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

