In [1]:
from warnings import filterwarnings
# Silence FutureWarning messages for the rest of the session so that
# deprecation notices do not clutter the cell outputs.
filterwarnings('ignore', category=FutureWarning)

In [2]:
import pandas as pd

APPLES = '/kaggle/input/apple-quality-analysis-dataset/apple_quality.csv'

# Load the table keyed by A_id, keep only the rows that carry a Quality label,
# and force Acidity to a numeric dtype.
# NOTE(review): Acidity needs an explicit cast and the A_id index displays as
# 0.0, 1.0, ... -- presumably the CSV holds a non-data row that the dropna
# removes; confirm against the raw file.
df = (
    pd.read_csv(filepath_or_buffer=APPLES, index_col=['A_id'])
    .dropna(subset=['Quality'])
    .astype({'Acidity': float})
)
df.head()

Unnamed: 0_level_0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
A_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,good
1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,good
2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,bad
3.0,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,good
4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,good


In [3]:
# Sanity check on the load: list each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 0.0 to 3999.0
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Size         4000 non-null   float64
 1   Weight       4000 non-null   float64
 2   Sweetness    4000 non-null   float64
 3   Crunchiness  4000 non-null   float64
 4   Juiciness    4000 non-null   float64
 5   Ripeness     4000 non-null   float64
 6   Acidity      4000 non-null   float64
 7   Quality      4000 non-null   object 
dtypes: float64(7), object(1)
memory usage: 281.2+ KB


In [4]:
from plotly import express
# Features to inspect. Each one gets its own figure: a histogram split into
# one panel per Quality value, with a box plot drawn in the margin.
xs = [
    'Size',
    'Weight',
    'Sweetness',
    'Crunchiness',
    'Juiciness',
    'Ripeness',
    'Acidity',
]
for feature in xs:
    figure = express.histogram(
        data_frame=df,
        x=feature,
        facet_col='Quality',
        marginal='box',
    )
    figure.show()

In [5]:
from arrow import now
from umap import UMAP

# Embed the seven numeric features in two dimensions with UMAP, store the
# coordinates on df as 'x' and 'y', and plot them coloured by Quality.
# The reported duration covers everything from here to the final print,
# including rendering the scatter plot.
time_start = now()
umap = UMAP(
    random_state=2024,
    verbose=True,
    n_jobs=1,
    low_memory=False,
    n_epochs=2000,
)
columns = [
    'Size',
    'Weight',
    'Sweetness',
    'Crunchiness',
    'Juiciness',
    'Ripeness',
    'Acidity',
]

embedding = umap.fit_transform(X=df[columns])
df['x'] = embedding[:, 0]
df['y'] = embedding[:, 1]

scatter = express.scatter(data_frame=df, x='x', y='y', color='Quality')
scatter.show()

elapsed = now() - time_start
print('done with UMAP in {}'.format(elapsed))

2024-03-04 14:33:38.899151: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-04 14:33:38.899305: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-04 14:33:39.084075: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=2000, n_jobs=1, random_state=2024, verbose=True)
Mon Mar  4 14:33:52 2024 Construct fuzzy simplicial set
Mon Mar  4 14:33:59 2024 Finding Nearest Neighbors
Mon Mar  4 14:34:03 2024 Finished Nearest Neighbor Search
Mon Mar  4 14:34:06 2024 Construct embedding


Epochs completed:   0%|            0/2000 [00:00]

	completed  0  /  2000 epochs
	completed  200  /  2000 epochs
	completed  400  /  2000 epochs
	completed  600  /  2000 epochs
	completed  800  /  2000 epochs
	completed  1000  /  2000 epochs
	completed  1200  /  2000 epochs
	completed  1400  /  2000 epochs
	completed  1600  /  2000 epochs
	completed  1800  /  2000 epochs
Mon Mar  4 14:34:22 2024 Finished embedding


done with UMAP in 0:00:29.683692


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable. X_train / X_test / y_train / y_test are reused by later cells.
X_train, X_test, y_train, y_test = train_test_split(
    df[columns],
    df['Quality'],
    test_size=0.2,
    random_state=2024,
)

# Baseline linear classifier on the raw (unscaled) feature columns.
regression = LogisticRegression(max_iter=100000)
regression.fit(X=X_train, y=y_train)

# Draw one bar per feature coefficient. A bar chart is used rather than a
# histogram because the values are already one-per-feature: nothing needs to
# be binned or aggregated, and the y axis is labelled as a coefficient.
# For a two-class target coef_ has a single row; per the scikit-learn docs its
# sign refers to the second entry of regression.classes_ (presumably 'good'
# here -- confirm with regression.classes_).
express.bar(
    x=columns,
    y=regression.coef_[0],
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Logistic regression coefficients',
).show(validate=True)
print('accuracy: {:5.4f} '.format(regression.score(X=X_test, y=y_test)))


accuracy: 0.7425 


In [7]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the logistic baseline on the held-out rows.
y_pred = regression.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         bad       0.76      0.72      0.74       404
        good       0.73      0.77      0.75       396

    accuracy                           0.74       800
   macro avg       0.74      0.74      0.74       800
weighted avg       0.74      0.74      0.74       800



In [8]:
# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

RANDOM_STATE = 2024

# Display name -> unfitted estimator. Dict insertion order is the order in
# which the models are fitted and reported.
models = {
    'Nearest Neighbors': KNeighborsClassifier(3),
    'Linear SVM': SVC(kernel='linear', C=0.025, random_state=RANDOM_STATE),
    'RBF SVM': SVC(gamma=2, C=1, random_state=RANDOM_STATE),
    'Gaussian Process': GaussianProcessClassifier(1.0 * RBF(1.0), random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=RANDOM_STATE
    ),
    'Neural Net': MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(algorithm='SAMME', random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
}

# Parallel views of the mapping, kept under their original names.
names = list(models)
classifiers = list(models.values())

# Fit every model behind a StandardScaler on the shared training split and
# print its accuracy on the shared test split.
for name, estimator in models.items():
    pipeline = make_pipeline(StandardScaler(), estimator)
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    print('{}: {:5.4f}'.format(name, score))


Nearest Neighbors: 0.8862
Linear SVM: 0.7462
RBF SVM: 0.8912
Gaussian Process: 0.9125
Decision Tree: 0.7425
Random Forest: 0.8013
Neural Net: 0.8812
AdaBoost: 0.7725
Naive Bayes: 0.7275
QDA: 0.8550
