In [1]:
import pandas as pd

APPLE = '/kaggle/input/apple-quality/apple_quality.csv'
# The file ends with some non-data junk, so only a fixed number of leading rows is read.
ROW_LIMIT = 3941

df = pd.read_csv(
    filepath_or_buffer=APPLE,
    index_col=['A_id'],
    nrows=ROW_LIMIT,
)

df.head()

Unnamed: 0_level_0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
A_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,good
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,good
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,bad
3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,good
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,good


In [2]:
# Tally the apples in each quality class and show the tally as a plain dict.
quality_counts = df['Quality'].value_counts()
print(quality_counts.to_dict())

{'good': 1973, 'bad': 1968}


Our target variable has balanced classes.

In [3]:
from plotly import express
# Draw one histogram per feature, with a separate colored panel for each quality class.
columns = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']
for feature in columns:
    figure = express.histogram(data_frame=df, x=feature, color='Quality', facet_col='Quality')
    figure.show()

At first glance, the feature distributions for good and bad apples look very similar; let's see what dimensionality reduction can tell us.

In [4]:
from umap import UMAP

columns = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity',]

# Project the seven features onto two dimensions; random_state and transform_seed are pinned to fixed values.
reducer = UMAP(n_components=2, random_state=2024, transform_seed=2024, verbose=True, n_jobs=1, n_epochs=100)
embedding = reducer.fit_transform(X=df[columns])
# Assigning a DataFrame into df aligns on index labels, so the embedding frame is
# built on df's own A_id index. With a default 0..n-1 index, any label that does
# not match would silently come back as NaN.
df[['x', 'y']] = pd.DataFrame(data=embedding, index=df.index, columns=['x', 'y'])
express.scatter(data_frame=df, x='x', y='y', color='Quality', height=900, facet_col='Quality', marginal_x='box')

UMAP(n_epochs=100, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Mon Mar  4 18:58:30 2024 Construct fuzzy simplicial set
Mon Mar  4 18:58:38 2024 Finding Nearest Neighbors
Mon Mar  4 18:58:43 2024 Finished Nearest Neighbor Search
Mon Mar  4 18:58:46 2024 Construct embedding


Epochs completed:   0%|            0/100 [00:00]

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
Mon Mar  4 18:58:49 2024 Finished embedding


Plotting our results this way shows that we have some apples where it's hard to tell from the available data whether they are good or bad, and others where it's pretty clear. Let's build a simple model.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

columns = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']

# Hold out a quarter of the rows for evaluation, using a seeded split.
X_train, X_test, y_train, y_test = train_test_split(
    df[columns],
    df['Quality'],
    test_size=0.25,
    random_state=2024,
)

# Baseline model: logistic regression on the raw feature columns.
regression = LogisticRegression(max_iter=100)
regression.fit(X_train, y_train)

baseline_accuracy = accuracy_score(y_test, regression.predict(X_test))
print('accuracy: {:5.4f}'.format(baseline_accuracy))

accuracy: 0.7495


In [6]:
from sklearn.metrics import classification_report
# Predict once on the held-out rows and reuse the labels for the per-class report.
y_pred = regression.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))
# One bar per feature. A bar chart plots each coefficient as-is, whereas a histogram
# aggregates y and labels the axis "sum of y".
express.bar(x=columns, y=regression.coef_[0], labels={'x': 'feature', 'y': 'coefficient'})

              precision    recall  f1-score   support

         bad       0.75      0.75      0.75       498
        good       0.74      0.75      0.75       488

    accuracy                           0.75       986
   macro avg       0.75      0.75      0.75       986
weighted avg       0.75      0.75      0.75       986



Our regression coefficients aren't surprising, are they? Big, heavy, sweet, juicy apples are good; acidic, overripe apples are bad. Let's use a more complicated model and see if we can improve our results.

All of our feature data looks Gaussian, so let's give the Gaussian process classifier a try. It is kind of computationally intensive, but it yields substantially better results than our logistic regression model above.

In [7]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern
from arrow import now

# Start the clock before building the model so the reported time covers fit and score.
time_start = now()

matern_kernel = 1.0 * Matern(length_scale=1.0)
classifier = GaussianProcessClassifier(kernel=matern_kernel, random_state=2024)
classifier.fit(X=X_train, y=y_train)

gp_score = classifier.score(X=X_test, y=y_test)
print('score: {:5.4f}'.format(gp_score))

elapsed = now() - time_start
print('Gaussian process model time: {}'.format(elapsed))

score: 0.9168
Gaussian process model time: 0:02:20.858309
