In [1]:
import polars as pl

APPLE = '/kaggle/input/apple-quality/apple_quality.csv'

# The CSV carries non-data junk after the real records, so the read is capped
# at an explicit row count instead of consuming the whole file.
df = pl.read_csv(APPLE, n_rows=3941)

# Preview the first five rows to check the column names and inferred dtypes.
df.head(5)

A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
i64,f64,f64,f64,f64,f64,f64,f64,str
0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,"""good"""
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,"""good"""
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,"""bad"""
3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,"""good"""
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,"""good"""


In [2]:
# Tally how many rows fall under each Quality label.
df.get_column('Quality').value_counts()

Quality,count
str,u32
"""bad""",1968
"""good""",1973


Our target variable has balanced classes.

In [3]:
from plotly.express import histogram
# Draw one figure per column between the first (A_id) and last (Quality) columns,
# faceted and coloured by Quality so the two classes sit side by side.
feature_names = df.columns[1:-1]
for feature in feature_names:
    figure = histogram(data_frame=df, x=feature, color='Quality', facet_col='Quality')
    figure.show()

For every property, the distributions for good and bad apples look very similar; if we plot them together (without facets) they look essentially identical.

In [4]:
from umap import UMAP
from plotly.express import scatter

# Embed only the measured apple properties. A_id looks like a running row index
# (0, 1, 2, ... in the preview of df) rather than a property of the apple; if it
# stays in, its range is far larger than that of the other columns and it
# dominates the distances UMAP works from, so the picture mostly reflects row order.
# Quality is the target and is only used to colour the points.
features = df.drop(['A_id', 'Quality'])

# Fixed seeds and a single job keep the embedding reproducible between runs.
reducer = UMAP(n_components=2, random_state=2024, transform_seed=2024, verbose=True, n_jobs=1, n_epochs=200)
embedding_df = pl.DataFrame(data=reducer.fit_transform(X=features), schema={'ux': pl.Float32, 'uy': pl.Float32})

# Re-attach the label so each point can be coloured and faceted by class.
reducer_df = embedding_df.with_columns(df['Quality'])
scatter(data_frame=reducer_df, x='ux', y='uy', color='Quality', height=900, facet_col='Quality')

UMAP(n_epochs=200, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Mon Jan 29 18:19:47 2024 Construct fuzzy simplicial set
Mon Jan 29 18:20:02 2024 Finding Nearest Neighbors
Mon Jan 29 18:20:07 2024 Finished Nearest Neighbor Search
Mon Jan 29 18:20:12 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Mon Jan 29 18:20:18 2024 Finished embedding


Similarly, dimensionality reduction with UMAP doesn't show us anything encouraging.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Predictors are the columns between the first (A_id) and last (Quality) columns.
columns = df.columns[1:-1]

# Hold out a quarter of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    df[columns],
    df['Quality'],
    test_size=0.25,
    random_state=2024,
)

model = LogisticRegression(max_iter=100000)
model.fit(X_train, y_train)

# Report held-out accuracy as a percentage with one decimal place.
test_accuracy = accuracy_score(y_test, model.predict(X_test))
print('accuracy: {} pct'.format(round(1000 * test_accuracy) / 10))

accuracy: 74.9 pct


In [6]:
from plotly.express import bar
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=model.predict(X_test)))

# One bar per feature showing its fitted coefficient. A bar chart states this
# directly; a histogram only produced the same picture through its implicit
# sum aggregation and labelled the axis "sum of y".
bar(
    x=columns,
    y=model.coef_[0],
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Logistic regression coefficients',
)

              precision    recall  f1-score   support

         bad       0.75      0.75      0.75       498
        good       0.74      0.75      0.75       488

    accuracy                           0.75       986
   macro avg       0.75      0.75      0.75       986
weighted avg       0.75      0.75      0.75       986

