In [1]:
import pandas as pd

# Location of the Palmer penguins CSV inside the Kaggle input directory.
PENGUINS = '/kaggle/input/palmer-penguins-dataset-for-eda/penguins.csv'

# Load the data indexed by its `id` column, then discard every row that has
# at least one missing value so later cells only ever see complete rows.
df = (
    pd.read_csv(PENGUINS, index_col=['id'])
    .dropna()
)
df.head()

Unnamed: 0_level_0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [2]:
from plotly import express

# Draw one histogram per categorical column, each coloured by sex, to see how
# the sexes are distributed across species and across islands.
for categorical_column in ('species', 'island'):
    express.histogram(data_frame=df, x=categorical_column, color='sex').show()

Surely penguins are sexually dimorphic, and we can distinguish males from females based on their size. Let's find out.

In [3]:
import arrow
from umap import UMAP

# Wall-clock start, so the cost of the embedding can be reported below.
time_start = arrow.now()

# The seed is fixed so the embedding can be reproduced on a re-run.
umap = UMAP(
    n_epochs=201,
    low_memory=False,
    n_jobs=1,
    verbose=False,
    random_state=2024,
)

# Embed the four body measurements and store the resulting pair of
# coordinates on the frame as `x` and `y` for plotting.
measurements = df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
df[['x', 'y']] = umap.fit_transform(X=measurements)

elapsed = arrow.now() - time_start
print('done with UMAP in {}'.format(elapsed))

done with UMAP in 0:00:09.155671


In [4]:
# Plot the UMAP coordinates stored on `df`, coloured by sex, to see whether
# the body measurements separate the two sexes.
express.scatter(
    data_frame=df,
    x='x',
    y='y',
    color='sex',
)

We see some clustering and separation between clusters, but also some hard cases. Let's build a model.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# The four body measurements used as predictors of sex.
feature_columns = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

# Hold out 20% of the rows for evaluation. Stratifying on sex keeps the
# female/male proportions the same in both splits; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df['sex'],
    test_size=0.2,
    random_state=2024,
    stratify=df['sex'],
)

# A very large iteration cap and a very tight tolerance let the solver run
# until it has effectively converged; the iteration count is reported below.
logreg = LogisticRegression(max_iter=100000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(logreg.n_iter_[0]))

# Predict once and reuse the result: every metric below scores the same
# test-set predictions, so there is no need to run inference three times.
y_pred = logreg.predict(X=X_test)
print('accuracy: {:5.4f} f1: {:5.4f}'.format(
    accuracy_score(y_true=y_test, y_pred=y_pred),
    f1_score(average='weighted', y_true=y_test, y_pred=y_pred),
))
print(classification_report(y_true=y_test, y_pred=y_pred))

model fit in 61 iterations
accuracy: 0.8955 f1: 0.8955
              precision    recall  f1-score   support

      female       0.88      0.91      0.90        33
        male       0.91      0.88      0.90        34

    accuracy                           0.90        67
   macro avg       0.90      0.90      0.90        67
weighted avg       0.90      0.90      0.90        67



Let's go back and look at our model predictions in terms of their model probabilities. We can use the UMAP model to project them onto the same scatter plot we used above.

In [6]:
# One row per test-set penguin: the probability of the predicted class (the
# larger of the class probabilities), the predicted label, and the true label.
# `y_test` is converted to a list so it is matched by position rather than by
# its original `id` index.
probability_df = pd.DataFrame(data={
    'probability': logreg.predict_proba(X=X_test).max(axis=1),
    'prediction': logreg.predict(X=X_test),
    'actual': y_test.tolist(),
})

# Flag the rows where the model got the label right.
probability_df['correct'] = probability_df['actual'].eq(probability_df['prediction'])

# Project the test rows with the already-fitted UMAP model so they can be
# drawn in the same coordinate space as the earlier scatter plot.
test_embedding = umap.transform(X=X_test)
probability_df['x'] = test_embedding[:, 0]
probability_df['y'] = test_embedding[:, 1]

probability_df.head()

Unnamed: 0,probability,prediction,actual,correct,x,y
0,0.653299,female,female,True,11.008036,3.76633
1,0.585515,female,female,True,9.244903,8.520247
2,0.996142,female,female,True,11.639905,-4.287094
3,0.617196,male,female,False,9.154723,6.250491
4,0.965372,female,female,True,13.069413,-1.352945


In [7]:
from plotly import express

# Show the test-set embedding in two panels (incorrect vs. correct
# predictions), shading each point by the model's probability; hovering a
# point reveals the true sex.
express.scatter(
    data_frame=probability_df,
    x='x',
    y='y',
    color='probability',
    facet_col='correct',
    hover_name='actual',
)

The good news is that when our model is incorrect, its model probabilities are usually relatively low. There's only one case where it is incorrect with a model probability greater than 0.9.

Recall that our model probabilities will range from 0.5 to 1.0, because we keep only the probability of the predicted class and there are just two classes; what does the mean probability look like conditioned on whether the model is correct?

In [8]:
# Mean predicted-class probability, computed separately for incorrect (False)
# and correct (True) predictions, returned as a nested dict.
(
    probability_df
    .groupby(by='correct')[['probability']]
    .mean()
    .to_dict()
)

{'probability': {False: 0.6753058032895167, True: 0.8793411331137099}}

We'd like the mean probability for False to be close to 0.5 and the mean probability for True to be close to 1.0; this is not bad but not great. Let's look at the distribution.

In [9]:
# Distribution of predicted-class probabilities, with one panel for incorrect
# predictions and one for correct ones.
express.histogram(
    data_frame=probability_df,
    x='probability',
    facet_col='correct',
    hover_name='actual',
    nbins=10,
)