In [1]:
import pandas as pd

# Path to the anaemia CSV inside the Kaggle input directory.
BLOOD = '/kaggle/input/anaemia-prediction/output.csv'

# Load the data, using the 'Number' column as the row index.
df = pd.read_csv(BLOOD, index_col=['Number'])
# Turn the text label into a boolean target: True exactly where the value is 'Yes'.
df['Anaemic'] = df['Anaemic'].eq('Yes')
# Strip surrounding whitespace from the sex labels before comparing them.
df['Sex'] = df['Sex'].str.strip()
# Boolean encoding of sex (True for 'F'); the original 'Sex' text column is kept.
df['sex'] = df['Sex'].eq('F')
df.head()

Unnamed: 0_level_0,Sex,%Red Pixel,%Green pixel,%Blue pixel,Hb,Anaemic,sex
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,M,43.2555,30.8421,25.9025,6.3,True,False
2,F,45.6033,28.19,26.2067,13.5,False,True
3,F,45.0107,28.9677,26.0215,11.7,False,True
4,F,44.5398,28.9899,26.4703,13.5,False,True
5,M,43.287,30.6972,26.0158,12.4,False,False


Are our classes balanced? We would expect not, because anemia is relatively rare.

In [2]:
# Value counts for sex and for the target, displayed as a tuple of two dicts.
tuple(df[column].value_counts().to_dict() for column in ('Sex', 'Anaemic'))

({'M': 55, 'F': 49}, {False: 78, True: 26})

We have about as many men as women, but our target class is unbalanced.

Do we expect anemia to correlate with sex? Let's find out.

In [3]:
# Count each (Sex, Anaemic) combination and flatten the result into a plain table.
df.value_counts(subset=['Sex', 'Anaemic']).reset_index()

Unnamed: 0,Sex,Anaemic,count
0,M,False,46
1,F,False,32
2,F,True,17
3,M,True,9


We have slightly more men than women in our dataset, but in the target group women outnumber men about two to one.

Let's do a little dimension reduction (we can't do a lot, because we have so few samples and only a few variables) and see if the target class is easy to spot.

In [4]:
import arrow
from umap import UMAP

# Feature columns: every float64 column currently in the frame, plus the boolean 'sex' flag.
# NOTE(review): the float64 selection also picks up 'Hb'; if the 'Anaemic' label was
# derived from haemoglobin this leaks the target into the features -- confirm intent.
COLUMNS = df.select_dtypes(include=['float64']).columns.tolist() + ['sex']

time_start = arrow.now()
# Seeded, single-threaded UMAP with default output dimensionality (two components,
# stored below as 'x' and 'y').
umap = UMAP(n_epochs=201, low_memory=False, n_jobs=1, random_state=2024, verbose=True)
embedding = umap.fit_transform(X=df[COLUMNS])
df[['x', 'y']] = embedding
elapsed = arrow.now() - time_start
print('done with UMAP in {}'.format(elapsed))

2024-07-11 16:05:06.643503: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-11 16:05:06.643650: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-11 16:05:06.785388: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Thu Jul 11 16:05:17 2024 Construct fuzzy simplicial set
Thu Jul 11 16:05:18 2024 Finding Nearest Neighbors
Thu Jul 11 16:05:21 2024 Finished Nearest Neighbor Search
Thu Jul 11 16:05:25 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Thu Jul 11 16:05:27 2024 Finished embedding
done with UMAP in 0:00:09.351798


In [5]:
from plotly import express

# Scatter of the 2-D UMAP embedding: colour marks the target, marker shape marks sex.
# A title and descriptive axis labels are set so the figure can be read on its own.
express.scatter(
    data_frame=df,
    x='x',
    y='y',
    color='Anaemic',
    symbol='Sex',
    title='UMAP embedding coloured by anaemia status',
    labels={'x': 'UMAP 1', 'y': 'UMAP 2'},
)

UMAP does a pretty good job picking out the target class. Let's build a little model.

In [6]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Name of the boolean label column.
TARGET = 'Anaemic'

# Hold out 20% of the rows, stratified on the target so both splits keep the class ratio.
features, labels = df[COLUMNS], df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=2024,
)

time_start = arrow.now()
# Generous iteration cap; the solver stops earlier once the tolerance is met.
model = LogisticRegression(tol=1e-4, max_iter=100000)
model.fit(X=X_train, y=y_train)
print('model fit in {} iterations took {}'.format(model.n_iter_[0], arrow.now() - time_start))

# Accuracy on the held-out rows.
predictions = model.predict(X=X_test)
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=predictions)))
print('model done in {}'.format(arrow.now() - time_start))

model fit in 50 iterations took 0:00:00.017083
accuracy: 0.9524
model done in 0:00:00.020586


In [7]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out test rows.
test_predictions = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=test_predictions))

              precision    recall  f1-score   support

       False       0.94      1.00      0.97        16
        True       1.00      0.80      0.89         5

    accuracy                           0.95        21
   macro avg       0.97      0.90      0.93        21
weighted avg       0.96      0.95      0.95        21



Our logistic regression model does reasonably well overall, but unfortunately it does least well where it matters most: recall on the True (anaemic) class is only 0.80, meaning one of the five anaemic cases in the test set was missed.