In [1]:
import pandas as pd

# Kaggle input path of the blood-transfusion CSV.
DATA = '/kaggle/input/blood-transfusion-classification/blood_transfusion.csv'

# Load the raw table and append a boolean label: True where Class is 'donated'.
df = pd.read_csv(DATA).assign(target=lambda frame: frame['Class'] == 'donated')

df.head()

Unnamed: 0,Recency,Frequency,Monetary,Time,Class,target
0,2,50,12500,98,donated,True
1,0,13,3250,28,donated,True
2,1,16,4000,35,donated,True
3,2,20,5000,45,donated,True
4,1,24,6000,77,not donated,False


We don't have a lot of data, and we don't expect the independent variables to fully determine the target: whether someone donates depends on factors that these columns do not capture.

In [2]:
from plotly import express

# Count of rows for each value of the Class column.
class_histogram = express.histogram(df, x='Class')
class_histogram

In [3]:
# Fraction of rows in each class, as a plain {label: share} dict.
class_shares = df['Class'].value_counts(normalize=True)
class_shares.to_dict()

{'not donated': 0.7620320855614974, 'donated': 0.23796791443850268}

Our target class is unbalanced, with non-donors outnumbering donors about three to one.

In [4]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Feature columns passed to the classifiers; df['target'] is the boolean label.
COLUMNS = ['Recency', 'Frequency', 'Monetary', 'Time']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS],
    df['target'],
    test_size=0.2,
    random_state=2024,
    stratify=df['target'],
)

# Time the fit on its own, then the fit plus scoring.
time_start = arrow.now()
model = LogisticRegression(max_iter=100000, tol=1e-6)
model.fit(X=X_train, y=y_train)
fit_iterations = model.n_iter_[0]
print('model fit in {} iterations took {}'.format(fit_iterations, arrow.now() - time_start))

test_accuracy = accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))
print('accuracy: {:5.4f}'.format(test_accuracy))
print('model done in {}'.format(arrow.now() - time_start))

model fit in 46 iterations took 0:00:00.021870
accuracy: 0.7400
model done in 0:00:00.025859


In [5]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 of the logistic regression on the test split.
logreg_test_predictions = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=logreg_test_predictions))

              precision    recall  f1-score   support

       False       0.77      0.93      0.84       114
        True       0.38      0.14      0.20        36

    accuracy                           0.74       150
   macro avg       0.58      0.53      0.52       150
weighted avg       0.68      0.74      0.69       150



In [6]:
import numpy as np

# How many test rows the model assigns to each label.
predicted_labels, predicted_counts = np.unique(model.predict(X=X_test), return_counts=True)
(predicted_labels, predicted_counts)

(array([False,  True]), array([137,  13]))

In [7]:
# How many test rows actually carry each label, for comparison with the predictions.
actual_labels, actual_counts = np.unique(y_test, return_counts=True)
(actual_labels, actual_counts)

(array([False,  True]), array([114,  36]))

Our model does a really poor job of predicting donors. It predicts "not donated" for 137 of the 150 test rows (about 91%), when only 114 of them (76%) are actually non-donors, so it recovers just 14% of the true donors.

Let's look at the regression coefficients and see if they make sense.

In [8]:
from plotly import express

# One bar per feature, with height equal to its fitted logistic-regression
# coefficient. express.bar plots the values as given; express.histogram would
# aggregate y within each x bin and label the axis "sum of y".
express.bar(
    x=COLUMNS,
    y=model.coef_[0],
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Logistic regression coefficients',
)

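This probably makes some sense, as donors have to wait a certain amount of time before they can donate again. Is it surprising that `Monetary` makes only a small difference? Probably not: in the rows shown above, `Monetary` is exactly 250 × `Frequency`, so it appears to measure the amount donated rather than money and adds no information beyond `Frequency`. Our data is also dominated by non-donors.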

We can do a bit better with AdaBoost, but still not very well.

In [9]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import classification_report

# Fit AdaBoost on the same training split used for the logistic regression,
# then report per-class metrics on the same test split.
ada = AdaBoostClassifier(algorithm='SAMME', random_state=2024)
ada.fit(X=X_train, y=y_train)

ada_test_predictions = ada.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=ada_test_predictions))

              precision    recall  f1-score   support

       False       0.82      0.90      0.86       114
        True       0.56      0.39      0.46        36

    accuracy                           0.78       150
   macro avg       0.69      0.65      0.66       150
weighted avg       0.76      0.78      0.77       150

