In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score
from operator import itemgetter
import numpy as np
from sklearn.linear_model import LogisticRegression



In [2]:
# Read the transfusion data file (comma-separated) into a DataFrame.
data_path = "transfusion.data"
dataset = pd.read_csv(data_path)

In [3]:
# Preview the first rows to check the column names and value types.
dataset.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [4]:
# Per-column flag: True if the column contains at least one missing value.
dataset.isnull().any()

Recency (months)                              False
Frequency (times)                             False
Monetary (c.c. blood)                         False
Time (months)                                 False
whether he/she donated blood in March 2007    False
dtype: bool

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for each numeric column.
dataset.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


In [6]:
# Give the target column a shorter name. The renamed frame is rebound rather than
# mutated in place, so no existing DataFrame object is silently altered.
dataset = dataset.rename(
    columns={"whether he/she donated blood in March 2007": "Donated in March 2007"}
)

In [7]:
# Confirm that the target column now carries the shorter name.
dataset.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),Donated in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [8]:
# Class balance of the target, expressed as proportions rather than raw counts.
dataset["Donated in March 2007"].value_counts(normalize=True)

0    0.762032
1    0.237968
Name: Donated in March 2007, dtype: float64

In [9]:
# Separate the target column from the predictor columns.
target_column = "Donated in March 2007"
y = dataset[target_column]
X = dataset.drop(columns=target_column)

In [10]:
# Hold out 25% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=5,
)

In [11]:
# Class proportions in the training split (compare against the full dataset).
y_train.value_counts(normalize=True)

0    0.762923
1    0.237077
Name: Donated in March 2007, dtype: float64

In [12]:
# Class proportions in the test split (compare against the training split).
y_test.value_counts(normalize=True)

0    0.759358
1    0.240642
Name: Donated in March 2007, dtype: float64

In [13]:
# Configure the TPOT search: candidate pipelines are scored by ROC AUC, drawn
# from the 'TPOT light' configuration, and seeded for reproducibility.
tpot_settings = {
    'generations': 5,
    'population_size': 20,
    'verbosity': 2,
    'scoring': 'roc_auc',
    'random_state': 5,
    'disable_update_check': True,
    'config_dict': 'TPOT light',
}
tpot = TPOTClassifier(**tpot_settings)

In [14]:
# Run the pipeline search on the training split only; the test split stays unseen.
tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7430741559195733

Generation 2 - Current best internal CV score: 0.7430741559195733

Generation 3 - Current best internal CV score: 0.7430741559195733

Generation 4 - Current best internal CV score: 0.7430741559195733

Generation 5 - Current best internal CV score: 0.7482235434424217

Best pipeline: LogisticRegression(input_matrix, C=0.01, dual=False, penalty=l2)


TPOTClassifier(config_dict='TPOT light', disable_update_check=True,
               generations=5, population_size=20, random_state=5,
               scoring='roc_auc', verbosity=2)

In [15]:
# List the steps of the best pipeline TPOT found, numbered from 1.
pipeline_steps = tpot.fitted_pipeline_.steps
for idx, (_, transform) in enumerate(pipeline_steps, start=1):
    # Each step is a (name, estimator) pair; only the estimator is shown.
    print(f'{idx}. {transform}')

1. LogisticRegression(C=0.01, random_state=5)


In [16]:
# AUC of the TPOT pipeline on the test split, computed from the predicted
# probabilities in column 1 of predict_proba.
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.7563


In [17]:
# Per-feature variance of the training data, rounded to 3 decimals, to compare feature scales.
X_train.var().round(3)

Recency (months)              67.134
Frequency (times)             33.105
Monetary (c.c. blood)    2069039.263
Time (months)                605.886
dtype: float64

The variance of `Monetary (c.c. blood)` is several orders of magnitude larger than that of the other features. We therefore log-transform this column to bring it onto a comparable scale, which should help the model perform better.

# Log Normalization

In [18]:
# Work on independent copies so the original train/test features stay untouched.
X_train_n, X_test_n = X_train.copy(), X_test.copy()

In [19]:
def _log_transform_monetary(features):
    """Return a new frame with the monetary column replaced by its natural log.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature frame containing a "Monetary (c.c. blood)" column.

    Returns
    -------
    pandas.DataFrame
        A new frame without "Monetary (c.c. blood)" and with a trailing
        "Monetary log" column; ``features`` itself is not modified.
    """
    monetary_log = np.log(features["Monetary (c.c. blood)"])
    return features.drop(columns="Monetary (c.c. blood)").assign(
        **{"Monetary log": monetary_log}
    )


# Derive the normalized frames from the untouched X_train / X_test so this cell
# can be re-run safely. The previous in-place drop raised KeyError on a second
# run because the source column had already been removed.
X_train_n = _log_transform_monetary(X_train)
X_test_n = _log_transform_monetary(X_test)

In [20]:
# Re-check the per-feature variances now that the monetary column is log-transformed.
X_train_n.var()

Recency (months)      67.134040
Frequency (times)     33.104628
Time (months)        605.886064
Monetary log           0.856381
dtype: float64

# Training the model 
Using TPOT, we found that logistic regression is the best-performing model for this dataset.

In [21]:
# Fit a logistic regression (liblinear solver, fixed seed) on the
# log-normalized training features.
logreg = LogisticRegression(
    random_state=5,
    solver='liblinear',
)
logreg.fit(X_train_n, y_train)

LogisticRegression(random_state=5, solver='liblinear')

In [22]:
# Keep column 1 of the predicted probabilities for the normalized test features.
logreg_test_proba = logreg.predict_proba(X_test_n)
pred = logreg_test_proba[:, 1]

In [23]:
# AUC score for the logistic regression model trained on the log-normalized features
logreg_auc_score = roc_auc_score(y_true=y_test, y_score=pred)
logreg_auc_score

0.761737089201878

# Conclusion

In [24]:
# Rank the two models by test AUC, best first.
model_scores = [('tpot', tpot_auc_score), ('logreg', logreg_auc_score)]
sorted(model_scores, key=itemgetter(1), reverse=True)

[('logreg', 0.761737089201878), ('tpot', 0.7562597809076682)]

<i>From the above results we can conclude that log-normalizing Monetary (c.c. blood) increased the test AUC score by about 0.005 (from 0.7563 to 0.7617), i.e. roughly 0.5 percentage points.</i>