# Ngoc Ha
# CSCI 547
## Project: predict blood donation

## 1. Exploring the data

## 1.1. Import

In [1]:
# pandas supplies the DataFrame used throughout this notebook
import pandas as pd

# Load the blood-transfusion records from the CSV file
transfusion = pd.read_csv(filepath_or_buffer='datasets/transfusion.csv')

# Preview the first five rows of the dataset
transfusion.head(5)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


## 1.2. Summary of data to look for non-numeric data points

In [2]:
# Print a concise summary of the transfusion DataFrame (column dtypes and
# non-null counts) to check for non-numeric or missing values
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
Recency (months)                              748 non-null int64
Frequency (times)                             748 non-null int64
Monetary (c.c. blood)                         748 non-null int64
Time (months)                                 748 non-null int64
whether he/she donated blood in March 2007    748 non-null int64
dtypes: int64(5)
memory usage: 29.3 KB


## 1.3. Rename target column

In [3]:
# Rename the long label column to 'target' for brevity.
# Reassign instead of using inplace=True so the cell produces a new frame
# and stays safe to re-run.
transfusion = transfusion.rename(
    columns={'whether he/she donated blood in March 2007': 'target'}
)


# Print out the first rows (head() shows 5 by default) to confirm the rename
transfusion.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


## 1.4. Proportions of target

In [4]:
# Show the share of each class in the target column (unrounded proportions)
transfusion['target'].value_counts(normalize=True)

0    0.762032
1    0.237968
Name: target, dtype: float64

## 2. Preprocessing

### 2.1 Splitting data into train and test

In [5]:
# train_test_split performs the (stratified) hold-out split
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. Stratifying on `target` keeps the
# class proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    transfusion.drop('target', axis=1),
    transfusion['target'],
    stratify=transfusion['target'],
    test_size=0.25,
    random_state=1,
)

# Preview the first rows of the training features
X_train.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
132,2,2,500,10
294,11,5,1250,35
522,4,13,3250,39
291,16,12,3000,50
106,0,8,2000,59


### 2.2 Standardization

In [6]:
import warnings
from sklearn import preprocessing

# Suppress all Python warnings from this point on (global filter).
# NOTE(review): this hides every warning category, including any raised by
# later cells - consider narrowing it to the specific warning being avoided.
warnings.simplefilter('ignore')

# Learn the mean and standard deviation from the training split only, then
# apply that same transformation to the test split.
std_scale = preprocessing.StandardScaler()
X_train_std = std_scale.fit_transform(X_train)
X_test_std = std_scale.transform(X_test)

# Show the first two standardized rows of each split
print(X_train_std[:2])
print(X_test_std[:2])

[[-0.93525944 -0.60457944 -0.60457944 -0.98853267]
 [ 0.20617757 -0.08807832 -0.08807832  0.02693431]]
[[-0.8084331  -0.08807832 -0.08807832 -0.3386338 ]
 [ 0.84030924 -0.4324124  -0.4324124  -0.62296456]]


## 3. TPOT AutoML
<p><a href="https://github.com/EpistasisLab/tpot">TPOT</a> is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. TPOT will automatically explore hundreds of possible pipelines to find the best one for our training set. The outcome of this search will be a <a href="https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html">scikit-learn pipeline</a>.</p>

## 3.1 Parallel Genetic Algorithm (3 subprocesses) - 100 generations; population size: 100 

### (may take ~5 minutes to run)

In [40]:
# Import TPOTClassifier
from tpot import TPOTClassifier

# Configure the genetic search: 100 generations of 100 pipelines each,
# scored by ROC AUC, evaluated with 3 parallel jobs, and seeded for
# reproducibility.
tpot = TPOTClassifier(
    config_dict='TPOT light',
    generations=100,
    population_size=100,
    scoring='roc_auc',
    n_jobs=3,
    random_state=42,
    verbosity=2,
    disable_update_check=True,
)

# Run the search on the standardized training data
tpot.fit(X_train_std, y_train)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=10100, style=ProgressStyle(descri…

Generation 1 - Current best internal CV score: 0.7527359001640808
Generation 2 - Current best internal CV score: 0.7528776097996344
Generation 3 - Current best internal CV score: 0.7528776097996344
Generation 4 - Current best internal CV score: 0.7528776097996344
Generation 5 - Current best internal CV score: 0.7528776097996344
Generation 6 - Current best internal CV score: 0.753230441848773
Generation 7 - Current best internal CV score: 0.7543862561920017
Generation 8 - Current best internal CV score: 0.7543862561920017
Generation 9 - Current best internal CV score: 0.7581829129982344
Generation 10 - Current best internal CV score: 0.7581829129982344
Generation 11 - Current best internal CV score: 0.7581829129982344
Generation 12 - Current best internal CV score: 0.7581829129982344
Generation 13 - Current best internal CV score: 0.7581829129982344
Generation 14 - Current best internal CV score: 0.7598150681461215
Generation 15 - Current best internal CV score: 0.7598150681461215
Gener

TPOTClassifier(config_dict='TPOT light', crossover_rate=0.1, cv=5,
        disable_update_check=True, early_stop=None, generations=100,
        max_eval_time_mins=5, max_time_mins=None, memory=None,
        mutation_rate=0.9, n_jobs=3, offspring_size=None,
        periodic_checkpoint_folder=None, population_size=100,
        random_state=42, scoring='roc_auc', subsample=1.0,
        template='RandomTree', use_dask=False, verbosity=2,
        warm_start=False)

## 3.2 Pipeline found by TPOT

In [44]:
# List the steps of the best pipeline found by TPOT, numbered from 1
print('\nBest pipeline steps:')
for position, step in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # Each step is a (name, estimator) pair; only the estimator is shown
    estimator = step[1]
    print(f'{position}. {estimator}')


Best pipeline steps:
1. FeatureUnion(n_jobs=None,
       transformer_list=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('functiontransformer', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function copy at 0x00000207B7137D08>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprec...rgs=None,
          inverse_func=None, kw_args=None, pass_y='deprecated',
          validate=None))],
       transformer_weights=None)
2. MaxAbsScaler(copy=True)
3. StackingEstimator(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=7, min_samples_split=13,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))
4. LogisticRegression(C=0.1, class_weight=None, dual=True, fit_intercept=True,
          intercep

In [56]:
# Export the pipeline found by TPOT to the file 'tpot_pipeline.py'
tpot.export('tpot_pipeline.py')

## 3.3. TPOT model performance (AUC score)

In [55]:
# tpot.score uses the scoring configured on the classifier ('roc_auc'),
# so these values are AUC scores (higher is better), not losses.
print('AUC on training set', tpot.score(X_train_std, y_train))
print('AUC on test set', tpot.score(X_test_std, y_test))

Loss on training set 0.8272696929238986
Loss on test set 0.7557902973395931


## 4. Vanilla logistic regression

In [7]:
# Importing modules
from sklearn import linear_model
from sklearn.metrics import roc_auc_score
import numpy as np


# Instantiate LogisticRegression.
# C is the inverse regularization strength, so C=np.inf means effectively
# no penalty (a "vanilla" logistic regression).
logreg = linear_model.LogisticRegression(
    solver='lbfgs',
    random_state=42,
    C=np.inf
)

# Train the model
logreg.fit(X_train_std, y_train)

# AUC score for the logistic regression model, computed from the predicted
# probability of the positive class (column 1 of predict_proba).
# These are scores (higher is better), not losses.
print('AUC on training set', roc_auc_score(y_train, logreg.predict_proba(X_train_std)[:, 1]))
print('AUC on test set', roc_auc_score(y_test, logreg.predict_proba(X_test_std)[:, 1]))

Loss on training set 0.7475932822710983
Loss on test set 0.7778560250391237


## 5. Logistic regression with polynomial features and regularization

In [8]:
# Build degree-2 polynomial features (bias, linear, squared and interaction
# terms). Fit the transformer on the training split only and reuse the same
# fitted object for the test split, mirroring how the scaler is applied.
# A dedicated name for the transformer also avoids binding X_train_poly /
# X_test_poly first to a transformer and then to the transformed data.
poly_features = preprocessing.PolynomialFeatures(2)
X_train_poly = poly_features.fit_transform(X_train_std)
X_test_poly = poly_features.transform(X_test_std)

In [10]:
# Instantiate LogisticRegression with an L2 penalty (C left at its default)
logreg2 = linear_model.LogisticRegression(
    solver='liblinear',
    random_state=42,
    penalty='l2'
)

# Train the model on the polynomial features
logreg2.fit(X_train_poly, y_train)

# AUC score for the regularized polynomial logistic regression model,
# computed from the predicted probability of the positive class.
# These are scores (higher is better), not losses.
print('AUC on training set', roc_auc_score(y_train, logreg2.predict_proba(X_train_poly)[:, 1]))
print('AUC on test set', roc_auc_score(y_test, logreg2.predict_proba(X_test_poly)[:, 1]))

Loss on training set 0.7603471295060079
Loss on test set 0.7836463223787168


## 6. Conclusion

- On the test set, a "vanilla" logistic regression model (AUC 0.778) outperformed the pipeline produced by TPOT after 100 generations with a population size of 100 (AUC 0.756).

- The TPOT pipeline overfit the training data, as shown by the large gap between its training AUC (0.827) and its test AUC (0.756). TPOT offers several options that can help prevent overfitting; these will be studied in a future project.

- We improved the logistic model slightly (test AUC 0.778 → 0.784) by adding 2nd-order polynomial features and fitting the logistic regression model with L2 regularization.

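- Final AUC on the test set: 0.7836. For reference, always predicting 0 gives an accuracy of 0.7620, but accuracy and AUC are different metrics and are not directly comparable: a constant prediction has an AUC of 0.5, so the model's 0.7836 is well above that baseline.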