# Import necessary libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Importing data

In [None]:
# Read the transfusion dataset into a DataFrame and preview its first five rows.
transfusion_csv = '../input/transfusion-dataset/transfusion.data'
data = pd.read_csv(transfusion_csv)
data.head(5)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


# Insight into the dataset

In [None]:
# Number of non-null entries in each column.
data.count()

Recency (months)                              748
Frequency (times)                             748
Monetary (c.c. blood)                         748
Time (months)                                 748
whether he/she donated blood in March 2007    748
dtype: int64

***Checking for null values or missing data (good news: we do not have any NaN values in the dataset)***

In [None]:
# Count missing values per column (sum over rows is the default axis).
data.isna().sum()

Recency (months)                              0
Frequency (times)                             0
Monetary (c.c. blood)                         0
Time (months)                                 0
whether he/she donated blood in March 2007    0
dtype: int64

***monetary needs scaling***

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


From the variance of the columns we can see that Monetary (c.c. blood) needs normalization, because its variance is far higher than that of the other features.
***Monetary needs scaling***
**NB: Normalization should be performed after the data has been split into training and test sets, with the transformation applied to each set separately. The test set represents new, previously unseen data and is not meant to be available during training, so normalizing after the split also reduces the risk of data leakage.**


In [None]:
# Per-column variance, rounded to three decimals for readability.
data.var().round(3)

Recency (months)                                   65.535
Frequency (times)                                  34.098
Monetary (c.c. blood)                         2131094.230
Time (months)                                     594.224
whether he/she donated blood in March 2007          0.182
dtype: float64

# Preprocessing

In [None]:
# Give the label column a short name that is easier to reference.
data = data.rename(
    columns={'whether he/she donated blood in March 2007': 'target'}
)

In [None]:
# Feature matrix: every column except the label.
X = data.drop('target', axis='columns')
X.head(5)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
0,2,50,12500,98
1,0,13,3250,28
2,1,16,4000,35
3,2,20,5000,45
4,1,24,6000,77


In [None]:
# Label vector, followed by its class distribution.
y = data.loc[:, 'target']
y.value_counts()

0    570
1    178
Name: target, dtype: int64

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [None]:
# Variance of each feature in the training split only, rounded to three decimals.
X_train.var().round(3)

Recency (months)              62.404
Frequency (times)             34.788
Monetary (c.c. blood)    2174231.140
Time (months)                612.595
dtype: float64

***Perform log normalization***

In [None]:

# Replace the high-variance monetary feature with its natural log.
# assign/drop return new frames, so X_train and X_test themselves stay untouched.
col_to_normalize = 'Monetary (c.c. blood)'
X_train_normed, X_test_normed = (
    split.assign(monetary_log=np.log(split[col_to_normalize]))
         .drop(columns=col_to_normalize)
    for split in (X_train, X_test)
)
X_train_normed.var()

Recency (months)      62.404453
Frequency (times)     34.787698
Time (months)        612.595194
monetary_log           0.856891
dtype: float64

# **Model**

Create an instance of each of the following classifiers:
*  Logistic regression
*  Support vector classification
*  Naive Bayes classification
*  Random forest classification

In [None]:
# Base classifiers that are evaluated individually and then combined in the ensemble.
# Every estimator that uses randomness gets a fixed seed so that Restart & Run All
# reproduces the reported scores: SVC(probability=True) shuffles the data for its
# internal probability calibration, and the random forest bootstraps samples and
# subsamples features when growing its trees.
lr = LogisticRegression(random_state=40)
svc = SVC(probability=True, random_state=40)
gnb = GaussianNB()
rfc = RandomForestClassifier(random_state=40)

# (name, estimator) pairs; this list is also passed to VotingClassifier as `estimators`.
classifiers = [
    ('LogisticRegression', lr),
    ('Supportvector', svc),
    ('GaussianNB', gnb),
    ('rfc', rfc),
]

Training and testing of each classifier independently 

***NB: Because the dataset has a class-imbalance problem, AUC is a better indicator of model quality than accuracy.***


In [None]:
# Fit every base classifier on the training split, then report its test-set
# accuracy and ROC AUC. AUC is computed from the predicted probability of the
# second class (column 1 of predict_proba).
for clf_name, clf in classifiers:
    clf.fit(X_train_normed, y_train)
    y_pred = clf.predict(X_test_normed)
    positive_proba = clf.predict_proba(X_test_normed)[:, 1]
    clf_auc_score = roc_auc_score(y_test, positive_proba)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'{clf_name:s} : {test_accuracy:.3f}')
    print(f'{clf_name:s} AUC score : {clf_auc_score:.3f}')
    print('\n')

LogisticRegression : 0.793
LogisticRegression AUC score : 0.755


Supportvector : 0.767
Supportvector AUC score : 0.722


GaussianNB : 0.747
GaussianNB AUC score : 0.700


rfc : 0.740
rfc AUC score : 0.710




# Create ensemble model (soft voting classifier)

In [None]:
# Soft-voting ensemble over the base classifiers: fit on the training split,
# then report test-set accuracy and ROC AUC.
vc_soft = VotingClassifier(estimators=classifiers, voting='soft').fit(X_train_normed, y_train)
y_pred_vc = vc_soft.predict(X_test_normed)
vc_accuracy = accuracy_score(y_test, y_pred_vc)
print(f'Voting Classifier accuracy: {vc_accuracy}')
vc_positive_proba = vc_soft.predict_proba(X_test_normed)[:, 1]
vc_auc_score = roc_auc_score(y_test, vc_positive_proba)
print(f'\n VotingClassifier AUC score: {vc_auc_score:.3f}')


Voting Classifier accuracy: 0.78

 VotingClassifier AUC score: 0.750


In [None]:
# Per-class precision / recall / F1 for the soft-voting ensemble.
# Report on y_pred_vc (the ensemble's predictions): y_pred only holds the
# predictions of whichever classifier the per-model loop fitted last.
print(classification_report(y_test, y_pred_vc))

              precision    recall  f1-score   support

           0       0.80      0.89      0.84       114
           1       0.43      0.28      0.34        36

    accuracy                           0.74       150
   macro avg       0.62      0.58      0.59       150
weighted avg       0.71      0.74      0.72       150



# Future work

Use grid search (GridSearchCV) and randomized search (RandomizedSearchCV) with the ensemble model to see whether its performance can be tuned further.