## This notebook is divided into two major sections: modelling without and with under-sampling of the majority class.
### Key steps
- Dummy variable creation for categorical variables
- Logistic modelling and tuning
- Random forest modelling and tuning
- Gradient boosting modelling and tuning
- Neural net modelling and tuning
- Evaluation on test set 

In [257]:
import pandas as pd
import numpy as np
import pickle as pkl
import keras
from keras.layers import merge, Input,InputLayer
from keras.engine import InputLayer
from keras.layers import Dense, Activation
from keras.layers.core import Activation, Dense, Flatten
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

In [258]:
# Load the labelled training set; the path is relative to the notebook's working directory.
data=pd.read_csv('./Training Data.csv')

In [259]:
data.dtypes

patient_id              int64
Age                     int64
Gender                 object
Prescription_period     int64
Diabetes                int64
Alcoholism              int64
HyperTension            int64
Smokes                  int64
Tuberculosis            int64
Sms_Reminder            int64
Adherence              object
dtype: object

In [260]:
# Class balance of the target: count of each label divided by the total number of rows.
data.Adherence.value_counts()/len(data.Adherence)

No     0.698189
Yes    0.301811
Name: Adherence, dtype: float64

In [261]:
data.head()

Unnamed: 0,patient_id,Age,Gender,Prescription_period,Diabetes,Alcoholism,HyperTension,Smokes,Tuberculosis,Sms_Reminder,Adherence
0,1,19,M,7,0,0,0,0,0,0,No
1,2,24,F,59,0,0,0,0,0,0,No
2,3,4,F,43,0,0,0,0,0,0,No
3,4,38,M,66,0,0,0,0,0,1,No
4,5,46,F,98,0,0,0,0,0,1,No


In [262]:
data.describe()

Unnamed: 0,patient_id,Age,Prescription_period,Diabetes,Alcoholism,HyperTension,Smokes,Tuberculosis,Sms_Reminder
count,180212.0,180212.0,180212.0,180212.0,180212.0,180212.0,180212.0,180212.0,180212.0
mean,90106.5,37.795363,54.668485,0.078524,0.025043,0.216512,0.052566,0.000338,0.573968
std,52022.867693,22.852072,35.752491,0.268995,0.156255,0.411868,0.223166,0.018395,0.499824
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,45053.75,19.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,90106.5,38.0,51.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,135159.25,56.0,86.0,0.0,0.0,0.0,0.0,0.0,1.0
max,180212.0,113.0,120.0,1.0,1.0,1.0,1.0,1.0,2.0


In [263]:
# Split every column except the identifier into two groups by cardinality:
# fewer than 5 distinct values -> treated as categorical (char_col),
# otherwise -> treated as numeric (num_col). Column order is preserved.
candidate_cols = data.columns.drop('patient_id')
level_counts = {name: data[name].nunique() for name in candidate_cols}
char_col = [name for name, n_levels in level_counts.items() if n_levels < 5]
num_col = [name for name, n_levels in level_counts.items() if not n_levels < 5]

In [264]:
data.columns

Index(['patient_id', 'Age', 'Gender', 'Prescription_period', 'Diabetes',
       'Alcoholism', 'HyperTension', 'Smokes', 'Tuberculosis', 'Sms_Reminder',
       'Adherence'],
      dtype='object')

In [265]:
char_col

['Gender',
 'Diabetes',
 'Alcoholism',
 'HyperTension',
 'Smokes',
 'Tuberculosis',
 'Sms_Reminder',
 'Adherence']

In [266]:
# Keep the target out of the list of columns to one-hot encode.
# The list is rebuilt rather than mutated with list.remove(): remove() raises
# ValueError once 'Adherence' is already gone, which breaks re-running this cell.
char_col=[name for name in char_col if name!='Adherence']

In [267]:
# One-hot encode every low-cardinality column listed in char_col.
# NOTE(review): each binary flag becomes two complementary columns (e.g. Diabetes_0 / Diabetes_1);
# drop_first=True would remove that redundancy but changes the feature set used by every model below.
data=pd.get_dummies(data,columns=char_col)

In [268]:
# Encode the target as integers: 'No' -> 0, 'Yes' -> 1.
adherence_codes = {'No': 0, 'Yes': 1}
data['Adherence'] = data['Adherence'].map(adherence_codes)

In [269]:
data.Adherence.value_counts()

0    125822
1     54390
Name: Adherence, dtype: int64

In [270]:
# final_data is a second name for the same DataFrame object as data (no copy is made here).
final_data=data

In [271]:
final_data.columns

Index(['patient_id', 'Age', 'Prescription_period', 'Adherence', 'Gender_F',
       'Gender_M', 'Diabetes_0', 'Diabetes_1', 'Alcoholism_0', 'Alcoholism_1',
       'HyperTension_0', 'HyperTension_1', 'Smokes_0', 'Smokes_1',
       'Tuberculosis_0', 'Tuberculosis_1', 'Sms_Reminder_0', 'Sms_Reminder_1',
       'Sms_Reminder_2'],
      dtype='object')

In [272]:

# Features are every column except the identifier and the target; 30% of rows
# are held out for testing with a fixed seed.
feature_frame = final_data.drop(columns=['patient_id', 'Adherence'])
target_series = final_data.Adherence
X_train_, X_test_, y_train_, y_test_ = train_test_split(
    feature_frame,
    target_series,
    test_size=0.3,
    random_state=101,
)

In [273]:
# Baseline logistic regression with the library's default hyper-parameters,
# fitted on the (imbalanced) training split.
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train_, y_train_)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## train set result 

In [274]:
logisticRegr.score(X_train_, y_train_)

0.8810127786409614

In [275]:
# Keep only column 1 of predict_proba (probability of class 1) for each test row,
# wrapped in a Series with a fresh 0..n-1 index.
class_probabilities = logisticRegr.predict_proba(X_test_)
preds = pd.Series(class_probabilities[:, 1])

##  test set result

In [276]:
# Precision/recall with a custom 0.45 cut-off on the positive-class probability.
thresholded_labels = pd.Series(np.where(preds > 0.45, 1, 0))
logistic_report = classification_report(y_test_, thresholded_labels)
print(logistic_report)

# Plain accuracy from the model's own predict(), shown as the cell result.
logisticRegr.score(X_test_, y_test_)

              precision    recall  f1-score   support

           0       0.94      0.89      0.91     37736
           1       0.77      0.87      0.81     16328

   micro avg       0.88      0.88      0.88     54064
   macro avg       0.85      0.88      0.86     54064
weighted avg       0.89      0.88      0.88     54064



0.8784218703758508

## Gradient boosting classifier

In [277]:
# Gradient boosting with learning rate 0.1; verbose=True prints the per-iteration training loss.
gbc=GradientBoostingClassifier(learning_rate=0.1,verbose=True)

gbc.fit(X_train_,y_train_)
# Training-set accuracy, shown as the cell result.
gbc.score(X_train_,y_train_)

      Iter       Train Loss   Remaining Time 
         1           1.1117           13.71s
         2           1.0236           13.93s
         3           0.9528           13.07s
         4           0.8949           12.86s
         5           0.8470           12.74s
         6           0.8071           12.93s
         7           0.7736           12.84s
         8           0.7453           12.91s
         9           0.7215           12.78s
        10           0.7013           12.53s
        20           0.6106           10.81s
        30           0.5937            9.20s
        40           0.5907            7.74s
        50           0.5899            6.31s
        60           0.5895            4.93s
        70           0.5892            3.61s
        80           0.5889            2.39s
        90           0.5886            1.19s
       100           0.5884            0.00s


0.8954323493039922

##  test set result

In [278]:
gbc.score(X_test_,y_test_)

0.8923128144421426

## Precision, Recall and F1 Score

In [279]:
# Per-class precision / recall / F1 of the gradient-boosting model on the test split.
gbc_test_labels = gbc.predict(X_test_)
print(classification_report(y_test_, gbc_test_labels))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92     37736
           1       0.78      0.89      0.83     16328

   micro avg       0.89      0.89      0.89     54064
   macro avg       0.87      0.89      0.88     54064
weighted avg       0.90      0.89      0.89     54064



In [280]:
X_train_.shape

(126148, 17)

In [281]:
# Pair each feature name with its gradient-boosting importance.
gbc_importances = gbc.feature_importances_
var_imp = {name: weight for name, weight in zip(X_train_.columns, gbc_importances)}

# Rank high-to-low; negating the key keeps tied features in column order,
# the same ordering a stable sort with reverse=True produces.
var_imp_sorted = sorted(var_imp.items(), key=lambda pair: -pair[1])

In [282]:
# Show the six highest-ranked features with their importance to four decimals.
report_line = 'variable name is {} and importance is {:,.4f}'
for feature_name, importance in var_imp_sorted[:6]:
    print(report_line.format(feature_name, importance))

variable name is Prescription_period and importance is 0.9913
variable name is Age and importance is 0.0075
variable name is Smokes_0 and importance is 0.0003
variable name is Alcoholism_1 and importance is 0.0002
variable name is Alcoholism_0 and importance is 0.0002
variable name is Smokes_1 and importance is 0.0001


## random forest classifier 

In [283]:
from sklearn.ensemble import RandomForestClassifier
# 100-tree random forest; class_weight='balanced' is the forest's own handling of the
# class imbalance, and n_jobs=3 runs fitting/prediction on three workers.
# NOTE(review): no random_state is set, so scores vary between runs.
rfc=RandomForestClassifier(n_estimators=100,verbose=True,class_weight='balanced',n_jobs=3)

rfc.fit(X_train_,y_train_)

# Training-set accuracy, shown as the cell result.
rfc.score(X_train_,y_train_)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    3.5s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    7.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    1.6s finished


0.9253416621745886

## test set result

In [284]:
rfc.score(X_test_,y_test_)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.7s finished


0.8649748446285883

In [285]:
# Map feature name -> random-forest importance, then rank descending.
rfc_importances = rfc.feature_importances_
var_imp = dict(zip(X_train_.columns, rfc_importances))
# Ascending sort on the negated score gives the same order as reverse=True, ties included.
var_imp_sorted = sorted(var_imp.items(), key=lambda entry: -entry[1])

In [286]:
# Print the six most important random-forest features.
top_six = var_imp_sorted[:6]
for entry in top_six:
    print('variable name is {} and importance is {:,.4f}'.format(*entry))

variable name is Prescription_period and importance is 0.8717
variable name is Age and importance is 0.1182
variable name is HyperTension_0 and importance is 0.0018
variable name is HyperTension_1 and importance is 0.0011
variable name is Sms_Reminder_1 and importance is 0.0009
variable name is Sms_Reminder_0 and importance is 0.0009


## Precision, Recall and F1 Score

In [287]:
# Per-class precision / recall / F1 of the random forest on the test split.
rfc_test_labels = rfc.predict(X_test_)
print(classification_report(y_test_, rfc_test_labels))

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.3s


              precision    recall  f1-score   support

           0       0.92      0.88      0.90     37736
           1       0.75      0.83      0.79     16328

   micro avg       0.86      0.86      0.86     54064
   macro avg       0.84      0.86      0.84     54064
weighted avg       0.87      0.86      0.87     54064



[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.7s finished


## MLP based classifier

In [288]:

# One-hot encode the 0/1 target into two columns to match the 2-unit softmax output layer.
y_main_encode = np_utils.to_categorical(final_data.Adherence, num_classes=2)

In [289]:
# Same features, test fraction and seed as the sklearn split, but with the one-hot target.
mlp_inputs = final_data.drop(columns=['patient_id', 'Adherence'])
X_train, X_test, y_train, y_test = train_test_split(
    mlp_inputs,
    y_main_encode,
    test_size=0.3,
    random_state=101,
)

In [290]:
# Fully connected classifier: four ReLU hidden layers widening 32 -> 256,
# followed by a 2-unit softmax over the classes.
n_features = X_train.shape[1]
model_ = Sequential([
    Dense(32, input_dim=n_features, activation='relu'),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(2, activation='softmax'),
])
adam = keras.optimizers.Adam(lr=0.01)
model_.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
model_.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 32)                576       
_________________________________________________________________
dense_12 (Dense)             (None, 64)                2112      
_________________________________________________________________
dense_13 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_14 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_15 (Dense)             (None, 2)                 514       
Total params: 44,546
Trainable params: 44,546
Non-trainable params: 0
_________________________________________________________________


In [291]:
# Train for 20 epochs in mini-batches of 128; 10% of the training rows are held out for validation.
model_.fit(x=X_train, y=y_train, epochs=20, batch_size=128,validation_split=0.1)

Train on 113533 samples, validate on 12615 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8a2e40ad68>

In [292]:
# Score the hold-out set once and reuse the probabilities for both the
# arg-max labels and the displayed array (the original called predict twice).
mlp_probs=model_.predict(X_test)
y_pred=np.argmax(mlp_probs,axis=-1)
# y_test starts out one-hot encoded. Collapse it to class indices only while it
# is still 2-D, so re-running this cell does not reduce it further to a scalar.
if np.ndim(y_test)>1:
    y_test=np.argmax(y_test,axis=-1)
mlp_probs

array([[0.96193963, 0.03806042],
       [0.96193963, 0.03806042],
       [0.96193963, 0.03806042],
       ...,
       [0.96193963, 0.03806042],
       [0.96193963, 0.03806042],
       [0.96193963, 0.03806042]], dtype=float32)

## Precision, Recall and F1 Score

In [293]:
# Classify as positive only when the class-1 probability exceeds 0.72, then report per-class metrics.
positive_prob = model_.predict(X_test)[:, 1]
thresholded = pd.Series(np.where(positive_prob > 0.72, 1, 0))
print(classification_report(y_test, thresholded))

              precision    recall  f1-score   support

           0       0.89      0.92      0.91     37736
           1       0.80      0.74      0.77     16328

   micro avg       0.87      0.87      0.87     54064
   macro avg       0.85      0.83      0.84     54064
weighted avg       0.86      0.87      0.86     54064



'

## Results with under-sampling the majority class

In [294]:
# Under-sample the majority class (Adherence == 0) to 55,000 rows and keep every
# minority (Adherence == 1) row, then stack the two pieces.
# random_state pins the draw so the balanced frame, and the model pickled from it,
# can be reproduced; without it every run samples different rows.
# NOTE(review): sampling happens before the train/test split, so the test split taken
# from this frame is balanced too and does not reflect the original class ratio.
final_data_0=final_data.loc[final_data.Adherence==0,:].sample(n=55000,random_state=101)
final_data_1=final_data.loc[final_data.Adherence==1,:]
final_data=pd.concat(objs=[final_data_0,final_data_1],axis=0)

## train test split

In [295]:
# Re-split after under-sampling: final_data now refers to the balanced frame, so both the
# training and the test portions are drawn from it. This overwrites the earlier X_train_/X_test_/y_train_/y_test_.
X_train_, X_test_, y_train_, y_test_ = train_test_split(final_data.drop(labels=['patient_id','Adherence'],axis=1,inplace=False), final_data.Adherence, test_size=0.3, random_state=101)

## Logistic regression modelling

In [296]:
# Refit logistic regression (library defaults) on the under-sampled training split.
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train_, y_train_)
# Training accuracy. It is printed explicitly because a bare expression that is
# not the last statement of a cell is evaluated and then silently discarded.
print(logisticRegr.score(X_train_, y_train_))

# Probability of class 1 (column 1 of predict_proba) for each test row.
preds=pd.Series(logisticRegr.predict_proba(X_test_)[:,1])



##  test set result

In [297]:
# Apply the 0.45 probability cut-off and report per-class metrics on the test split.
is_positive = preds > 0.45
cutoff_labels = pd.Series(np.where(is_positive, 1, 0))
print(classification_report(y_test_, cutoff_labels))

# Accuracy from the model's own predict(), shown as the cell result.
logisticRegr.score(X_test_, y_test_)

              precision    recall  f1-score   support

           0       0.91      0.77      0.83     16471
           1       0.80      0.92      0.85     16346

   micro avg       0.84      0.84      0.84     32817
   macro avg       0.85      0.84      0.84     32817
weighted avg       0.85      0.84      0.84     32817



0.8586403388487674

## Gradient boosting classifier

In [298]:

gbc=GradientBoostingClassifier(learning_rate=0.1,verbose=True)

In [299]:
# Fit on the under-sampled training split; this is the model that is pickled further down.
gbc.fit(X_train_,y_train_)
# Training-set accuracy, shown as the cell result.
gbc.score(X_train_,y_train_)

      Iter       Train Loss   Remaining Time 
         1           1.2660            6.75s
         2           1.1678            6.59s
         3           1.0868            6.48s
         4           1.0194            6.72s
         5           0.9629            6.56s
         6           0.9153            6.43s
         7           0.8751            6.32s
         8           0.8411            6.37s
         9           0.8122            6.25s
        10           0.7877            6.15s
        20           0.6773            5.53s
        30           0.6571            4.90s
        40           0.6534            4.18s
        50           0.6524            3.49s
        60           0.6518            2.70s
        70           0.6512            2.01s
        80           0.6508            1.30s
        90           0.6504            0.63s
       100           0.6500            0.00s


0.8961905632533661

In [300]:
gbc.score(X_test_,y_test_)

0.8979797056403693

##  test set result

In [302]:
# Per-class metrics of the gradient-boosting model on the under-sampled test split.
gbc_labels_under = gbc.predict(X_test_)
gbc_report_under = classification_report(y_test_, gbc_labels_under)
print(gbc_report_under)

              precision    recall  f1-score   support

           0       0.90      0.89      0.90     16471
           1       0.89      0.91      0.90     16346

   micro avg       0.90      0.90      0.90     32817
   macro avg       0.90      0.90      0.90     32817
weighted avg       0.90      0.90      0.90     32817



## Saving GBC model as pickle file for test prediction.

In [301]:
# Serialize the fitted gradient-boosting model to disk (binary mode is required by pickle).
with open('./gbc_model.pickle','wb') as f:
    pkl.dump(gbc,f)

## random forest classifier 

In [65]:
from sklearn.ensemble import RandomForestClassifier
# Same 100-tree forest configuration as before, now fitted on the under-sampled training split.
# NOTE(review): no random_state is set, so scores vary between runs.
rfc=RandomForestClassifier(n_estimators=100,verbose=True,class_weight='balanced',n_jobs=3)

rfc.fit(X_train_,y_train_)

# Training-set accuracy, shown as the cell result.
rfc.score(X_train_,y_train_)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    1.7s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    3.7s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.5s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    1.1s finished


0.9365833910124979

## test set result

In [66]:
rfc.score(X_test_,y_test_)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.3s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.5s finished


0.8658317335527318

## Precision, Recall and F1 Score

In [67]:
# Per-class metrics of the random forest on the under-sampled test split.
rfc_labels_under = rfc.predict(X_test_)
print(classification_report(y_test_, rfc_labels_under))

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.2s


              precision    recall  f1-score   support

           0       0.88      0.85      0.86     16471
           1       0.86      0.88      0.87     16346

   micro avg       0.87      0.87      0.87     32817
   macro avg       0.87      0.87      0.87     32817
weighted avg       0.87      0.87      0.87     32817



[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.4s finished


## MLP based classifier

In [68]:

# One-hot encode the under-sampled target for the softmax output, then split
# features and encoded target 70/30 with a fixed seed.
mlp_features = final_data.drop(columns=['patient_id', 'Adherence'])
y_main_encode = np_utils.to_categorical(final_data.Adherence, num_classes=2)

X_train, X_test, y_train, y_test = train_test_split(
    mlp_features,
    y_main_encode,
    test_size=0.3,
    random_state=101,
)

In [70]:
# Rebuild the same MLP from scratch: a 32-unit input layer, hidden layers of
# 64, 128 and 256 ReLU units, and a 2-unit softmax output.
model_ = Sequential()
model_.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
for hidden_units in (64, 128, 256):
    model_.add(Dense(hidden_units, activation='relu'))
model_.add(Dense(2, activation='softmax'))

adam = keras.optimizers.Adam(lr=0.01)

model_.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
model_.summary()

In [73]:
# Train on the under-sampled split: 20 epochs, batches of 128, 10% of training rows held out for validation.
model_.fit(x=X_train, y=y_train, epochs=20, batch_size=128,validation_split=0.1)

Train on 68915 samples, validate on 7658 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8a6c440ac8>

In [74]:
# Run the network on the hold-out rows once; the same array feeds the arg-max
# labels and the cell's displayed result (the original called predict twice).
mlp_probs=model_.predict(X_test)
y_pred=np.argmax(mlp_probs,axis=-1)

# y_test is one-hot at this point. Convert it to class indices only while it is
# still 2-D, so executing this cell again cannot collapse it to a single scalar.
if np.ndim(y_test)>1:
    y_test=np.argmax(y_test,axis=-1)

mlp_probs

## Precision, Recall and F1 Score

In [77]:
from sklearn.metrics import classification_report
# y_pred already holds arg-max class indices, so it goes straight into the report.
mlp_report = classification_report(y_test, y_pred)
print(mlp_report)

              precision    recall  f1-score   support

           0       0.91      0.88      0.89     16471
           1       0.88      0.91      0.89     16346

   micro avg       0.89      0.89      0.89     32817
   macro avg       0.89      0.89      0.89     32817
weighted avg       0.89      0.89      0.89     32817



## Test set predictions

In [303]:
# Load the unlabelled test set (same columns as the training file minus Adherence).
test_data=pd.read_csv('./Test Data.csv')

In [304]:
test_data.head()

Unnamed: 0,patient_id,Age,Gender,Prescription_period,Diabetes,Alcoholism,HyperTension,Smokes,Tuberculosis,Sms_Reminder
0,1,5,M,28,0,0,0,0,0,1
1,2,62,F,9,1,0,1,0,0,0
2,3,4,F,73,0,0,0,0,0,1
3,4,33,M,117,0,0,0,0,0,0
4,5,38,M,8,0,0,0,0,0,1


In [305]:
# One-hot encode the same columns that were encoded for training.
# NOTE(review): the encoding is derived from the test file alone, so the resulting columns match
# the training features only if every category level seen in training also occurs here.
test_data=pd.get_dummies(test_data,columns=char_col)

## loading saved model as pickle file.

In [306]:
# Reload the gradient-boosting model saved earlier in this notebook.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles you created or trust.
with open('./gbc_model.pickle','rb') as f:
    gbc_test=pkl.load(f)

In [307]:
# Align the encoded test frame with the feature columns the model was trained on.
# get_dummies on the test file alone can omit an indicator column (a level absent from
# the test data) or emit extra ones; the model needs the training layout exactly.
# Missing indicators are filled with 0 and columns unseen in training are dropped.
test_features=test_data.drop('patient_id',axis=1).reindex(columns=X_train_.columns,fill_value=0)
# Per-row class probabilities: column 0 -> class 0, column 1 -> class 1.
test_probs=gbc_test.predict_proba(test_features)

In [308]:
# Predicted class per row = index of the larger probability along the last axis.
adherence_test = test_probs.argmax(axis=-1)

In [309]:
# Column 1 of the probability matrix: predicted probability of class 1 (Adherence encoded as 'Yes' -> 1).
prob_adherence=test_probs[:,1]

In [310]:
# Empty results frame indexed by patient_id; prediction columns are attached in the next cells.
results_test=pd.DataFrame(index=test_data.patient_id)

In [311]:
# Store the predicted class as a 'Yes'/'No' label (1 -> 'Yes', 0 -> 'No'), in row order.
label_for_class = {1: 'Yes', 0: 'No'}
results_test['adherence_test'] = [label_for_class[cls] for cls in adherence_test]

In [312]:
results_test['prob_adherence']=prob_adherence

In [313]:
# Write the predictions to CSV; the patient_id index is written as the first column.
results_test.to_csv('./test_results.csv')

In [314]:
results_test.head()

Unnamed: 0_level_0,adherence_test,prob_adherence
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Yes,0.90882
2,Yes,0.864512
3,No,0.112822
4,No,0.136641
5,Yes,0.900378
