In [2]:
import pandas as pd
import numpy as np



In [3]:
df = pd.read_csv('heart_failure.csv')

In [4]:
df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [5]:
df.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [6]:
list(df.columns)

['age',
 'anaemia',
 'creatinine_phosphokinase',
 'diabetes',
 'ejection_fraction',
 'high_blood_pressure',
 'platelets',
 'serum_creatinine',
 'serum_sodium',
 'sex',
 'smoking',
 'time',
 'DEATH_EVENT']

In [7]:
# so the above model can be used to predict the probbility of someone dying of a heart attack
df.dtypes

age                         float64
anaemia                       int64
creatinine_phosphokinase      int64
diabetes                      int64
ejection_fraction             int64
high_blood_pressure           int64
platelets                   float64
serum_creatinine            float64
serum_sodium                  int64
sex                           int64
smoking                       int64
time                          int64
DEATH_EVENT                   int64
dtype: object

In [9]:
df.sex.nunique()

2

In [10]:
# so we have established here that the field sex has a binary attribute 
#this should indicate as either male or female
# so next is to convert it


In [11]:
df.DEATH_EVENT.nunique()

2

In [12]:
#also the death event field is also
# indicating it is aslo a binary attribute 
# 

In [13]:
df.anaemia.nunique()

2

In [14]:
#An ejection fraction (EF) is the volumetric fraction (or portion of the total) of fluid (usually blood) 
#ejected from a chamber (usually the heart) with each contraction (or heartbeat). 
#It can refer to the cardiac atrium, ventricle, gall bladder, or leg veins,
# although if unspecified it usually refers to the left ventricle of the heart. 
#EF is widely used as a measure of the pumping efficiency of the heart and is used to classify heart failure types. 
#It is also used as an indicator of the severity of heart failure, although it has recognized limitations.

In [15]:
df.ejection_fraction.max()

80

In [16]:
df.ejection_fraction.min()# be careful this is a percentage and the safest one is 55-70

14

In [17]:
df.platelets.min(),df.platelets.max()# but the best range is 150000 - 450000

(25100.0, 850000.0)

In [18]:
survival_rate={
    1:'high',
    0:'low'
}

In [19]:
df.DEATH_EVENT=df.DEATH_EVENT.map(survival_rate)

In [20]:
gender={
    1:'male',
    0:'female'
    
}

In [21]:
df.sex=df.sex.map(gender)

In [22]:
df.head(10)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,male,0,4,high
1,55.0,0,7861,0,38,0,263358.03,1.1,136,male,0,6,high
2,65.0,0,146,0,20,0,162000.0,1.3,129,male,1,7,high
3,50.0,1,111,0,20,0,210000.0,1.9,137,male,0,7,high
4,65.0,1,160,1,20,0,327000.0,2.7,116,female,0,8,high
5,90.0,1,47,0,40,1,204000.0,2.1,132,male,1,8,high
6,75.0,1,246,0,15,0,127000.0,1.2,137,male,0,10,high
7,60.0,1,315,1,60,0,454000.0,1.1,131,male,1,10,high
8,65.0,0,157,0,65,0,263358.03,1.5,138,female,0,10,high
9,80.0,1,123,0,35,1,388000.0,9.4,133,male,1,10,high


In [23]:
anaemic={
    0:'non_amaemic',
    1:'anaemic'
}

In [24]:
df.anaemia=df.anaemia.map(anaemic)

In [25]:
df.head(10)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,non_amaemic,582,0,20,1,265000.0,1.9,130,male,0,4,high
1,55.0,non_amaemic,7861,0,38,0,263358.03,1.1,136,male,0,6,high
2,65.0,non_amaemic,146,0,20,0,162000.0,1.3,129,male,1,7,high
3,50.0,anaemic,111,0,20,0,210000.0,1.9,137,male,0,7,high
4,65.0,anaemic,160,1,20,0,327000.0,2.7,116,female,0,8,high
5,90.0,anaemic,47,0,40,1,204000.0,2.1,132,male,1,8,high
6,75.0,anaemic,246,0,15,0,127000.0,1.2,137,male,0,10,high
7,60.0,anaemic,315,1,60,0,454000.0,1.1,131,male,1,10,high
8,65.0,non_amaemic,157,0,65,0,263358.03,1.5,138,female,0,10,high
9,80.0,anaemic,123,0,35,1,388000.0,9.4,133,male,1,10,high


In [26]:
smoker={
    0:'non_smoker',
    1:'smoker'
}

In [27]:
df.smoking=df.smoking.map(smoker)

In [28]:
df.head(10)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,non_amaemic,582,0,20,1,265000.0,1.9,130,male,non_smoker,4,high
1,55.0,non_amaemic,7861,0,38,0,263358.03,1.1,136,male,non_smoker,6,high
2,65.0,non_amaemic,146,0,20,0,162000.0,1.3,129,male,smoker,7,high
3,50.0,anaemic,111,0,20,0,210000.0,1.9,137,male,non_smoker,7,high
4,65.0,anaemic,160,1,20,0,327000.0,2.7,116,female,non_smoker,8,high
5,90.0,anaemic,47,0,40,1,204000.0,2.1,132,male,smoker,8,high
6,75.0,anaemic,246,0,15,0,127000.0,1.2,137,male,non_smoker,10,high
7,60.0,anaemic,315,1,60,0,454000.0,1.1,131,male,smoker,10,high
8,65.0,non_amaemic,157,0,65,0,263358.03,1.5,138,female,non_smoker,10,high
9,80.0,anaemic,123,0,35,1,388000.0,9.4,133,male,smoker,10,high


In [29]:
diabetic={
    0:'non_diabetic',
    1:'diabetic'
}

In [30]:
df.diabetes =df.diabetes.map(diabetic)

In [31]:
df.head(10)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,non_amaemic,582,non_diabetic,20,1,265000.0,1.9,130,male,non_smoker,4,high
1,55.0,non_amaemic,7861,non_diabetic,38,0,263358.03,1.1,136,male,non_smoker,6,high
2,65.0,non_amaemic,146,non_diabetic,20,0,162000.0,1.3,129,male,smoker,7,high
3,50.0,anaemic,111,non_diabetic,20,0,210000.0,1.9,137,male,non_smoker,7,high
4,65.0,anaemic,160,diabetic,20,0,327000.0,2.7,116,female,non_smoker,8,high
5,90.0,anaemic,47,non_diabetic,40,1,204000.0,2.1,132,male,smoker,8,high
6,75.0,anaemic,246,non_diabetic,15,0,127000.0,1.2,137,male,non_smoker,10,high
7,60.0,anaemic,315,diabetic,60,0,454000.0,1.1,131,male,smoker,10,high
8,65.0,non_amaemic,157,non_diabetic,65,0,263358.03,1.5,138,female,non_smoker,10,high
9,80.0,anaemic,123,non_diabetic,35,1,388000.0,9.4,133,male,smoker,10,high


In [32]:
blood_pressure={
    0:'normal_bp',
    1:'high_bp'
}


In [33]:
df.high_blood_pressure=df.high_blood_pressure.map(blood_pressure)

In [34]:
df.head(10)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,non_amaemic,582,non_diabetic,20,high_bp,265000.0,1.9,130,male,non_smoker,4,high
1,55.0,non_amaemic,7861,non_diabetic,38,normal_bp,263358.03,1.1,136,male,non_smoker,6,high
2,65.0,non_amaemic,146,non_diabetic,20,normal_bp,162000.0,1.3,129,male,smoker,7,high
3,50.0,anaemic,111,non_diabetic,20,normal_bp,210000.0,1.9,137,male,non_smoker,7,high
4,65.0,anaemic,160,diabetic,20,normal_bp,327000.0,2.7,116,female,non_smoker,8,high
5,90.0,anaemic,47,non_diabetic,40,high_bp,204000.0,2.1,132,male,smoker,8,high
6,75.0,anaemic,246,non_diabetic,15,normal_bp,127000.0,1.2,137,male,non_smoker,10,high
7,60.0,anaemic,315,diabetic,60,normal_bp,454000.0,1.1,131,male,smoker,10,high
8,65.0,non_amaemic,157,non_diabetic,65,normal_bp,263358.03,1.5,138,female,non_smoker,10,high
9,80.0,anaemic,123,non_diabetic,35,high_bp,388000.0,9.4,133,male,smoker,10,high


In [35]:
df.isnull().nunique()

age                         1
anaemia                     1
creatinine_phosphokinase    1
diabetes                    1
ejection_fraction           1
high_blood_pressure         1
platelets                   1
serum_creatinine            1
serum_sodium                1
sex                         1
smoking                     1
time                        1
DEATH_EVENT                 1
dtype: int64

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
df_full_train , df_test = train_test_split(df, test_size=0.2,random_state=1)
df_train,df_val = train_test_split(df_full_train , test_size=0.25,random_state=1)


In [38]:
len(df_train),len(df_test),len(df_val)

(179, 60, 60)

In [39]:
df_train=df_train.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)

In [40]:
y_train=(df_train.DEATH_EVENT=='high')
y_test=(df_test.DEATH_EVENT=='high')
y_val=(df_val.DEATH_EVENT=='high')

In [41]:
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [42]:
numerical= ['age','creatinine_phosphokinase','ejection_fraction','serum_creatinine','serum_sodium','time']
categorical=['anaemia','diabetes','high_blood_pressure','sex','smoking']


In [43]:
def predict(df,dv,model):
    dicts = df[categorical+numerical].to_dict(orient='records')
    x=dv.transform(dicts)
    y_pred=model.predict_proba(x)[:,1]
    
    return y_pred

In [44]:
df_train.DEATH_EVENT.values

array(['low', 'high', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'high',
       'low', 'low', 'low', 'high', 'low', 'high', 'high', 'high', 'high',
       'low', 'low', 'low', 'high', 'low', 'low', 'high', 'high', 'low',
       'low', 'low', 'high', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'high', 'low', 'high', 'low', 'low', 'low',
       'high', 'high', 'high', 'high', 'low', 'low', 'high', 'low', 'low',
       'high', 'high', 'high', 'low', 'high', 'low', 'high', 'low', 'low',
       'low', 'high', 'low', 'low', 'low', 'high', 'high', 'high', 'high',
       'low', 'low', 'high', 'low', 'high', 'high', 'low', 'low', 'high',
       'low', 'high', 'high', 'high', 'high', 'low', 'low', 'low', 'low',
       'low', 'low', 'low', 'low', 'high', 'low', 'low', 'low', 'low',
       'low', 'low', 'high', 'high', 'low', 'low', 'low', 'high', 'low',
       'low', 'low', 'high', 'low', 'low', 'low', 

In [45]:
def train(df_train,y_train,C=1.0):
    dicts=df_train[categorical+numerical].to_dict(orient='records')
    
    dv= DictVectorizer(sparse=False)
    x_train=dv.fit_transform(dicts)
    
    model=LogisticRegression(C=C, max_iter=1000)
    model.fit(x_train,y_train)
    
    return dv,model

In [46]:
C=1.0
n_splits=5

kfold=KFold(n_splits,shuffle=True,random_state=1)
scores=[]
fold=0
for train_idx,val_idx in kfold.split(df_full_train):
    df_train=df_full_train.iloc[train_idx]
    df_val=df_full_train.iloc[val_idx]
    df_train.DEATH_EVENT=(df_train.DEATH_EVENT=='high').astype(int)
    df_val.DEATH_EVENT=(df_val.DEATH_EVENT=='high').astype(int)
    y_train=df_train.DEATH_EVENT.values
    y_val=df_val.DEATH_EVENT.values
    
    
    dv,model=train(df_train,y_train,C=C)
    y_pred=predict(df_val,dv,model)
    
    auc =roc_auc_score(y_val,y_pred)
    scores.append(auc)
    
    print(f'auc on fold{fold} is {auc}')
    
    fold=fold+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


auc on fold0 is 0.8482142857142857
auc on fold1 is 0.8483516483516483


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
A value is trying to be set on a copy of a slice from a D

auc on fold2 is 0.8566243194192378
auc on fold3 is 0.8690702087286527
auc on fold4 is 0.8914027149321267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [47]:
print('validation results:')
print(' for C=%s the mean value is %.3f and the standard deviation is -/+ %.3f' %(C,np.mean(scores),np.std(scores)))

validation results:
 for C=1.0 the mean value is 0.863 and the standard deviation is -/+ 0.016


In [48]:
import pickle

In [49]:
output_file=f'model_C={C}.bin'
output_file


'model_C=1.0.bin'

In [50]:
with open(output_file,'wb') as f_out:
    pickle.dump((dv,model),f_out)
    
print(f'the model is saved to{output_file}')

the model is saved tomodel_C=1.0.bin


In [51]:
df.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [52]:
from sklearn.metrics import mutual_info_score


In [53]:
def mutual_death_event(series):
    return mutual_info_score(series, df_full_train.DEATH_EVENT) 


In [58]:
df_full_train[categorical+numerical].apply(mutual_death_event)



anaemia                     0.000822
diabetes                    0.000455
high_blood_pressure         0.001406
sex                         0.000039
smoking                     0.000091
age                         0.168972
creatinine_phosphokinase    0.457338
ejection_fraction           0.119779
serum_creatinine            0.209243
serum_sodium                0.115043
time                        0.565838
dtype: float64