In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The project is about predicting heart failure from existing data. 

In [2]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import seaborn as sb


In [3]:
# Read the clinical-records CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv(
    filepath_or_buffer="../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
)

In [4]:
# Preview the first five rows (five is the default row count for head()).
data.head()

In [5]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
data.info()

In [6]:
# Check for null values and duplicated rows.
# A notebook cell renders only its final expression, so both checks are printed
# explicitly; as bare expressions the null-value result would never be shown.
print("Columns containing null values:")
print(data.isnull().any())
print("Any duplicated rows:", data.duplicated().any())

To get a better idea of what type of data is in each column, we can call the `unique` function on the columns.

In [7]:
# Spot-check two columns before listing every column's values.
# The "ranges from" label is paired with the actual minimum and maximum age;
# the distinct values of each column are printed once, under their own labels.
print("Age ranges from:", data['age'].min(), "to", data['age'].max())
print("Age values are:", data['age'].unique())
print("sex values are:", data['sex'].unique())

In [8]:
# Collect every column name, then print the distinct values found in each column.
column_list = list(data.columns)
for column_name in column_list:
    distinct_values = data[column_name].unique()
    print(column_name, distinct_values)

In [9]:
# Display the list of column names built in the previous cell.
column_list

In [10]:
# Visualise how often each value occurs in the selected columns.
# Each column is paired with its colour palette and drawn exactly once, in
# row-major panel order; the 4x2 grid therefore has one spare panel, which is hidden.
count_columns = [
    ("smoking", "mako"),
    ("anaemia", "crest"),
    ("ejection_fraction", "mako"),
    ("high_blood_pressure", "crest"),
    ("sex", "crest"),
    ("diabetes", "crest"),
    ("DEATH_EVENT", "crest"),
]
f, x = plt.subplots(4, 2, figsize=(15, 10))
panels = x.flatten()  # row-major: [0,0], [0,1], [1,0], ...
for panel, (column, palette) in zip(panels, count_columns):
    # Naming the column is enough: seaborn looks it up in `data`.
    sb.countplot(x=column, data=data, palette=palette, ax=panel)
    panel.set_title(column)
# Switch off any panel that received no plot.
for panel in panels[len(count_columns):]:
    panel.set_axis_off()
# Keep titles and axis labels of neighbouring panels from overlapping.
f.tight_layout()









In [11]:
# Distribution plots: a kernel density estimate for platelets and histograms
# for age and serum creatinine. Columns are referenced by name within `data`.
sb.displot(data=data, x="platelets", kind='kde', color='c')
sb.displot(data=data, x="age", kind='hist', color='c')
sb.displot(data=data, x="serum_creatinine", kind='hist', color='c')

**We see that there are no non-numerical values in these columns, therefore we don't need to perform encoding for any categorical variables.**

In [12]:
# Define the features and the prediction target, both selected by column position.
# Features: the first 11 columns (positions 0-10).
X = data.iloc[:, :11]
# Target: the column at position 12, as a NumPy array. Presumably this is
# DEATH_EVENT, the heart-failure outcome -- confirm against the column order.
# NOTE(review): the column at position 11 ends up in neither X nor Y.
Y = data.iloc[:, 12].to_numpy()


In [13]:
# Split the dataset into training and testing sets with an 80:20 ratio
# (test_size=0.2 holds out 20% of the rows for testing).
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train,Y_test =train_test_split(X, Y, test_size=0.2,random_state=0)
# random_state seeds the internal random number generator that decides which rows
# go to the training and testing sets, so the split is the same on every run.


 Gradient Boosting model



In [14]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score

# Fit a gradient-boosting classifier on the training split, predict the
# held-out test split and report the resulting accuracy.
gb_model = GradientBoostingClassifier(
    n_estimators=600,
    learning_rate=0.01,
    random_state=0,
).fit(X_train, Y_train)  # fit() returns the fitted estimator itself

gb_preds = gb_model.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=gb_preds)
print(accuracy)

OK, so now we have an accuracy of around 73%. We will try to improve it: we can look at the feature importances to see which features are the most significant, and then train the model based on those features.

In [15]:
# Rank the features by the importance the fitted gradient-boosting model gives them.
# Feature names are read from X_train, the frame the model was fitted on, rather
# than from a separately hard-coded column slice of `data`, so names and
# importances cannot drift apart if the feature selection changes.
importances = (
    pd.DataFrame({
        'feature': X_train.columns,
        # Rounded to three decimals for readability.
        'importance': np.round(gb_model.feature_importances_, 3),
    })
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
importances

**We see that serum_creatinine, ejection_fraction, creatinine_phosphokinase and age can be taken as features to predict heart failure. We will now train the model again to see if there are any improvements.**

In [16]:
# Restrict the features to the four selected columns and take DEATH_EVENT as the target.
features = ["serum_creatinine", "ejection_fraction", "creatinine_phosphokinase","age"]
X = data.loc[:, features]
Y = data.loc[:, 'DEATH_EVENT']
# Split into training and testing sets with an 80:20 ratio; the fixed
# random_state seeds the shuffling so the same rows are chosen on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=0, test_size=0.2
)

In [17]:
# we train our model again 
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score

# Refit the gradient-boosting classifier on the current training split and
# report its accuracy on the test split.
# NOTE(review): n_estimators is 500 here but 600 in the first gradient-boosting
# fit, so the two accuracies differ in more than the feature set -- confirm
# this is intended.
gb_params = {"n_estimators": 500, "learning_rate": 0.01, "random_state": 0}
gb_model = GradientBoostingClassifier(**gb_params)
gb_model.fit(X_train, Y_train)

gb_preds = gb_model.predict(X_test)
accuracy = accuracy_score(Y_test, gb_preds)
print(accuracy)

Training with only the relevant features, the model improved by 3 percent.

**Let's test some more models**

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# k-nearest neighbours: 10 neighbours, Minkowski distance with power parameter p=5.
knn = KNeighborsClassifier(
    n_neighbors=10,
    metric='minkowski',
    p=5,
).fit(X_train, Y_train)  # fit() returns the fitted estimator itself

model_preds = knn.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=model_preds)
print(accuracy)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Logistic regression with scikit-learn's default settings, scored on the test split.
model_lg = LogisticRegression().fit(X_train, Y_train)  # fit() returns the estimator

model_lg_predict = model_lg.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=model_lg_predict)
print(accuracy)

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# NOTE(review): this cell repeats the logistic-regression fit and evaluation of
# the previous cell with identical settings; consider removing one of the two.
model_lg = LogisticRegression()
model_lg.fit(X=X_train, y=Y_train)
model_lg_predict = model_lg.predict(X=X_test)
accuracy = accuracy_score(Y_test, model_lg_predict)
print(accuracy)

In [21]:
  # use the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# Random forest of 10 trees using the entropy split criterion; the fixed
# random_state keeps the fitted forest reproducible between runs.
forest = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=0,
).fit(X_train, Y_train)  # fit() returns the fitted estimator itself
forest_predict = forest.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=forest_predict)
print(accuracy)


In [22]:
  # use GaussianNB
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, scored on the test split.
guass = GaussianNB().fit(X_train, Y_train)  # fit() returns the fitted estimator itself
guass_predict = guass.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=guass_predict)
print(accuracy)

In [23]:
# use Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Single decision tree using the entropy split criterion; the fixed
# random_state keeps the fitted tree reproducible between runs.
tree = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, Y_train)  # fit() returns the fitted estimator itself
tree_predict = tree.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=tree_predict)
print(accuracy)

In [24]:
 # support vector (Linear kernel)
from sklearn.svm import SVC
# Support vector classifier with a linear kernel, scored on the test split.
svc_lin = SVC(
    kernel='linear',
    random_state=0,
).fit(X_train, Y_train)  # fit() returns the fitted estimator itself
svc_predict = svc_lin.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=svc_predict)
print(accuracy)

Of the algorithms used above, GradientBoostingClassifier performs the best, with an accuracy of 77%.