## Problem Statement:
To See we you can find any other trends in heart data to predict certain cardiovascular events or find any clear indications of heart health.

* Identify the attributes which explains the prediction

### Dataset reference:
https://www.kaggle.com/ronitf/heart-disease-uci

## Import Packages

In [150]:
# import Required Packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas_profiling
import pylab
from sklearn.model_selection import train_test_split
from scipy import stats
import sklearn.preprocessing as preproc


## Data Class

In [136]:
#read dataset
class Data:
    def __init__(self, path, file,categorical_cols, numerical_cols, boolean_cols,target_col,for_prediction):
        self.path = path
        self.file = file

        self.categorical_cols = list(categorical_cols)
        self.numerical_cols = list(numerical_cols)
        self.boolean_cols = list(boolean_cols)
        self.target_col = list(target_col)
        self.col_list = categorical_cols+numerical_cols+boolean_cols
        self.for_prediction = for_prediction
        self.for_prediction = for_prediction
        
        self.DS = self._read_csv_to_DF()
        #self._exportEDA(self.DS)
        self._chk_duplicate_clean()
               

    def _read_csv_to_DF(self):  #read CSV to dataframe
        DS = pd.read_csv(self.path+'/'+self.file)
        print(str(DS.shape[0]) + ' observations with '+str(DS.shape[1])+' Features loaded \n')
        return DS
    
    def _exportEDA(self): #export feature analysis profiling 
        profile = self.DS.profile_report(title='Pandas Profiling Report - EDA_output.html \n')
        profile.to_file(output_file="EDA_output.html")
        pass
    
    def _chk_duplicate_clean(self): #remove duplicate records
        dup = self.DS[self.DS.duplicated()].copy()
        print('removed duplicate observations- '+ str(dup.shape[0])+'\n')
        self.DS.drop_duplicates(keep='first',inplace=True) 
        pass
    
    def _splitdata(self): #split train/test dataset (for_prediction= true, data for prediction been passed)
        X= self.DS.drop('target',axis=1)
        y= self.DS['target']       
        if self.for_prediction:
            test_set = X
            test_target = y
        else:
            train_set,test_set,train_target,test_target = train_test_split(X,y,test_size=.3,random_state=42)
        return train_set,test_set,train_target,test_target

## Data Transformation

In [181]:
class Transformation:
    def __init__(self,dataset,col_logT,col_boxCoxT,categorical_cols, for_prediction):
        self.dataset = dataset
        self.for_prediction = for_prediction
        self.col_logT = list(col_logT)
        self.col_boxCoxT = list(col_boxCoxT)
        self.categorical_cols = list(categorical_cols)      
        self.ColLambda ={}
        self.__applyFeatureTransformation()
           
        
    def _storeTransLamda(self, colname,lamda):
        self.ColLambda.update({colname:lamda})
        pass
        
    def _apply_BoxCoxTrans(self, colname): # BoxCox Transformation, returns lamda for test set
        newcol = colname+'BCox'
        if self.for_prediction:
            fitted_lambda = dic.get(colname)
            trn_data = stats.boxcox(self.dataset[colname], fitted_lambda)
        else:
            trn_data,fitted_lambda = stats.boxcox(self.dataset[colname])
            self.dataset[newcol] = trn_data
            self.dataset.drop(columns = colname,inplace =True)
            print(colname +' feature transformed: '+newcol)
            self._storeTransLamda(colname,fitted_lambda)
        
    def _applyStandardization(self,colname):
        newcol=colname+'-Tstd'
        self.dataset[newcol] = preproc.StandardScaler().fit_transform(self.dataset[[colname]])
        self.dataset.drop(columns = colname,inplace =True)
        print(colname +' feature transformed: '+newcol)
    
    def _dummyEncode_category(self, colname): #Dummy Encode Categorical features
        newcol = colname+'-'
        DS_dummytype = pd.get_dummies(self.dataset[colname],prefix=newcol)
        self.dataset= pd.concat((self.dataset,DS_dummytype),axis=1)
        self.dataset = self.dataset.drop(columns=colname,axis=1)
        
    def __applyFeatureTransformation()
        if self.col_boxCoxT:
            print('BoxCox Transformation')
            for colname in col_boxCoxT:
                self._apply_BoxCoxTrans(colname)
            print('Lamda values \n'+str(self.ColLambda)+'\n')

        if self.col_logT:
            print('Log Transformation')
            for colname in col_logT:
                self._applyStandardization(colname)

        if self._dummyEncode_category:
            print('Dummy Encoded'+ str(categorical_cols) + ' \n')
            for colname in categorical_cols:
                self._dummyEncode_category(colname)
        

#apply_standardization(DS,'thalachBCox',keepexistingcol = False)  
     
    def _returnTransformation(self):
        return self.dataset
        

SyntaxError: invalid syntax (<ipython-input-181-9cb59f0a3f59>, line 40)

In [182]:
# data load 
path = "D:/Datasets/HeartDiseaseUCI/heart-disease-uci"
file ='heart.csv'

numeric_cols = ['age','chol','oldpeak','trestbps','thalach']
categorical_cols = ['ca','cp','exang','slope','thal','restecg']
boolean_cols = ['exang','fbs','sex']
target_col = ['target']

data = Data(path,file,categorical_cols, numeric_cols, boolean_cols,target_col,for_prediction=0)
train_set,test_set,train_target,test_target = data._splitdata()

# Normalize the train features - apply tansformations
col_logT= ['chol']
col_boxCoxT= ['thalach','age','trestbps']

trainsetobj = Transformation(train_set,col_logT,col_boxCoxT,categorical_cols,for_prediction = False)
transformedset =  trainsetobj._returnTransformation()
transformedset.head()

303 observations with 14 Features loaded 

removed duplicate observations- 1

BoxCox Transformation
thalach feature transformed: thalachBCox
age feature transformed: ageBCox
trestbps feature transformed: trestbpsBCox
Lamda values 
{'thalach': 2.0203142406883083, 'age': 1.5424134424242417, 'trestbps': -0.8611426256437702}

Log Transformation
chol feature transformed: chol-Tstd
Dummy Encoded['ca', 'cp', 'exang', 'slope', 'thal', 'restecg'] 



Unnamed: 0,sex,fbs,oldpeak,thalachBCox,ageBCox,trestbpsBCox,chol-Tstd,ca-_0,ca-_1,ca-_2,...,slope-_0,slope-_1,slope-_2,thal-_0,thal-_1,thal-_2,thal-_3,restecg-_0,restecg-_1,restecg-_2
124,0,0,0.0,17621.380811,183.801655,1.138032,-0.86142,1,0,0,...,0,0,1,0,0,1,0,0,1,0
72,1,0,0.0,22496.021628,116.145933,1.143688,-0.76964,1,0,0,...,0,0,1,0,0,1,0,1,0,0
15,0,0,1.6,13694.418672,269.94268,1.142435,-0.494303,1,0,0,...,0,1,0,0,0,1,0,0,1,0
10,1,0,1.2,14046.908745,304.047573,1.144774,-0.127186,1,0,0,...,0,0,1,0,0,1,0,0,1,0
163,1,0,0.0,16448.428815,176.55776,1.144568,-1.30196,0,0,0,...,0,0,1,0,0,1,0,0,1,0


### Feature Importance - Random forrest

In [56]:
X= DS.drop('target',axis=1)
y=DS['target']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.3,random_state=42)

In [58]:
from sklearn.ensemble import RandomForestClassifier
train_model  = RandomForestClassifier( max_depth=5, min_samples_split=2
                                      , min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                      max_features='auto', max_leaf_nodes=2, min_impurity_decrease=0.0,
                                      min_impurity_split=None, bootstrap=True, oob_score=True
                                      , n_jobs=None, random_state=42, verbose=0, warm_start=False, class_weight=None)


In [60]:
train_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=2,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [61]:
print(train_model.oob_score_)

0.7203791469194313


In [None]:
(pd.Series(train_model.feature_importances_, index= X.columns).nlargest(50).plot(kind = 'barh', figsize=(10,10)))


In [None]:
def apply_standardization(DS,colname,colReplace = True):
    newcolname=colname+'-Tstd'
    DS[newcolname] = preproc.StandardScaler().fit_transform(DS[[colname]])
    if(colReplace ==True):
        DS.drop(columns = colname,inplace =True)
        print('feature dropped: '+colname)
        

#apply_standardization(DS,'thalachBCox',keepexistingcol = False)

In [None]:
cols= ['cholBCox','thalachBCox','ageBCox','trestbpsBCox','oldpeak']
for i in cols:
    apply_standardization(DS,i,colReplace= True)
DS.head()

In [None]:
def apply_LogTransformation(DS,colname, keepexistingcol= False):
    newcol = colname+'Tlog'
    DS[newcol] = np.log(DS[colname]+1)
    if(keepexistingcol ==False):
        DS.drop(columns = colname,inplace =True)
        print('feature dropped: '+colname)

    
#apply_LogTransformation(DS,'chol',keepexistingcol= False)

In [None]:
X= DS.drop('target',axis=1)
y=DS['target']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.3,random_state=42)



In [None]:
import sklearn.preprocessing as preproc
#Min-Max Scaling
DS['Min-Maxage'] = preproc.minmax_scale(DS[['age']])
# Standardization
DS['Standardization'] = preproc.StandardScaler().fit_transform(DS[['age']])
fig, (ax1, ax2, ax3) = plt.subplots(3,1)
fig.tight_layout()

# Plot Original Price
DS['age'].hist(ax=ax1, bins=50)
ax1.tick_params(labelsize=14)
ax1.set_xlabel("Original age", fontsize=10)
ax1.set_ylabel("Frequency", fontsize=14)

# Plot Min-Max Scaling on Price
DS['Min-Max'].hist(ax=ax2, bins=50, color='r')
ax2.tick_params(labelsize=14)
ax2.set_xlabel("Min-Max age", fontsize=10)

# Plot Standardized Scaling on Price
DS['Standardization'].hist(ax=ax3, bins=50, color='g')
ax3.tick_params(labelsize=14)
ax3.set_xlabel("Standarized age", fontsize=10)

In [None]:
DS.drop(columns=['Standardization','age'],inplace= True)

In [None]:
sns.countplot(x='sex',hue='slope',data=DS)

In [None]:
sns.jointplot(x='age', y='chol', data=DS, kind='scatter', color = 'b');

In [None]:
sns.lmplot('age', 'chol', data=DS, hue='target', fit_reg=False);

In [None]:
sns.scatterplot(x="age", y="sex", hue="target",data=DS)

In [None]:
sns.boxplot(x='chol', data= DS)

In [None]:
sns.distplot(DS['chol'])

### EDA & Visualization summary
* Age- population set is covering the adults (since min is 29 and max is 80) - so the target patients are of adult & older people
* CA- number of vessels blocked in heart (ordinal - so it says from factor plot 1/3 of the population set is having 0 vessel block is having hear disease.) (major population are having 0 vessels – so need the check how value 0 correlates with positive of heart disease)
* Chol – cholesterol is interval data- also discovered that cholesterol level is normal/high/low based on age. So planning to do impute additional feature with age&cholesterol = (high/normal/low)
* cp: The chest pain experienced – ordinal
* exang: Exercise induced angina (yes or No) – it means pain experienced during exercise, only minor fraction of patients seems to have pain and diagnosed heart disease. (Negative correlation – can check value later in heatmap)
* fbs: The person's fasting blood sugar (> 120 mg/dl) 1 = true; 0 = false) (more than 230 patients among 300 are having sugar level less than 120  so…….(May be imbalanced data?))
* restecg: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria) , most of patients have normal ecg value 0 at rest but no heart disease (need to check the importance of this feature) 
* trestbps: The person's resting blood pressure (checked and there are 2 values systolic and diastolic – not sure the given BP ready is which one. Need to check more on that – can add extra feature to say low/normal/high based on the value)
* Thalch: maximum heart rate achieved – Distribution Normal
* Sex: more male patients seems to have heart disease diagnosed at early 40s.(not sure if this is a correct plot- need to check the syntax ) Female patient doesn’t seem to have any relation wrt to age


Not much of a correlation identified - lets proceed
Choletrol seems to be left skwed


## Split Train/Test

## Data Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train_scaled)

In [None]:
from sklearn.ensemble import RandomForestClassifier
train_model  = RandomForestClassifier( max_depth=5, min_samples_split=2
                                      , min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                                      max_features='auto', max_leaf_nodes=2, min_impurity_decrease=0.0,
                                      min_impurity_split=None, bootstrap=True, oob_score=True
                                      , n_jobs=None, random_state=42, verbose=0, warm_start=False, class_weight=None)



In [None]:
train_model.fit(X_train_scaled,y_train)

In [None]:
print(train_model.oob_score_)

In [None]:
(pd.Series(train_model.feature_importances_, index= X.columns).nlargest(50).plot(kind = 'barh'))

## Check model prediction

In [None]:
X_test_scaled = scaler.transform(X_test)

In [None]:
Y_predit = train_model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
print('Train accuracy : '+ str (accuracy_score(y_train, train_model.predict(X_train))))
print('Test accuracy : '+ str (accuracy_score(y_test, Y_predit)))

### Confusion Matrix

In [None]:
cm = confusion_matrix(y_test, Y_predit)
print(cm)

In [None]:
total=sum(sum(cm))

sensitivity = cm[0,0]/(cm[0,0]+cm[1,0])
print('Sensitivity : ', sensitivity )

specificity = cm[1,1]/(cm[1,1]+cm[0,1])
print('Specificity : ', specificity)

### ROC curve

In [None]:
from sklearn.metrics import roc_curve, auc

Y_predicted_Prob = train_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, Y_predicted_Prob)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title('ROC curve for diabetes classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)

In [None]:
auc(fpr, tpr)

### LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(random_state=0)
lr_model.fit(X_train,y_train)
y_pred = lr_model.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print('Accuracy score : '+ str(accuracy_score(y_test, y_pred)))

### Naive Bayes Algorithm

In [None]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print('Accuracy score : '+ str(accuracy_score(y_test, y_pred)))

In [None]:
from sklearn.svm import SVC
svm = SVC(random_state = 1)
svm.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print('Accuracy score : '+ str(accuracy_score(y_test, y_pred)))

**PCA Visualization**

In [None]:
scaler1 = StandardScaler()
scaler1.fit(X)
feature_scaled = scaler1.transform(X)

#now apply PCA
from sklearn.decomposition import PCA
pca1 =  PCA(n_components =4)
pca1.fit(feature_scaled)
feature_scaled_pca =  pca1.transform(feature_scaled)
print("shape of scaled and pca features",np.shape(feature_scaled))

In [None]:
#print (type(cancer.target))
target_list = y.tolist()
print (type(target_list))
feature_scaled_pca_X0 = feature_scaled_pca[:, 0]
feature_scaled_pca_X1 = feature_scaled_pca[:, 1]
feature_scaled_pca_X2 = feature_scaled_pca[:, 2]
feature_scaled_pca_X3 = feature_scaled_pca[:, 3]

labels = target_list
colordict = {0:'brown', 1:'darkslategray'}
piclabel = {0:'Positive', 1:'Negative'}
markers = {0:'o', 1:'*'}
alphas = {0:0.3, 1:0.4}

fig = plt.figure(figsize=(12, 7))
plt.subplot(1,2,1)
for l in np.unique(labels):
    ix = np.where(labels==l)
    plt.scatter(feature_scaled_pca_X0[ix], feature_scaled_pca_X1[ix], c=colordict[l], 
               label=piclabel[l], s=40, marker=markers[l], alpha=alphas[l])
plt.xlabel("First Principal Component", fontsize=15)
plt.ylabel("Second Principal Component", fontsize=15)

plt.legend(fontsize=15)

plt.subplot(1,2,2)
for l1 in np.unique(labels):
    ix1 = np.where(labels==l1)
    plt.scatter(feature_scaled_pca_X2[ix1], feature_scaled_pca_X3[ix1], c=colordict[l1], 
               label=piclabel[l1], s=40, marker=markers[l1], alpha=alphas[l1])
plt.xlabel("Third Principal Component", fontsize=15)
plt.ylabel("Fourth Principal Component", fontsize=15)

plt.legend(fontsize=15)

plt.savefig('heartDisease_PCAs.png', dpi=200)
plt.show()

**SVM**