In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the heart-failure CSV into a DataFrame and show it as the cell output.
data = pd.read_csv(
    filepath_or_buffer='../input/heart-failure-prediction/heart.csv',
)
data

In [None]:
# List the column labels of the loaded frame.
data.columns

In [None]:
# Summary statistics as produced by DataFrame.describe().
data.describe()

In [None]:
# Print the frame summary produced by DataFrame.info().
data.info()

In [None]:
# Age distribution split by HeartDisease: 30 bins with a KDE overlay.
plt.figure(figsize=(20, 10))
sns.histplot(x='Age', hue='HeartDisease', data=data, kde=True, bins=30)
plt.title('Histogram of Age with Positive vs Negative Cases');

## Categorical scatterplots

In [None]:
# Swarm plot of Age by Sex, points coloured by chest-pain type.
sns.catplot(
    data=data,
    x="Sex",
    y="Age",
    hue="ChestPainType",
    kind="swarm",
)

In [None]:
# Swarm plot of Age by Sex, points coloured by resting blood pressure.
sns.catplot(
    data=data,
    x="Sex",
    y="Age",
    hue="RestingBP",
    kind="swarm",
)

In [None]:
# Swarm plot of Age by Sex, points coloured by the HeartDisease label.
sns.catplot(
    data=data,
    x="Sex",
    y="Age",
    hue="HeartDisease",
    kind="swarm",
)


### Distributions of observations within categories

In [None]:
# Box plot of Age by Sex, one box per HeartDisease value.
sns.catplot(
    data=data,
    x="Sex",
    y="Age",
    hue="HeartDisease",
    kind="box",
)

In [None]:
# Boxen plot of Cholesterol per chest-pain type. Rows are pre-sorted by
# ChestPainType (presumably to fix the category order on the x axis).
sns.catplot(
    data=data.sort_values(by="ChestPainType"),
    x="ChestPainType",
    y="Cholesterol",
    kind="boxen",
)

In [None]:
# Violin plot of Cholesterol per ST_Slope value, split by Sex.
sns.catplot(
    data=data,
    x="ST_Slope",
    y="Cholesterol",
    hue="Sex",
    kind="violin",
)

In [None]:
# Violin outlines of MaxHR per chest-pain type (no inner markers), with the
# individual observations overlaid as a swarm on the same axes.
g = sns.catplot(
    data=data,
    x="ChestPainType",
    y="MaxHR",
    kind="violin",
    inner=None,
)
sns.swarmplot(
    data=data,
    x="ChestPainType",
    y="MaxHR",
    ax=g.ax,
    color="k",
    size=3,
)

## Showing multiple relationships with facets

In [None]:
# MaxHR per chest-pain type, coloured by ExerciseAngina, with one facet
# column per HeartDisease value.
sns.catplot(
    data=data,
    x="ChestPainType",
    y="MaxHR",
    hue="ExerciseAngina",
    col="HeartDisease",
    kind="swarm",
    aspect=.7,
)

In [None]:
# Horizontal Age boxes per HeartDisease value, one facet row per RestingECG
# category; the x axis of the grid is then switched to a log scale.
g = sns.catplot(
    data=data,
    x="Age",
    y="HeartDisease",
    row="RestingECG",
    kind="box",
    orient="h",
    height=2,
    aspect=6,
)
g.set(xscale="log")

In [None]:
sns.set_theme(style="ticks")

# Figure with a logarithmic x axis.
# NOTE(review): a log axis cannot show non-positive values - confirm that
# Cholesterol contains no zeros.
f, ax = plt.subplots(figsize=(20, 10))
ax.set_xscale("log")

# Horizontal Cholesterol boxes per chest-pain type; whiskers cover the
# 0th-100th percentile range
sns.boxplot(data=data, x="Cholesterol", y="ChestPainType", ax=ax,
            whis=[0, 100], width=.6, palette="vlag")

# Overlay every observation as a point
sns.stripplot(data=data, x="Cholesterol", y="ChestPainType", ax=ax,
              size=4, color=".6", linewidth=0)

# Grid lines, axis labels and spine clean-up
ax.xaxis.grid(True)
ax.set(xlabel="Cholesterol", ylabel="ChestPainType")
sns.despine(trim=True, left=True)

In [None]:
# Figure with a logarithmic x axis
f, ax = plt.subplots(figsize=(20, 10))
ax.set_xscale("log")

# Horizontal RestingBP boxes per chest-pain type; whiskers cover the
# 0th-100th percentile range
sns.boxplot(data=data, x="RestingBP", y="ChestPainType", ax=ax,
            whis=[0, 100], width=.6, palette="vlag")

# Overlay every observation as a point
sns.stripplot(data=data, x="RestingBP", y="ChestPainType", ax=ax,
              size=4, color=".3", linewidth=0)

# Grid lines, axis labels and spine clean-up
ax.xaxis.grid(True)
ax.set(xlabel="RestingBP", ylabel="ChestPainType")
sns.despine(trim=True, left=True)

In [None]:
# Figure with a logarithmic x axis
f, ax = plt.subplots(figsize=(20, 10))
ax.set_xscale("log")

# Horizontal MaxHR boxes per chest-pain type; whiskers cover the
# 0th-100th percentile range
sns.boxplot(data=data, x="MaxHR", y="ChestPainType", ax=ax,
            whis=[0, 100], width=.6, palette="vlag")

# Overlay every observation as a point
sns.stripplot(data=data, x="MaxHR", y="ChestPainType", ax=ax,
              size=4, color=".3", linewidth=0)

# Grid lines, axis labels and spine clean-up
ax.xaxis.grid(True)
ax.set(xlabel="MaxHR", ylabel="ChestPainType")
sns.despine(trim=True, left=True)

## Correlation Heatmap

In [None]:
# Correlation heatmap of the numeric columns only. At this point the frame
# still carries the un-encoded categorical columns (Sex, ChestPainType, ...
# are one-hot encoded further down), and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, so select numeric/bool dtypes
# explicitly instead of relying on the version-dependent default.
plt.figure(figsize=(12,8))
sns.heatmap(data.select_dtypes(include=["number", "bool"]).corr(),annot=True,cmap='BrBG')
plt.title('Correlation Heatmap\n',fontweight='bold',fontsize=14)
plt.show()

# Convert Categorical Variables to Numeric Using `get_dummies`

In [None]:
# One indicator frame per categorical column (column names are the raw
# category values, since no prefix is passed).
sex = pd.get_dummies(data["Sex"])
cpt = pd.get_dummies(data["ChestPainType"])
recg = pd.get_dummies(data["RestingECG"])
ea = pd.get_dummies(data["ExerciseAngina"])
sts = pd.get_dummies(data["ST_Slope"])

# Concatenate the dummies to the original DataFrame

In [None]:
# Place the indicator frames next to the original columns.
merged = pd.concat(
    objs=[data, sex, cpt, recg, ea, sts],
    axis=1,
)

In [None]:
# Remove the original categorical columns (their indicator columns remain)
# and display the resulting frame.
data = merged.drop(
    columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
)
data

In [None]:
# Correlation heatmap of the encoded frame, with annotated cells.
plt.figure(figsize=(12, 8))
sns.heatmap(data=data.corr(), cmap='BrBG', annot=True)
plt.title(label='Correlation Heatmap\n', fontsize=14, fontweight='bold')
plt.show()

In [None]:
# Count missing values per column. Displaying the full boolean mask only
# shows a truncated excerpt and cannot confirm that no column contains nulls.
data.isnull().sum()

In [None]:
# Heatmap of the null mask (author's note: no null values detected)
sns.heatmap(data=data.isna())

In [None]:
# (rows, columns) of the encoded frame.
data.shape

In [None]:
# Target variable: the HeartDisease column
y = data['HeartDisease']
# Features: every column except the target
X = data.drop(columns=['HeartDisease'])

In [None]:
# Print the shapes of the feature matrix and the target.
print(X.shape, y.shape)

In [None]:
# Shuffled 80/20 train-test split; the fixed random_state keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
    shuffle=True,
)

In [None]:
# Scale X
# `scaler` is fitted on the training rows only and then applied to both
# splits, so no test-set statistics leak into the train/test transform.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# The full feature matrix gets its own scaler. Refitting `scaler` on all rows
# (as before) silently replaced the train-only statistics, so the object no
# longer matched the transform applied to X_train / X_test.
# X stays a DataFrame so estimators fitted on it see the same column names
# that X_train / X_test carry.
X = pd.DataFrame(StandardScaler().fit_transform(X), index=X.index, columns=X.columns)

In [None]:
def model_matrics(model,name):
    """Print test-set metrics for a fitted binary classifier and plot its ROC curve.

    The evaluation data is read from the notebook-level ``X_test`` and
    ``y_test`` variables.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``score`` and ``predict``, plus either ``predict_proba``
        or ``decision_function`` to obtain continuous scores for the ROC curve.
    name : str
        Label used in the printed report and in the plot title.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    # Predict once and reuse the labels for every label-based metric.
    pred = model.predict(X_test)
    test_score = model.score(X_test, y_test)
    cm = confusion_matrix(y_test, pred)
    report = classification_report(y_test, pred)
    acc = metrics.accuracy_score(y_test, pred)
    prec = metrics.precision_score(y_test, pred)
    rec = metrics.recall_score(y_test, pred)
    f1 = metrics.f1_score(y_test, pred)

    # scikit-learn lays the binary confusion matrix out as [[TN, FP], [FN, TP]].
    Specificity = cm[0,0]/(cm[0,0]+cm[0,1])
    Sensitivity = cm[1,1]/(cm[1,0]+cm[1,1])

    print(name)
    print('*******************')
    print('Testscore')
    print('---------')
    print(test_score)
    print('\n')
    print('confusion Matrix')
    print('----------------')
    print(cm)
    print('\n')
    print('Classification Report')
    print('---------------------')
    print(report)
    print('Accuracy')
    print('---------------------')
    print(acc)
    print('Precision')
    print('---------------------')
    print(prec)
    print('Recall')
    print('---------------------')
    print(rec)
    print('F1_score')
    print('---------------------')
    print(f1)
    print('Specificity')
    print('---------------------')
    print(Specificity )
    print('Sensitivity  ')
    print('---------------------')
    print(Sensitivity )

    # Continuous scores for the positive class. The ROC curve and the AUC must
    # be computed from the same scores: feeding hard 0/1 predictions to
    # roc_auc_score (as before) yields the area of a single-threshold curve,
    # which does not match the curve that is plotted.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:,1]
    else:
        y_score = model.decision_function(X_test)

    # Compute false positive rate and true positive rate
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)

    # Area under the plotted curve, shown in the legend
    roc_auc = metrics.roc_auc_score(y_test, y_score)

    plt.title('Receiver Operating Characteristic of '+name)
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    print("ROC_AUC of " +name,roc_auc)
    print("   ")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Default-configured estimator; `lr_model` is rebound to a new
# LogisticRegression in the following cell before anything is fitted.
lr_model = LogisticRegression()

In [None]:
# L2-penalised logistic regression (C=1, at most 100 iterations), fitted on
# the training split.
lr_model = LogisticRegression(
    C=1,
    penalty='l2',
    max_iter=100,
)
lr_model.fit(X_train, y_train)

In [None]:
# Test-set report and ROC curve for the fitted logistic regression.
model_matrics(model=lr_model, name="Logistic Regression")

# Majority Class Labels (Majority/Hard Voting)

In majority voting, the predicted class label for a particular sample is the class label that represents the majority (mode) of the class labels predicted by each individual classifier.

E.g., if the prediction for a given sample is

classifier 1 -> class 1

classifier 2 -> class 1

classifier 3 -> class 2

the VotingClassifier (with voting='hard') would classify the sample as “class 1” based on the majority class label.

In the case of a tie, the VotingClassifier will select the class based on the ascending sort order. E.g., in the following scenario

classifier 1 -> class 2

classifier 2 -> class 1

the class label 1 will be assigned to the sample.

In [None]:
# Base estimators for the hard-voting ensemble; random_state is fixed on the
# two estimators that are given one.
clf1, clf2, clf3 = (
    LogisticRegression(random_state=1),
    RandomForestClassifier(random_state=1, n_estimators=50),
    GaussianNB(),
)

In [None]:
# Majority-vote ensemble over the three base estimators.
eclf = VotingClassifier(
    voting='hard',
    estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
)

In [None]:
# 5-fold cross-validated accuracy for each base estimator and the ensemble.
for label, clf in [
    ('Logistic Regression', clf1),
    ('Random Forest', clf2),
    ('naive Bayes', clf3),
    ('Ensemble', eclf),
]:
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

# Weighted Average Probabilities (Soft Voting)

In contrast to majority voting (hard voting), soft voting returns the class label as argmax of the sum of predicted probabilities.

Specific weights can be assigned to each classifier via the weights parameter. When weights are provided, the predicted class probabilities for each classifier are collected, multiplied by the classifier weight, and averaged. The final class label is then derived from the class label with the highest average probability.

To illustrate this with a simple example, let’s assume we have 3 classifiers and a 3-class classification problem where we assign equal weights to all classifiers: w1=1, w2=1, w3=1. (The code below applies soft voting with unequal weights instead: 2, 1 and 2.)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product

In [None]:
# Base estimators for the soft-voting ensemble. SVC is created with
# probability=True so that it exposes predict_proba.
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(probability=True, kernel='rbf')

# Weighted soft vote: the tree and the SVC get weight 2, KNN gets weight 1.
eclf = VotingClassifier(
    voting='soft',
    weights=[2, 1, 2],
    estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)],
)

In [None]:
# Fit on the training split only. The full X contains the X_test rows that
# model_matrics scores against, so fitting on it reported metrics on data the
# models had already seen. X was also scaled separately from X_test.
# fit() returns the estimator itself, so no reassignment is needed.
for estimator in (clf1, clf2, clf3, eclf):
    estimator.fit(X_train, y_train)

In [None]:
# Test-set report and ROC curve for each base model and the ensemble.
for label, clf in (
    ('DecisionTreeClassifier', clf1),
    ('KNeighborsClassifier', clf2),
    ('SVC', clf3),
    ('Ensemble', eclf),
):
    model_matrics(clf, label)

In [None]:
# (Empty trailing cell: the bare name `nan` is not defined in this notebook
# and raised NameError on Restart & Run All.)