# In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import model_selection
from sklearn.ensemble import StackingClassifier
import joblib
import warnings
import pickle
from matplotlib import pyplot
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split



# Silence library warnings for the rest of the run.
warnings.filterwarnings('ignore')

# Load the raw table.
data = pd.read_csv("ha.csv")

# Drop the rows that contain a missing value.  dropna() returns a new frame
# instead of modifying in place, so the result must be assigned back; the
# index is rebuilt so the row labels stay contiguous after rows are removed.
data = data.dropna().reset_index(drop=True)

# Rename the columns.  The names are applied to `data` itself because the
# plotting and preprocessing code further down reads `data`, not `df`.
data.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
       'exercise_induced_angina', 'st_depression', 'st_slope','target']

# `df` is an independent working copy of the cleaned, renamed table.
df = data.copy()
df

# Summary statistics of each column
data.describe()

# Removal of outliers -- DISABLED.
# Everything below is wrapped in a triple-quoted string, so it is evaluated as
# a string literal and never executed.  The wrapped code computes 1.5*IQR
# limits for resting_blood_pressure, cholesterol and max_heart_rate_achieved
# and keeps only the rows lying strictly inside all three ranges.
'''q1 = data.resting_blood_pressure.quantile(0.25)
q2 = data.resting_blood_pressure.quantile(0.75)
print(q1,q2)
IQR1 = q2-q1
print(IQR1)
lower_limit = q1-1.5*IQR1
upper_limit = q2+1.5*IQR1
print(lower_limit,upper_limit)

q3 = data.cholesterol.quantile(0.25)
q4 = data.cholesterol.quantile(0.75)
print(q3,q4)
IQR2 = q4-q3
print(IQR2)
lower_limit1 = q3-1.5*IQR2
upper_limit1= q4+1.5*IQR2
print(lower_limit1,upper_limit1)

q5 = data.max_heart_rate_achieved.quantile(0.25)
q6 = data.max_heart_rate_achieved.quantile(0.75)
print(q5,q6)
IQR3 = q6-q5
print(IQR3)
lower_limit2 = q5-1.5*IQR3
upper_limit2 = q6+1.5*IQR3
print(lower_limit2,upper_limit2)

df_no_outlier =data[(data.resting_blood_pressure>lower_limit)&(data.resting_blood_pressure<upper_limit)&(data.cholesterol>lower_limit1)&(data.cholesterol<upper_limit1)&(data.max_heart_rate_achieved>lower_limit2)&(data.max_heart_rate_achieved<upper_limit2)]
df_no_outlier'''

# Continuous features used for the exploratory plots.
numeric_features = ['age', 'resting_blood_pressure', 'cholesterol', 'max_heart_rate_achieved', 'st_depression']

eda_df = data.loc[:, numeric_features].copy()

# One box plot per numeric feature, laid out on a 2x4 grid.
plt.figure(figsize=(16, 10))
for position, column in enumerate(eda_df.columns, start=1):
    plt.subplot(2, 4, position)
    # Pass the series by keyword: the parameter a lone positional argument
    # binds to differs between seaborn releases, x= is explicit.
    sns.boxplot(x=eda_df[column])
plt.show()

# Pairwise correlation between the numeric features.
corr = eda_df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, vmin=-1.0, cmap='mako')
plt.title("Correlation Heatmap")
plt.show()

# Share of each target class.
# NOTE(review): value_counts() orders the classes by frequency, so the first
# label is attached to whichever class is most common -- confirm that
# "Heart Disease" really is the majority class in this dataset.
plt.figure(figsize=(8, 8))
plt.pie(data['target'].value_counts(), labels=["Heart Disease", "No Heart Disease"], autopct='%.1f%%', colors=['#36a2ac', '#413f80'])
plt.title("Class Distribution")
plt.show()
def onehot_encode(df, column_dict):
    """Replace nominal columns with indicator (dummy) columns.

    Args:
        df: input DataFrame; it is not modified.
        column_dict: mapping of column name -> prefix used for the
            indicator columns generated from that column.

    Returns:
        A new DataFrame in which every column named in ``column_dict`` has
        been removed and its indicator columns appended on the right, in the
        iteration order of ``column_dict``.
    """
    encoded = df.copy()
    for source_col, dummy_prefix in column_dict.items():
        indicators = pd.get_dummies(encoded[source_col], prefix=dummy_prefix)
        # Append the indicators first, then discard the column they came from.
        encoded = pd.concat([encoded, indicators], axis=1).drop(columns=source_col)
    return encoded
def preprocess_inputs(df, scaler):
    """Encode, split and scale a raw table for modelling.

    Args:
        df: DataFrame containing the feature columns, including the nominal
            columns 'chest_pain_type' and 'st_slope', plus a 'target' column.
            It is not modified.
        scaler: object exposing ``fit_transform`` (e.g. ``RobustScaler()``).
            It is fitted on every row of ``df``.

    Returns:
        ``(X, y)`` where ``X`` is a DataFrame of scaled features that keeps
        the column names and the row index of the encoded input, and ``y``
        is a copy of the 'target' column.
    """
    df = df.copy()

    # One-hot encode the nominal features
    nominal_features = ['chest_pain_type', 'st_slope']
    df = onehot_encode(df, dict(zip(nominal_features, ['CP', 'SL'])))

    # Split df into X and y
    y = df['target'].copy()
    X = df.drop('target', axis=1).copy()

    # Scale X.  The scaled values are wrapped back into a DataFrame with the
    # original row index as well as the column names; without the index, X
    # would get fresh 0..n-1 labels while y kept the input's labels, and the
    # two would no longer line up whenever the input index is not 0..n-1.
    # NOTE(review): the scaler sees all rows passed in, so a train/test split
    # made afterwards uses scaling statistics that include the test rows.
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    return X, y
from sklearn.model_selection import train_test_split

# Encode and robust-scale the whole table, then hold out 15% of the rows
# (fixed seed) as the test set.
X, y = preprocess_inputs(df=data, scaler=RobustScaler())
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)

def get_stacking():
    """Build the unfitted stacking ensemble.

    Seven base classifiers (logistic regression, k-NN, decision tree, RBF
    SVM, Gaussian naive Bayes, random forest, MLP) feed a logistic-regression
    meta learner; the stacking itself uses 5-fold cross-validation.

    Returns:
        A new ``StackingClassifier`` instance on every call.
    """
    # Base (level-0) models
    level0 = [
        ('lr', LogisticRegression(C=1.0, penalty='l2', solver='newton-cg')),
        ('knn', KNeighborsClassifier(metric='manhattan', n_neighbors=17, weights='distance')),
        # max_features='sqrt' replaces the former 'auto': for classification
        # trees 'auto' was an alias of 'sqrt', and current scikit-learn
        # releases no longer accept it.
        ('cart', DecisionTreeClassifier(criterion='entropy', max_depth=12, max_features='sqrt',
                                        min_samples_leaf=1, min_samples_split=2)),
        ('svm', SVC(C=1000, gamma=0.0001, kernel='rbf')),
        ('bayes', GaussianNB()),
        ('rfa', RandomForestClassifier(max_features='sqrt', n_estimators=100)),
        ('neural', MLPClassifier(max_iter=1500)),
    ]
    # Meta (level-1) learner
    level1 = LogisticRegression(C=1.0, penalty='l2', solver='newton-cg')
    return StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

def get_models():
    """Return the classifiers to compare, keyed by a short display name.

    Six default-configured single models plus the stacking ensemble built by
    ``get_stacking()``.  New instances are created on every call.
    """
    return {
        'lr': LogisticRegression(),
        'knn': KNeighborsClassifier(),
        'cart': DecisionTreeClassifier(),
        'svm': SVC(),
        'bayes': GaussianNB(),
        'neural': MLPClassifier(),
        'stacking': get_stacking(),
    }
# Module-level stacking classifier; the voting ensembles defined further down
# pick it up under the name `model`.
model = get_stacking()

def evaluate_model(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and return the accuracy scores.

    Uses repeated stratified k-fold (15 splits, 5 repeats, seed 1), runs the
    folds with n_jobs=-1 and lets any fitting error propagate.
    """
    folds = RepeatedStratifiedKFold(n_splits=15, n_repeats=5, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )

models = get_models()
# evaluate the models and store results
results, names = [], []
# The loop variable is deliberately not called `model`: at module level that
# would overwrite the stacking classifier bound to `model`, which the voting
# ensembles defined further down read.
for name, candidate in models.items():
    scores = evaluate_model(candidate, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()



# Bagging ensemble of 500 estimators (max_samples=1.0), scored with the same
# repeated stratified 15-fold scheme as the other models.
model1 = BaggingClassifier(n_estimators=500, max_samples=1.0)
cv = RepeatedStratifiedKFold(n_splits=15, n_repeats=5, random_state=1)
n_scores = cross_val_score(
    model1,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# AdaBoost over depth-10 decision trees.  The base learner is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` across scikit-learn releases; the first positional slot is the
# base learner under either name.
model2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=5000)
# cross-validate the model (repeated stratified 15-fold, 5 repeats)
cv = RepeatedStratifiedKFold(n_splits=15, n_repeats=5, random_state=1)
n_scores = cross_val_score(model2, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))





# Random forest baseline, scored with the same repeated stratified 15-fold
# scheme as the other models.
model3 = RandomForestClassifier(max_features='sqrt', n_estimators= 100)
cv = RepeatedStratifiedKFold(n_splits=15, n_repeats=5, random_state=1)
# Score model3 itself; the previous code passed model2 here, so the printed
# accuracy was a second AdaBoost run rather than the random forest.
n_scores = cross_val_score(model3, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))





from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
def get_voting():
    """Build an unfitted hard-voting ensemble.

    The members are the module-level estimators ``model`` ('stacking'),
    ``model1`` ('bagging'), ``model2`` ('boosting') and ``model3`` ('rf'),
    looked up at call time.
    """
    members = [
        ('stacking', model),
        ('bagging', model1),
        ('boosting', model2),
        ('rf', model3),
    ]
    return VotingClassifier(estimators=members, voting='hard')
 
# get a list of models to evaluate
def get_models():
    """Return the ensembles to compare, keyed by display name.

    Maps 'stacking', 'bagging', 'boosting' and 'rf' to the module-level
    estimators ``model``, ``model1``, ``model2`` and ``model3``, and
    'hard_voting' to a new ensemble from ``get_voting()``.
    """
    return {
        'stacking': model,
        'bagging': model1,
        'boosting': model2,
        'rf': model3,
        'hard_voting': get_voting(),
    }
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the per-fold accuracy of ``model`` on ``(X, y)``.

    Folds come from repeated stratified k-fold (15 splits, 5 repeats,
    seed 1); folds run with n_jobs=-1 and fitting errors are raised.
    """
    splitter = RepeatedStratifiedKFold(n_splits=15, n_repeats=5, random_state=1)
    fold_scores = cross_val_score(
        model, X, y,
        scoring='accuracy',
        cv=splitter,
        n_jobs=-1,
        error_score='raise',
    )
    return fold_scores
 
# get the models to evaluate
models = get_models()
# keep a fitted hard-voting ensemble under the name `final`
final = models['hard_voting']
final.fit(X_train, y_train)
# evaluate the models and store results
results, names = [], []
# The loop variable is deliberately not called `model`: using that name here
# left the module-level `model` bound to the last entry (the hard-voting
# ensemble), which the later get_models() then registered as 'stacking'.
for name, candidate in models.items():
    # cross-validated accuracy on the full data set
    scores = evaluate_model(candidate, X, y)
    # hold-out report: fit on the training split, predict the test split
    candidate.fit(X_train, y_train)
    y_pred = candidate.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))











# get a list of base models
from sklearn.metrics import accuracy_score
# Carve a validation split (33%, fixed seed) out of the training rows.
X_train1, X_val, y_train1, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.33,
    random_state=1,
)
def get_models():
    """Return the voting members as a list of ``(name, estimator)`` pairs.

    The pairs reference the module-level estimators ``model``, ``model1``
    and ``model2`` under the names 'stacking', 'bagging' and 'boosting'.
    """
    return [
        ('stacking', model),
        ('bagging', model1),
        ('boosting', model2),
    ]

# evaluate each base model
def evaluate_models(models, X_train1, X_val, y_train1, y_val):
    """Fit every model and return its validation accuracy.

    Args:
        models: iterable of ``(name, estimator)`` pairs; each estimator is
            fitted in place on ``(X_train1, y_train1)``.
        X_train1, y_train1: data used for fitting.
        X_val, y_val: data used for scoring.

    Returns:
        List of accuracy values, one per model, in the order of ``models``.
    """
    accuracies = []
    for _label, estimator in models:
        estimator.fit(X_train1, y_train1)
        predictions = estimator.predict(X_val)
        accuracies.append(accuracy_score(y_val, predictions))
    return accuracies
 
# define dataset
#X, y = make_classification(n_samples=10000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
# split dataset into train and test sets
#X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.50, random_state=1)
# split the full train set into train and validation sets
#X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.33, random_state=1)
# Collect the (name, estimator) pairs that will vote.
models = get_models()
# Validation accuracy of each member, in the same order as `models`.
scores = evaluate_models(models, X_train1, X_val, y_train1, y_val)
print(scores)
# Hard-voting ensemble whose members are weighted by those accuracies.
ensemble = VotingClassifier(
    estimators=models,
    voting='hard',
    weights=scores,
)
# Fit on the full training split (training + validation rows).
ensemble.fit(X_train, y_train)
# Predict the test rows and report the accuracy as a percentage.
yhat = ensemble.predict(X_test)
score = accuracy_score(y_test, yhat)
print('Weighted Avg Accuracy: %.3f' % (score*100))

