___

<p style="text-align: center;"><img src="https://docs.google.com/uc?id=1lY0Uj5R04yMY3-ZppPWxqCr5pvBLYPnV" class="img-fluid" alt="CLRSWY"></p>

___

Data vocabulary:

1. #3 (age): age in years 
2. #4 (sex): sex (1 = male; 0 = female) 
3. #9 (cp): chest pain type | Value 0: typical angina | Value 1: atypical angina | Value 2: non-anginal pain | Value 3: asymptomatic 
4. #10 (trestbps): resting blood pressure (in mm Hg on admission to the hospital) 
5. #12 (chol): serum cholesterol in mg/dl 
6. #16 (fbs): (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
7. #19 (restecg): resting electrocardiographic results | Value 0: normal | Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) | Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
8. #32 (thalach): maximum heart rate achieved 
9. #38 (exang): exercise induced angina (1 = yes; 0 = no) 
10. #40 (oldpeak): ST depression induced by exercise relative to rest 
11. #41 (slope): the slope of the peak exercise ST segment | Value 1: upsloping | Value 2: flat | Value 3: downsloping 
12. #44 (ca): number of major vessels (0-3) colored by fluoroscopy 
13. #51 (thal): 3 = normal; 6 = fixed defect; 7 = reversible defect 
14. #58 (num) (the predicted attribute): Value 0: < 50% diameter narrowing | Value 1: > 50% diameter narrowing 

# Import Essential Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
#%matplotlib notebook
# Default size for figures that do not pass their own figsize.
plt.rcParams["figure.figsize"] = (10,6)
import warnings
# Suppress every Python warning for the rest of the session. The warn() call
# right after only demonstrates that the filter is active.
warnings.filterwarnings("ignore")
warnings.warn("this will not show")
# Display floats in pandas output with three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)


# Ingest the data to notebook

In [None]:
# Load the dataset from heart.csv, resolved relative to the current working directory.
df = pd.read_csv('heart.csv')
# Preview five randomly chosen rows (no seed is passed, so the rows differ between runs).
df.sample(5)

# EDA

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df['target'].value_counts()

In [None]:
# Cast the multi-level categorical codes to strings so the get_dummies() call in the
# next cell one-hot encodes them. The 0/1 columns (sex, fbs, exang) are not listed
# here, so they stay as single numeric columns under their original names.
df = df.astype({ "cp": str, "restecg":str, "slope":str, "thal":str})

In [None]:
# One-hot encode the string columns. drop_first=True omits the first level of each
# encoded feature, so k levels become k-1 indicator columns.
df = pd.get_dummies(df, drop_first= True )

In [None]:
df

In [None]:
df.info()

In [None]:
df.corr()


In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
corr_matrix = df.corr()
plt.figure(figsize=(16, 12), dpi=80)
sns.heatmap(data=corr_matrix, annot=True, cmap="YlGnBu");

In [None]:
# Horizontal bar chart of every feature's correlation with the target, sorted ascending.
plt.figure(figsize=(14, 8), dpi=80)
target_corr = df.corr()["target"].drop(labels="target").sort_values()
target_corr.plot.barh()

In [None]:
# sns.pairplot(df, hue = "target")

In [None]:
# One box plot per feature, split by target class, placed on a 5x5 subplot grid.
plt.figure(figsize=(20, 20))
feature_names = [column for column in df.columns if column != 'target']
for index, feature in enumerate(feature_names, start=1):
    plt.subplot(5, 5, index)
    sns.boxplot(x='target', y=feature, data=df)
        

In [None]:
# Subset holding the five numeric measurement columns together with the target.
df_cont = df[['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'target']]
df_cont

# Data Preprocessing

In [None]:
# Separate the label from the predictors (every column except the label).
y = df["target"]
X = df.drop(columns="target")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions the
# same in both splits, and random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

In [None]:
scaler = MinMaxScaler()

In [None]:
# Learn the scaling parameters from the training split only, then apply that same
# transformation to the test split so no test-set statistics enter preprocessing.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Implement Logistic Regression and Evaluate

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
log_model=LogisticRegression()

In [None]:
log_model.fit(X_train_scaled, y_train)

In [None]:
y_pred=log_model.predict(X_test_scaled)

In [None]:
y_pred_proba = log_model.predict_proba(X_test_scaled)

In [None]:
# Put the held-out features and true labels next to the predicted class and the
# second column of predict_proba for each test row, then show ten random rows.
test_data = pd.concat([X_test, y_test], axis=1).assign(
    pred=y_pred,
    pred_proba=y_pred_proba[:, 1],
)
test_data.sample(10)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print the confusion matrix and classification report for both splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.

    The test-set results are printed first, then a blank line, then the
    training-set results. Nothing is returned.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    sections = (
        ("Test_Set", y_test, test_predictions),
        ("Train_Set", y_train, train_predictions),
    )
    for position, (title, actual, predicted) in enumerate(sections):
        if position:
            # Blank separator line between the two sections.
            print()
        print(title)
        print(confusion_matrix(actual, predicted))
        print(classification_report(actual, predicted))

In [None]:

eval_metric(log_model, X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
plot_confusion_matrix(log_model, X_test_scaled, y_test);

In [None]:
plot_confusion_matrix(log_model, X_test_scaled, y_test, normalize='true');

In [None]:
plot_confusion_matrix(log_model, X_test_scaled, y_test, normalize='pred');

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# 10-fold cross-validation of a default logistic regression on the scaled training split.
model = LogisticRegression()

cv_metrics = ['precision', 'recall', 'f1', 'accuracy']
scores = cross_validate(model, X_train_scaled, y_train, scoring=cv_metrics, cv=10)
# One row per fold, labelled 1 through 10.
df_scores = pd.DataFrame(scores, index=range(1, 11))
df_scores

In [None]:
# Average every column over the folds; [2:] skips the first two entries, which
# cross_validate uses for timing (fit_time, score_time) rather than metric scores.
df_scores.mean()[2:]

In [None]:
eval_metric(log_model, X_train_scaled, y_train, X_test_scaled, y_test)

# GridSearch for Logistic Regression:

In [None]:
import sklearn
sklearn.metrics.SCORERS.keys()

In [None]:
from sklearn.model_selection import GridSearchCV

model = LogisticRegression()

C = np.logspace(-1, 5, 20)
# The "balanced" mode uses the values of y to automatically adjust weights inversely
# proportional to class frequencies in the input data.
class_weight = ["balanced", None]

# Not every solver supports every penalty: "lbfgs" and "sag" only implement "l2",
# while "liblinear" and "saga" implement both "l1" and "l2". A single crossed grid
# would include l1+lbfgs and l1+sag candidates that always fail to fit (and the
# failures are hidden by the global warning filter), so the search space is
# expressed as two sub-grids that contain only valid combinations.
l1_capable_solvers = ["liblinear", "saga"]
l2_only_solvers = ["lbfgs", "sag"]

param_grid = [
    {"penalty": ["l1", "l2"],
     "solver": l1_capable_solvers,
     "C": C,
     "class_weight": class_weight},
    {"penalty": ["l2"],
     "solver": l2_only_solvers,
     "C": C,
     "class_weight": class_weight},
]

# Exhaustive 10-fold search that selects the candidate with the best recall,
# using all available CPU cores.
grid_model = GridSearchCV(estimator=model,
                          param_grid=param_grid,
                          cv=10,
                          scoring="recall",
                          n_jobs=-1)

In [None]:
grid_model.fit(X_train_scaled,y_train)

In [None]:
grid_model.best_params_

In [None]:
eval_metric(grid_model, X_train_scaled, y_train, X_test_scaled, y_test)

# ROC/AUC for Logistic Regression:

In [None]:
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve, roc_auc_score, auc, roc_curve, average_precision_score, precision_recall_curve

In [None]:
plot_roc_curve(grid_model, X_test_scaled, y_test);

In [None]:
plot_precision_recall_curve(grid_model, X_test_scaled, y_test);

# Best Threshold for Logistic Regression:

In [None]:
plot_roc_curve(grid_model, X_train_scaled, y_train);

In [None]:
# Use the same estimator whose training ROC curve is plotted in the cell above
# (grid_model), so the AUC and the threshold derived from these probabilities
# describe that curve rather than the untuned log_model.
y_pred_proba = grid_model.predict_proba(X_train_scaled)
roc_auc_score(y_train, y_pred_proba[:,1])

In [None]:
fp_rate, tp_rate, thresholds = roc_curve(y_train, y_pred_proba[:,1])

In [None]:
# Pick the ROC point that maximises (true positive rate - false positive rate),
# i.e. Youden's J statistic, and read off the probability threshold at that point.
optimal_idx = np.argmax(tp_rate - fp_rate)
optimal_threshold = thresholds[optimal_idx]
optimal_threshold

# Implement KNN and Evaluate

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=5)

In [None]:
knn_model.fit(X_train_scaled,y_train)

In [None]:
y_pred = knn_model.predict(X_test_scaled)
y_pred

In [None]:
y_pred_proba = knn_model.predict_proba(X_test_scaled)

In [None]:
pd.DataFrame(y_pred_proba)

In [None]:
# Collect the true labels, the KNN predictions and both predict_proba columns
# (column 1 and column 0) so they can be shown side by side as a DataFrame.
my_dict = {"Actual": y_test, "Pred":y_pred, "Proba_1":y_pred_proba[:,1], "Proba_0":y_pred_proba[:,0]}

In [None]:
pd.DataFrame.from_dict(my_dict).sample(10)

# Model Performance on Classification Tasks

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
plot_confusion_matrix(knn_model, X_test_scaled, y_test);

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Elbow Method for Choosing Reasonable K Values

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [None]:
# Test-set error rate (1 - accuracy) for every K from 1 to 29.
# NOTE(review): K is being compared on the test split here, which makes the later
# test metrics optimistic; the cross-validated grid search further down selects K
# from the training split only.
test_error_rates = []

for k in range(1, 30):
    # Loop-local name: reusing `knn_model` here would silently replace the fitted
    # model from the cells above with the K=29 model once the loop finishes.
    candidate_knn = KNeighborsClassifier(n_neighbors=k)
    candidate_knn.fit(X_train_scaled, y_train)

    y_pred_test = candidate_knn.predict(X_test_scaled)

    test_error = 1 - accuracy_score(y_test, y_pred_test)
    test_error_rates.append(test_error)

In [None]:
test_error_rates

In [None]:
# Plot the test error rate against K (1..29) to look for an elbow.
plt.figure(figsize=(15,8))
plt.plot(range(1,30), test_error_rates, color='blue', linestyle='--', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K_values')
plt.ylabel('Error Rate')
# Horizontal reference lines at error rates 0.19 and 0.23. The values are
# hard-coded — presumably read off an earlier run of this plot; TODO confirm they
# still bracket the region of interest after re-running.
plt.hlines(y=0.19, xmin = 0, xmax = 30, colors= 'r', linestyles="--")
plt.hlines(y=0.23, xmin = 0, xmax = 30, colors= 'r', linestyles="--")

In [None]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print confusion matrix and classification report for the test and train splits.

    This redefines the ``eval_metric`` helper declared earlier in the notebook; the
    only difference is that each heading is followed by an extra blank line.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.

    Nothing is returned; all results are written to standard output.
    """
    def show(header, actual, predicted):
        # Heading, then the confusion matrix, then the per-class report.
        print(header)
        print(confusion_matrix(actual, predicted))
        print(classification_report(actual, predicted))

    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)

    show("Test_Set\n", y_test, y_pred)
    print()
    show("Train_Set\n", y_train, y_train_pred)

In [None]:
# Evaluate the candidate K values with one parameterised loop instead of three
# copy-pasted cells. After the loop `knn` is the K=23 model, as it was before.
for n_neighbors in (8, 15, 23):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    knn.fit(X_train_scaled, y_train)
    print(f'WITH K={n_neighbors}\n')
    eval_metric(knn, X_train_scaled, y_train, X_test_scaled, y_test)

# Cross Validate for Optimal K Value:

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate

In [None]:
# 10-fold cross-validation of a K=23 nearest-neighbours classifier on the scaled training split.
model = KNeighborsClassifier(n_neighbors=23)

cv_metrics = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(model, X_train_scaled, y_train, scoring=cv_metrics, cv=10)
# One row per fold, labelled 1 through 10.
df_scores = pd.DataFrame(scores, index=range(1, 11))
df_scores

In [None]:
# Average every column over the folds; [2:] skips the first two entries, which
# cross_validate uses for timing (fit_time, score_time) rather than metric scores.
df_scores.mean()[2:]

# Gridsearch Method for Choosing Reasonable K Values

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
knn_grid = KNeighborsClassifier()

In [None]:
k_values= range(1,30)

In [None]:
# Search space for KNN: the neighbour counts in k_values, the Minkowski power p
# (1 = Manhattan distance, 2 = Euclidean distance), and uniform versus
# distance-weighted voting.
param_grid = {"n_neighbors":k_values, "p": [1,2], "weights": ['uniform', "distance"]}

In [None]:
knn_grid_model = GridSearchCV(knn_grid, param_grid, cv=10, scoring= 'accuracy')

In [None]:
knn_grid_model.fit(X_train_scaled, y_train)

In [None]:
knn_grid_model.best_params_

In [None]:
# Report the K that the grid search actually selected instead of a hard-coded
# value, so the heading stays correct whenever the data or the grid changes.
print(f"WITH K={knn_grid_model.best_params_['n_neighbors']}\n")
eval_metric(knn_grid_model, X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
# Refit the KNN model that is used for the ROC curve and the comparison cells below.
# NOTE(review): K=23 is hard-coded here and is not taken from
# knn_grid_model.best_params_ — confirm this is the intended choice.
knn_model = KNeighborsClassifier(n_neighbors=23).fit(X_train_scaled, y_train)

In [None]:
plot_roc_curve(knn_model, X_test_scaled, y_test);

# Visually compare models based on your chosen metric

In [None]:
plot_confusion_matrix(log_model, X_test_scaled, y_test);

In [None]:
plot_confusion_matrix(grid_model, X_test_scaled, y_test);

In [None]:
plot_confusion_matrix(knn_model, X_test_scaled, y_test);

In [None]:
eval_metric(log_model, X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
eval_metric(grid_model, X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
eval_metric(knn_model, X_train_scaled, y_train, X_test_scaled, y_test)

# Choose best model and make a random prediction 

In [None]:
import pickle


In [None]:
# Refit the scaler on the full dataset for the final model. MinMaxScaler is used
# because it is the preprocessing every model above was trained and evaluated
# with; StandardScaler is never imported in this notebook, so referring to it
# raises NameError on a fresh kernel.
scaler = MinMaxScaler().fit(X)
# The context manager guarantees the pickle file is flushed and closed.
with open("scaler_heart", 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
X_scaled = scaler.transform(X)

In [None]:
# Train the final logistic regression on the entire scaled dataset.
# NOTE(review): this uses default hyper-parameters, not grid_model.best_params_ —
# confirm that the untuned model is the one intended for export.
final_model = LogisticRegression().fit(X_scaled, y)

In [None]:
# Persist the fitted model; the context manager guarantees the file is flushed and closed.
with open("final_model", 'wb') as model_file:
    pickle.dump(final_model, model_file)

# Make a random prediction 

In [None]:
X.columns

In [None]:
X.describe()

In [None]:
X.sample(10)

In [None]:
# Ten hand-written observations used to try out the exported model; each key is a
# feature column and each list holds that feature's value for the ten rows.
# NOTE(review): the key 'exang_1' and its position do not match the training
# features — exang is not among the columns cast to str before get_dummies(), so
# the training column is named 'exang'. The samples must be renamed and reordered
# to X.columns before they are passed to the scaler or the model.
new_obs = {'age': [15, 21 ,38, 42, 56, 75, 82, 12, 34, 59],
           'sex': [0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0],
           'trestbps': [121, 131, 136, 140, 120, 150, 175, 172, 192, 100],
           'chol': [130, 148, 154, 196, 306, 448, 211, 346, 460, 500],
           'fbs': [0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
           'thalach': [80, 200, 158, 95, 78, 186, 195, 148, 200, 98],
           'oldpeak': [3.3, 4.5, 0.7, 2.4, 3.7, 4.2, 5.7, 0.7, 6.1, 1.0],
           'ca' : [2.0, 1.0, 4.0, 0.0, 4.0, 3.0, 2.0, 3.0, 1.0, 0.0],
           'cp_1': [1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
           'cp_2': [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0],
           'cp_3': [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
           'restecg_1': [1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0],
           'restecg_2': [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0],
           'exang_1': [0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0],
           'slope_1': [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0],
           'slope_2': [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0],
           'thal_1': [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0],
           'thal_2': [1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
           'thal_3': [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0]}
           

In [None]:
samples = pd.DataFrame(new_obs)
samples

In [None]:
# Reload the scaler written earlier in this notebook; the context manager closes
# the file handle. (pickle.load must only be used on files from a trusted source.)
with open("scaler_heart", "rb") as scaler_file:
    scaler_heart = pickle.load(scaler_file)

In [None]:
# The scaler must receive exactly the training columns, in training order.
# 'exang' is never one-hot encoded, so the hand-written 'exang_1' key is mapped
# back to 'exang' (a no-op if the column is already named correctly), and the
# columns are then selected in the order of X. Selecting by X.columns also ignores
# any prediction columns added to `samples` by a later cell, so this cell can be
# re-run safely.
samples_aligned = samples.rename(columns={"exang_1": "exang"})[X.columns]
samples_scaled = scaler_heart.transform(samples_aligned)
samples_scaled

In [None]:
# Reload the model written earlier in this notebook; the context manager closes
# the file handle. (pickle.load must only be used on files from a trusted source.)
with open("final_model", "rb") as model_file:
    final_model = pickle.load(model_file)

In [None]:
predictions = final_model.predict(samples_scaled)
predictions_proba = final_model.predict_proba(samples_scaled)

In [None]:
samples["pred"] = predictions
samples["pred_proba"] = predictions_proba[:,1]
samples