In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
data = pd.read_csv("heart.csv")
data.head()
pd.set_option("display.float", "{:.2f}".format)
data.describe()
categorical_val = []
continous_val = []
for column in data.columns:
    if len(data[column].unique()) <= 10:
        categorical_val.append(column)
    else:
        continous_val.append(column)
        
        
# plt.figure(figsize=(15, 15))

# for i, column in enumerate(categorical_val, 1):
#     plt.subplot(3, 3, i)
#     data[data["target"] == 0][column].hist(bins=35, color='blue', label='Have Heart Disease = NO', alpha=0.6)
#     data[data["target"] == 1][column].hist(bins=35, color='red', label='Have Heart Disease = YES', alpha=0.6)
#     plt.legend()
#     plt.xlabel(column)
    
    
    
# plt.figure(figsize=(15, 15))

# for i, column in enumerate(continous_val, 1):
#     plt.subplot(3, 2, i)
#     data[data["target"] == 0][column].hist(bins=35, color='blue', label='Have Heart Disease = NO', alpha=0.6)
#     data[data["target"] == 1][column].hist(bins=35, color='red', label='Have Heart Disease = YES', alpha=0.6)
#     plt.legend()
#     plt.xlabel(column)
    
# # Create another figure
# plt.figure(figsize=(9, 7))

# # Scatter with postivie examples
# plt.scatter(data.age[data.target==1],
#             data.thalach[data.target==1],
#             c="salmon")

# # Scatter with negative examples
# plt.scatter(data.age[data.target==0],
#             data.thalach[data.target==0],
#             c="lightblue")

# # Add some helpful info
# plt.title("Heart Disease in function of Age and Max Heart Rate")
# plt.xlabel("Age")
# plt.ylabel("Max Heart Rate")
# plt.legend(["Disease", "No Disease"])

# # Let's make our correlation matrix a little prettier
# corr_matrix = data.corr()
# fig, ax = plt.subplots(figsize=(15, 15))
# ax = sns.heatmap(corr_matrix,
#                  annot=True,
#                  linewidths=0.5,
#                  fmt=".2f",
#                  cmap="YlGnBu");
# bottom, top = ax.get_ylim()
# ax.set_ylim(bottom + 0.5, top - 0.5)

categorical_val.remove('target')
dataset = pd.get_dummies(data, columns = categorical_val)
dataset.head()


from sklearn.preprocessing import StandardScaler

s_sc = StandardScaler()
col_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
dataset[col_to_scale] = s_sc.fit_transform(dataset[col_to_scale])

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")
        
from sklearn.model_selection import train_test_split

X = dataset.drop('target', axis=1)
y = dataset.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# ===============逻辑回归
accuracies = {}
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

print_score(lr_clf, X_train, y_train, X_test, y_test, train=True)
print_score(lr_clf, X_train, y_train, X_test, y_test, train=False)

test_score = accuracy_score(y_test, lr_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, lr_clf.predict(X_train)) * 100

results_df = pd.DataFrame(data=[["Logistic Regression", train_score, test_score]], 
                          columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
accuracies['Logistic Regression'] = test_score

# ===================支持向量机
from sklearn.svm import SVC


svm_clf = SVC(kernel='rbf', gamma=0.1, C=1.0)
svm_clf.fit(X_train, y_train)

print_score(svm_clf, X_train, y_train, X_test, y_test, train=True)
print_score(svm_clf, X_train, y_train, X_test, y_test, train=False)

test_score = accuracy_score(y_test, svm_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, svm_clf.predict(X_train)) * 100

results_df_2 = pd.DataFrame(data=[["Support Vector Machine", train_score, test_score]], 
                          columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = results_df.append(results_df_2, ignore_index=True)
accuracies['SVM'] = test_score
# #  ======================随机森林
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_clf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_clf.fit(X_train, y_train)

print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

test_score = accuracy_score(y_test, rf_clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, rf_clf.predict(X_train)) * 100

results_df_2 = pd.DataFrame(data=[["Random Forest Classifier", train_score, test_score]], 
                          columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df = results_df.append(results_df_2, ignore_index=True)


accuracies['RandomForest'] = test_score


# def feature_imp(df, model):
#     fi = pd.DataFrame()
#     fi["feature"] = df.columns
#     fi["importance"] = model.feature_importances_
#     return fi.sort_values(by="importance", ascending=False)

# # feature_imp(X, lr_clf).plot(kind='barh', figsize=(12,7), legend=False)
# feature_imp(X, svm_clf).plot(kind='barh', figsize=(12,7), legend=False)
# feature_imp(X, rf_clf).plot(kind='barh', figsize=(12,7), legend=False)

# ====================================
# colors = ["purple", "green", "orange"]

# sns.set_style("whitegrid")
# # plt.figure(figsize=(16,5))
# plt.yticks(np.arange(80,100,5))
# plt.ylabel("Accuracy %")
# plt.xlabel("Algorithms")
# sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette=colors)
# plt.show()
# ========================================
# import warnings
# warnings.filterwarnings("ignore")
# from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, StratifiedKFold
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

# labels = ['LR','SVM','RF']
# accuracy = print_score(X_train, X_test, y_train, y_test, 'accuracy')
# precision = print_score(X_train, X_test, y_train, y_test, 'precision')
# recall = print_score(X_train, X_test, y_train, y_test, 'recall')
# f1score = print_score(X_train, X_test, y_train, y_test, 'f1')
# rocauc = print_score(X_train, X_test, y_train, y_test, 'roc_auc')

# x = np.arange(len(labels))  # the label locations
# width = 0.18  # the width of the bars

# fig, ax = plt.subplots()
# rects1 = ax.bar(x-width, accuracy, width, label='accuracy')
# rects2 = ax.bar(x, precision, width, label='precision')
# rects3 = ax.bar(x+width, recall, width, label='recall')
# # Add some text for labels, title and custom x-axis tick labels, etc.
# ax.set_ylabel('Scores')
# ax.set_title('Scores by different model')
# ax.set_xticks(x)
# ax.set_xticklabels(labels)
# ax.legend()
# fig.tight_layout()
# plt.show()
# fig1, ax1 = plt.subplots()
# ax1.plot(labels, f1score, label="f1score")
# ax1.plot(labels, rocauc, label="rocauc")
# ax1.legend()


kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

# to give model baseline report in dataframe 
def baseline_report(model, X_train, X_test, y_train, y_test, name):
    model.fit(X_train, y_train)
    accuracy     = np.mean(cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy'))#cv交叉验证折数
    precision    = np.mean(cross_val_score(model, X_train, y_train, cv=kf, scoring='precision'))
    recall       = np.mean(cross_val_score(model, X_train, y_train, cv=kf, scoring='recall'))
    f1score      = np.mean(cross_val_score(model, X_train, y_train, cv=kf, scoring='f1'))
    rocauc       = np.mean(cross_val_score(model, X_train, y_train, cv=kf, scoring='roc_auc'))
    y_pred = model.predict(X_test)
    logloss      = log_loss(y_test, y_pred)   # SVC & LinearSVC unable to use cvs

    df_model = pd.DataFrame({'model'        : [name],
                             'accuracy'     : [accuracy],
                             'precision'    : [precision],
                             'recall'       : [recall],
                             'f1score'      : [f1score],
                             'rocauc'       : [rocauc],
                             'logloss'      : [logloss],
                             'timetaken'    : [0]       })   # timetaken: to be used for comparison later
    return df_model

# to evaluate baseline models
gnb = GaussianNB()
bnb = BernoulliNB()
mnb = MultinomialNB()
logit = LogisticRegression()
knn = KNeighborsClassifier()
decisiontree = DecisionTreeClassifier()
randomforest = RandomForestClassifier()
svc = SVC()
linearsvc = LinearSVC()

# to concat all models
df_models = pd.concat([baseline_report(gnb, x_train, x_test, y_train, y_test, 'GaussianNB'),
                       baseline_report(bnb, x_train, x_test, y_train, y_test, 'BernoulliNB'),
                       baseline_report(mnb, x_train, x_test, y_train, y_test, 'MultinomialNB'),
                       baseline_report(logit, x_train, x_test, y_train, y_test, 'LogisticRegression'),
                       baseline_report(knn, x_train, x_test, y_train, y_test, 'KNN'),
                       baseline_report(decisiontree, x_train, x_test, y_train, y_test, 'DecisionTree'),
                       baseline_report(randomforest, x_train, x_test, y_train, y_test, 'RandomForest'),
                       baseline_report(svc, x_train, x_test, y_train, y_test, 'SVC'),
                       baseline_report(linearsvc, x_train, x_test, y_train, y_test, 'LinearSVC')
                       ], axis=0).reset_index()

df_models = df_models.drop('index', axis=1)
df_models

Train Result:
Accuracy Score: 86.79%
_______________________________________________
CLASSIFICATION REPORT:
              0      1  accuracy  macro avg  weighted avg
precision  0.88   0.86      0.87       0.87          0.87
recall     0.82   0.90      0.87       0.86          0.87
f1-score   0.85   0.88      0.87       0.87          0.87
support   97.00 115.00      0.87     212.00        212.00
_______________________________________________
Confusion Matrix: 
 [[ 80  17]
 [ 11 104]]

Test Result:
Accuracy Score: 86.81%
_______________________________________________
CLASSIFICATION REPORT:
              0     1  accuracy  macro avg  weighted avg
precision  0.87  0.87      0.87       0.87          0.87
recall     0.83  0.90      0.87       0.86          0.87
f1-score   0.85  0.88      0.87       0.87          0.87
support   41.00 50.00      0.87      91.00         91.00
_______________________________________________
Confusion Matrix: 
 [[34  7]
 [ 5 45]]

Train Result:
Accuracy Score: 

NameError: name 'GaussianNB' is not defined