In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
!curl -o heart.csv https://maktabkhooneh.org/media/courses/projects/16325569497737558.csv
df = pd.read_csv("heart.csv")

df.head()

# Checking data type and size

In [None]:
# Report the number of rows and columns of the loaded frame.
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns')

In [None]:
# Display the dtype of every column.
df.dtypes

In [None]:
# All columns have numeric dtypes.

# Visualize the data

In [None]:
# Draw a 10-bin histogram for every column, each on its own figure.
for column in df.columns:
    fig, ax = plt.subplots()
    df[column].plot(kind='hist', edgecolor='black', bins=10, ax=ax)
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Plot the correlation of every column with the target column 'output'
import seaborn as sns

correlation_matrix = df.corr()

plt.figure(figsize=(10, 8))
# Only the 'output' column of the correlation matrix is drawn, so the heatmap
# is a single column of column-vs-target correlations.
sns.heatmap(correlation_matrix[['output']], annot=True, cmap='coolwarm', linewidths=0.5)
# The plotted column is 'output'; the previous title mentioned a price that
# does not appear in this notebook.
plt.title('Correlation with output')
plt.show()

In [None]:
# Scatter 'age' against 'thalachh' for the first 50 rows of each class,
# both classes drawn on the same axes.
low_chance_rows = df[df['output'] == 0].iloc[:50]
high_chance_rows = df[df['output'] == 1].iloc[:50]

ax = low_chance_rows.plot(kind='scatter', x='thalachh', y='age', color='Blue', label='low chance')
high_chance_rows.plot(kind='scatter', x='thalachh', y='age', color='Red', label='high chance', ax=ax)
plt.show()

In [None]:
# Columns used as model inputs.
feature_columns = [
    'age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg',
    'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall',
]
feature_df = df[feature_columns]

# Convert to a NumPy array and preview the first five rows.
X = np.asarray(feature_df)
X[:5]

In [None]:
# Target vector: the 'output' column as a NumPy array; preview the first five values.
y = np.asarray(df.loc[:, 'output'])
y[:5]

## Train/Test dataset

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

# Modelling

## 1. SVM with 4 different kernels

In [None]:
from sklearn.metrics import f1_score
from sklearn import svm

# Fit one SVC per kernel on the training split and compare them on the test split.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for ker in kernels:
    clf = svm.SVC(kernel=ker) # clf stands for "classifier" (commonly used abbreviation in ML literature)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Evaluate the model. f1_score returns the F1 score, not the accuracy,
    # so the variable and the printed label say "F1 score".
    f1 = f1_score(y_test, y_pred)
    print(f'Model with {ker} kernel - F1 score: {f1}')

### The best model is obtained with the 'linear' kernel

In [None]:
# Refit the SVM with the 'linear' kernel; the fitted estimator stays in `clf`.
clf = svm.SVC(kernel='linear')
clf.fit(X=X_train, y=y_train)

## Prediction (SVM)

In [None]:
# Predict on the test split and show the first 20 predictions next to the first 20 true labels.
y_pred = clf.predict(X_test)
for caption, values in (('y_predicted:', y_pred), ('y_actual   :', y_test)):
    print(caption, values[:20])

## Evaluation by Jaccard metric and confusion matrix (SVM)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated image on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square 2-D matrix. Rows are drawn along the y axis (labelled
        'True Chance') and columns along the x axis (labelled
        'Predicted Chance').
    classes : sequence of str
        Tick labels, one per row/column of `cm`, in the same order.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting, and cells are annotated with two decimals. A row whose sum
        is zero is left as all zeros instead of producing NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to `plt.imshow`.

    Returns
    -------
    None. The function prints the matrix and draws on the current figure; it
    does not call `plt.show()`.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guard against rows that sum to zero: dividing by 1 keeps such a row
        # at zero rather than turning it into NaN.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for normalized ratios, plain integers for raw counts.
    fmt = '.2f' if normalize else 'd'
    # Cells above half the maximum get white text so the annotation stays
    # readable on the darker end of the colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True Chance')
    plt.xlabel('Predicted Chance')
    # tight_layout is called last so that the axis labels set above are
    # included when the layout is computed.
    plt.tight_layout()

In [None]:
# Compute the confusion matrix for the SVM predictions (classes ordered as 0, 1)
cnf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
np.set_printoptions(precision=2)

print(classification_report(y_test, y_pred))

# Plot non-normalized confusion matrix
plt.figure()
# Tick labels follow labels=[0, 1] above, so the second class is 1 (it was
# previously shown as "(2)").
plot_confusion_matrix(
    cnf_matrix,
    classes=['Low chance(0)', 'High chance(1)'],
    normalize=False,
    title="Confusion matrix for svm with the kernel 'linear'",
)

In [None]:
from sklearn.metrics import jaccard_score

# Jaccard score with class 1 treated as the positive label.
jaccard_score(y_true=y_test, y_pred=y_pred, pos_label=1)

In [None]:
# Jaccard score with class 0 treated as the positive label.
jaccard_score(y_true=y_test, y_pred=y_pred, pos_label=0)

## 2. Logistic Regression with 4 different solvers

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit one logistic regression per solver and compare them on the test split.
# NOTE(review): X_train is passed unscaled; scikit-learn documents that 'sag'
# and 'saga' only converge quickly on features with similar scale, so these two
# may emit a ConvergenceWarning here — confirm and consider scaling.
solvers = ['liblinear', 'newton-cg', 'sag', 'saga']
for slv in solvers:
    # C=0.01 is used here; scikit-learn's default is C=1.0.
    LR = LogisticRegression(C=0.01, solver=slv).fit(X_train, y_train)
    y_pred = LR.predict(X_test)

    # Evaluate the model. f1_score returns the F1 score, not the accuracy,
    # so the variable and the printed label say "F1 score".
    f1 = f1_score(y_test, y_pred)
    print(f'Model with {slv} solver - F1 score: {f1}')


### The best model is obtained with the 'liblinear' solver

## Prediction(LogReg)

In [None]:
# Refit with the 'liblinear' solver, then print the first 20 predictions
# followed by the first 20 true labels.
LR = LogisticRegression(C=0.01, solver='liblinear')
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
print(y_pred[:20])
print(y_test[:20])

In [None]:
# Class probability estimates for the test split; preview the first 20 rows.
y_pred_prob = LR.predict_proba(X=X_test)
y_pred_prob[:20]

## Evaluation by classification report and confusion matrix (LogReg)

In [None]:
# Compute the confusion matrix for the logistic-regression predictions (classes ordered as 0, 1)
cnf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
np.set_printoptions(precision=2)

print(classification_report(y_test, y_pred))

# Plot non-normalized confusion matrix
plt.figure()
# Tick labels follow labels=[0, 1] above, so the second class is 1 (it was
# previously shown as "(2)").
plot_confusion_matrix(
    cnf_matrix,
    classes=['Low chance(0)', 'High chance(1)'],
    normalize=False,
    title="Conf. matrix for Log. Reg. with the solver 'liblinear'",
)

## 3. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
# Fit one depth-4 decision tree per split criterion and compare them on the test split.
criterions = ['gini', 'entropy']
for crit in criterions:

    # random_state is pinned (same seed as the train/test split) so that
    # refitting gives the same tree on every run.
    heartTree = DecisionTreeClassifier(criterion=crit, max_depth=4, random_state=4)

    heartTree.fit(X_train, y_train)
    y_pred = heartTree.predict(X_test)

    # Evaluate the model. f1_score returns the F1 score, not the accuracy,
    # so the variable and the printed label say "F1 score".
    f1 = f1_score(y_test, y_pred)
    print(f'Model with {crit} criterion - F1 score: {f1}')

# After the loop, heartTree and y_pred belong to the last criterion ('entropy').

In [None]:
# y_pred is left over from the last iteration of the criterion loop
# ('entropy'), so this is the accuracy of that tree only.
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_test, y_pred))


In [None]:
# Compute the confusion matrix for the decision-tree predictions (classes ordered as 0, 1)
cnf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
np.set_printoptions(precision=2)

print(classification_report(y_test, y_pred))

# Plot non-normalized confusion matrix
plt.figure()
# Tick labels follow labels=[0, 1] above, so the second class is 1 (it was
# previously shown as "(2)").
plot_confusion_matrix(
    cnf_matrix,
    classes=['Low chance(0)', 'High chance(1)'],
    normalize=False,
    title="Conf. matrix for Decision Tree Classifier",
)

In [None]:
# NOTE(review): `metrics` is already imported in the Decision Tree cell, and
# this print repeats the earlier accuracy cell; y_pred is not reassigned in
# between, so the same value is printed again.
from sklearn import metrics
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_test, y_pred))