In [None]:
# Import all the tools we need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from warnings import filterwarnings
filterwarnings(action='ignore')

# Load data

In [None]:
# Read the heart-disease dataset from the CSV file in the working directory
df = pd.read_csv("heart-disease.csv")
# Show (rows, columns) as the cell output
df.shape

In [None]:
# Preview the first five rows of the dataset
print("Head Values:")
df.head(n=5)

In [None]:
# Preview the last five rows of the dataset
print("Tail Values:")
df.tail(n=5)

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Summary statistics of the dataframe's columns
df.describe()

In [None]:
# Let's find out how many samples there are of each target class
df.target.value_counts()

In [None]:
# Bar chart of the target class counts (trailing ';' hides the Axes repr)
df["target"].value_counts().plot.bar(color=["red", "blue"]);

In [None]:
# How many samples there are for each value of the sex column
df["sex"].value_counts()

In [None]:
# Bar chart of the sex column counts (trailing ';' hides the Axes repr)
df["sex"].value_counts().plot.bar(color=["Black", "Pink"]);

In [None]:
# Cross-tabulate target against sex and draw it as a grouped bar chart
sex_by_target = pd.crosstab(index=df["target"], columns=df["sex"])
sex_by_target.plot.bar(figsize=(10, 6), color=["pink", "blue"])

# Annotate the current axes
plt.legend(["female", "male"])
plt.ylabel("Amount")
plt.xlabel("0 = no disease, 1 = disease")
plt.title("heart disease frequency for sex")
plt.xticks(rotation=0);

In [None]:
# Histogram of the age column to inspect its distribution
df["age"].plot(kind="hist");

In [None]:
# Cross-tabulate chest pain type against target and draw it as a grouped bar chart
cp_by_target = pd.crosstab(index=df["cp"], columns=df["target"])
cp_by_target.plot.bar(figsize=(10, 6), color=["salmon", "lightblue"])

# Label the chart so it can be read on its own
plt.legend(["No Disease", "Disease"])
plt.ylabel("Amount")
plt.xlabel("Chest Pain Type")
plt.title("heart disease frequency per chest pain type")
plt.xticks(rotation=0);

In [None]:
# Pairwise Pearson correlation between all columns, shown as a table
df.corr(method="pearson")

In [None]:
# Draw the correlation matrix as an annotated heatmap
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix,
            annot=True,       # print the coefficient inside every cell
            linewidths=0.5,
            fmt=".2f",
            cmap="YlGnBu",
            ax=ax)            # draw on the axes created above, not the implicit current one
# Pin the y-limits to the full matrix height (row 0 at the top) instead of
# shifting whatever limits are present by +/-0.5: the shift is only right when
# the top and bottom rows are clipped by half a cell, and otherwise adds blank
# bands above and below the heatmap. Explicit limits are right in both cases.
ax.set_ylim(corr_matrix.shape[0], 0);

In [None]:
# Number of samples per age value.
# The column is passed by keyword: a positional first argument is bound to
# `data` rather than `x` by current seaborn releases.
sns.countplot(x="age", data=df)
plt.show()

In [None]:
# Number of samples per chest pain type.
# The column is passed by keyword: a positional first argument is bound to
# `data` rather than `x` by current seaborn releases.
sns.countplot(x="cp", data=df)
plt.show()

In [None]:
# Kernel density estimate of oldpeak, restricted to rows where oldpeak > 2
sns.kdeplot(df.loc[df["oldpeak"] > 2, "oldpeak"])

In [None]:
# Distribution of thalach: density-normalised histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot; stat="density" keeps the
# y-axis on the density scale distplot used when drawing a KDE.
# NOTE(review): requires seaborn >= 0.11; bin edges may differ slightly from distplot.
sns.histplot(data=df, x="thalach", kde=True, stat="density")

In [None]:
# One box plot per column, arranged on a 4x4 grid with independent x-axes
df.plot.box(subplots=True, layout=(4, 4), sharex=False)

In [None]:
# One density (KDE) plot per column, arranged on a 4x4 grid with independent x-axes
df.plot.density(subplots=True, layout=(4, 4), sharex=False)

In [None]:
# Histogram of every column, 50 bins each, on a 10x10 inch figure
df.hist(bins=50, figsize=(10, 10))
plt.show()

In [None]:
# Scatter-plot matrix of every pair of columns
sns.pairplot(data=df)

In [None]:
# Age distribution for each value of ca
sns.violinplot(data=df, x="ca", y="age")

In [None]:
# Separate the features (every column except "target") from the labels
Y = df.target
X = df.drop(columns="target")
X

In [None]:
# Display the label series Y
Y

In [None]:
# split data into train and test sets
from sklearn.model_selection import train_test_split, cross_val_score

# Seed NumPy's global generator for any later code that draws from it
np.random.seed(42)

# Hold out 20% of the rows for testing. random_state is passed explicitly so
# the split does not depend on the global RNG state, i.e. on which cells were
# run, or re-run, before this one.
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

In [None]:
# First five rows of the training features
X_train.head(n=5)

In [None]:
# First five training labels
Y_train.head(n=5)

## Using Logistic Regression:

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix

# Fit a logistic regression with default hyperparameters, then predict the test set
model = LogisticRegression().fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Fraction of test samples classified correctly
print("Accuracy Score:", accuracy_score(y_true=Y_test, y_pred=Y_pred))

In [None]:
# Confusion matrix of the Y_pred predictions against the true test labels
confusion_mat = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(confusion_mat)

## Using KNN:

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbours classifier, then predict the test set
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
y_pred = model.predict(X_test)

# Fraction of test samples classified correctly
print("Accuracy Score:", accuracy_score(y_true=Y_test, y_pred=y_pred))

## Using SVC:

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Fit a support vector classifier with default hyperparameters, then predict the test set
model = SVC().fit(X_train, Y_train)
pred_y = model.predict(X_test)

# Fraction of test samples classified correctly
print("Accuracy Score:", accuracy_score(y_true=Y_test, y_pred=pred_y))

## Using DecisionTree:

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Fit an entropy-criterion decision tree with a fixed seed, then predict the test set
model = DecisionTreeClassifier(criterion='entropy', random_state=7).fit(X_train, Y_train)
y_pred = model.predict(X_test)

# Fraction of test samples classified correctly
print("Accuracy Score:", accuracy_score(y_true=Y_test, y_pred=y_pred))

## Using GaussianNB:

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier, then predict the test set
model3 = GaussianNB().fit(X_train, Y_train)
y_pred3 = model3.predict(X_test)

# Fraction of test samples classified correctly
print("Accuracy Score:", accuracy_score(y_true=Y_test, y_pred=y_pred3))

## Using RandomForest:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit a random forest with a fixed seed, then predict the test set
model2 = RandomForestClassifier(random_state=1).fit(X_train, Y_train)
y_pred2 = model2.predict(X_test)

# Fraction of test samples classified correctly
print("Accuracy Score:", accuracy_score(y_true=Y_test, y_pred=y_pred2))

## Using Xgboost:

In [2]:
# xgboost is a separate package; this import fails with ModuleNotFoundError
# when it is not installed in the kernel's environment
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Fit a gradient-boosted tree classifier with a fixed seed, then predict the test set
model5 = xgb.XGBClassifier(random_state=1).fit(X_train, Y_train)
y_pred5 = model5.predict(X_test)

# Fraction of test samples classified correctly
print("Accuracy Score:", accuracy_score(y_true=Y_test, y_pred=y_pred5))

ModuleNotFoundError: No module named 'xgboost'

In [1]:
# Bar chart comparing one score per classifier.
# NOTE(review): the values are hard-coded and presumably are the accuracy scores
# printed by the model cells, as percentages - confirm, and update them by hand
# whenever a model or the train/test split changes.
# Requires the imports cell at the top of the notebook to have been run (uses plt).
model_names = ['Log-Reg', 'KNN', 'SVC', 'Des-Tree', 'Gaus-NB', 'RandomForest', 'Xgboost']
accuracies = [88.52,65.57,70.49,81.96,86.88,85.24,85.24]

# plt.subplots leaves room for tick labels and titles, unlike axes that fill
# the whole figure area
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(model_names, accuracies)
ax.set_title("Accuracy by model")
ax.set_xlabel("Model")
ax.set_ylabel("Accuracy (%)")
plt.show()

NameError: name 'plt' is not defined

# Hence, we will use the Logistic Regression algorithm to train our model.