### Simple project to compare different machine learning algorithms on a small dataset using sklearn

## Importing the libraries

In [19]:
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

## Reading and exploring the dataset

In [2]:
# Load the dataset; the path is resolved relative to the notebook's working directory.
csv_path = 'heart.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows (DataFrame.head returns 5 rows by default).
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Report how many rows the dataset contains.
print(f'Length of the Dataset: {len(df)}')

Length of the Dataset: 303


## Check for missing values in the dataset

In [6]:
# Count the missing (NaN) entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

## Split the dataset into features and labels

In [8]:
# Select the label by name rather than by position, so the split stays correct
# even if the column order of the CSV changes. Every other column is a feature.
x = df.drop(columns='output')
y = df['output']

## Split the dataset into train and test sets

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Scaling the data

In [13]:
# Learn the scaling statistics from the training set only, then apply that same
# transformation to both sets so the test data never influences the fit.
sc_x = StandardScaler().fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

## Logistic Regression

In [14]:
# Fit a logistic-regression classifier on the scaled training data
# (fit returns the estimator itself, so the call can be chained).
model_logistic = LogisticRegression(random_state=0).fit(x_train, y_train)

# Mean accuracy on the training set, then on the held-out test set.
logistic_train_acc = model_logistic.score(x_train, y_train)
logistic_test_acc = model_logistic.score(x_test, y_test)
print(logistic_train_acc, logistic_test_acc, sep='\n')

0.8429752066115702
0.8524590163934426


## Decision Tree

In [15]:
# Seed the tree: scikit-learn permutes the candidate features at every split, so
# without a fixed random_state ties between equally good splits can be broken
# differently on each run and the scores printed here would not be reproducible.
# The seed value matches the one used for the other seeded models in this notebook.
model_tree = DecisionTreeClassifier(random_state=0)
model_tree.fit(x_train, y_train)

# Mean accuracy on the training set, then on the held-out test set.
print(model_tree.score(x_train, y_train))
print(model_tree.score(x_test, y_test))

1.0
0.7540983606557377


## Random Forest

In [16]:
# Random forest of 9 trees that split on the entropy criterion, seeded so the
# ensemble is reproducible.
model_forest = RandomForestClassifier(
    n_estimators=9,
    criterion='entropy',
    random_state=0,
).fit(x_train, y_train)

# Mean accuracy on the training set, then on the held-out test set.
forest_train_acc = model_forest.score(x_train, y_train)
forest_test_acc = model_forest.score(x_test, y_test)
print(forest_train_acc, forest_test_acc, sep='\n')

0.9917355371900827
0.8524590163934426


## SVC

In [17]:
# Support vector classifier with an RBF kernel and regularization parameter C = 2.0.
model_svc = SVC(C=2.0, kernel='rbf').fit(x_train, y_train)

# Mean accuracy on the training set, then on the held-out test set.
svc_train_acc = model_svc.score(x_train, y_train)
svc_test_acc = model_svc.score(x_test, y_test)
print(svc_train_acc, svc_test_acc, sep='\n')

0.9462809917355371
0.8852459016393442


## K Neighbors

In [18]:
# 3-nearest-neighbours classifier using the Minkowski metric:
# p=1 gives the Manhattan distance, p=2 gives the Euclidean distance.
model_knn = KNeighborsClassifier(
    n_neighbors=3,
    metric='minkowski',
    p=2,
).fit(x_train, y_train)

# Mean accuracy on the training set, then on the held-out test set.
knn_train_acc = model_knn.score(x_train, y_train)
knn_test_acc = model_knn.score(x_test, y_test)
print(knn_train_acc, knn_test_acc, sep='\n')

0.8677685950413223
0.8360655737704918
