In [1]:
import numpy as np 
import pandas as pd 

In [56]:
data = pd.read_csv("heart.csv")

print(data.shape)

X = data.drop('target', axis=1)  # X is all the data set except the target column
y = data['target']                  # y is the target column

print(data.isnull().sum()) # since there is no null we good to continue

(1190, 12)
age                    0
sex                    0
chest pain type        0
resting bp s           0
cholesterol            0
fasting blood sugar    0
resting ecg            0
max heart rate         0
exercise angina        0
oldpeak                0
ST slope               0
target                 0
dtype: int64


In [91]:
from sklearn.preprocessing import MinMaxScaler # this is normalization which sets number between ( 0 and 1)

#we can normalize the big numbers ,  features like 'resting bp s' , 'cholesterol' , and  'max heart rate'


scaler = MinMaxScaler()


data = scaler.fit_transform(data)

#reshape(-1,1) ensures that each value has a row of its own

print(data)



[[0.24489796 1.         0.33333333 ... 0.29545455 0.33333333 0.        ]
 [0.42857143 0.         0.66666667 ... 0.40909091 0.66666667 1.        ]
 [0.18367347 1.         0.33333333 ... 0.29545455 0.33333333 0.        ]
 ...
 [0.59183673 1.         1.         ... 0.43181818 0.66666667 1.        ]
 [0.59183673 0.         0.33333333 ... 0.29545455 0.66666667 1.        ]
 [0.20408163 1.         0.66666667 ... 0.29545455 0.33333333 0.        ]]


In [81]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.5, random_state=1)

# test_size = 0.5, using 50% of data for testing and 50% for training 
# random_state, ensures that it's not shuffling every time running code

In [82]:

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
cls = GaussianNB()      # initialize Naive bayes  model

cls.fit(X_train, Y_train)  # fit the model

y_pred = cls.predict(X_test)  # use X_test to predict

accuracyOfNaiveBayes = accuracy_score(Y_test, y_pred)  # use y test to check accuracy

In [88]:
from sklearn.tree import DecisionTreeClassifier
#NOW LETS CHECK THE ACCURACY FOR DECISION TREE

tree = DecisionTreeClassifier(criterion='gini')
tree.fit(X_train , Y_train)
y_pred = tree.predict(X_test)
accuracyOfTree = accuracy_score(Y_test,y_pred)


print(f"Naive Bayes:",accuracyOfNaiveBayes) #ACCURACY IS 0.848... 
print(f"Descision Tree:" ,accuracyOfTree)       #Accuracy of Tree is slightly better 0.8521008





Naive Bayes 0.8487394957983193
Descision Tree 0.8504201680672269


In [84]:
#WE CAN YOU GRID SEARCH TO TUNE HYPER PARAMATERS FOR Decision Tree , TO CHECK WHICH HYPER PARAMATER IS BEST FOR BOTH
from sklearn.model_selection import GridSearchCV
param_grid= {'criterion' : ['gini' , 'entropy'] , 'max_depth' : [ 2 , 3 , 40  ] , 'min_samples_leaf' : [1,2,4] ,
             'max_features' : [None , 'sqrt' , 'log2'] 
            } 
grid_search = GridSearchCV( tree , param_grid, cv = 5 , scoring =  'accuracy')

#cv stand from cross validation it seperates data into 5 folds then for each fold its tested on the remaining folds

grid_search.fit(X,y)
print("Best Parameters:" , grid_search.best_params_)
print("Best Score: " , grid_search.best_score_)

#as you can see we clearly improved the score by tunning bunch of hyperparamaters within the DecisionTree

Best Parameters: {'criterion': 'gini', 'max_depth': 40, 'max_features': 'log2', 'min_samples_leaf': 1}
Best Score:  0.8848739495798318


In [85]:
from sklearn.metrics import recall_score
#now lets evaluate another metric recall score
#it measures the actuall true that were correctly identified

best_clf_rs = grid_search.best_estimator_
#this returns the best estimator of the best decison tree we found


# predict the labels of test using the best estimator found by the grid search
y_pred_rs = best_clf_rs.predict(X_test)

# calculate the recall score
rs = recall_score(Y_test, y_pred_rs, average='weighted')
print("Recall Score:", rs)

Recall Score: 1.0


In [49]:
# NOW FOR REGRESSION
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#the higher the MSE means the farther away the predicted values from the actual values , which is worse 

reg = LinearRegression()

reg.fit(X_train , Y_train)

y_pred = reg.predict(X_test)

mse = mean_squared_error(Y_test , y_pred)
print(mse)


0.12787838345725322


In [54]:
#NOW LETS TUNE THE DEGREE OF THE LINEAR REGRESSION TO CHECK WHICH DEGREE IS BEST FOR THE MODEL
from sklearn.preprocessing import PolynomialFeatures

degrees = [1,2,3,4]

for i ,degree in enumerate(degrees , start = 1):
    polyF = PolynomialFeatures(degree = degree)
    X_train_poly = polyF.fit_transform(X_train)  # we learn from data then transform it to the specific degree
    X_test_poly = polyF.transform(X_test)        # since we already learned we use transform only 
    reg.fit(X_train_poly, Y_train)
    y_pred = reg.predict(X_test_poly)
    mse = mean_squared_error(Y_test , y_pred)
    print(f"degree{i}" , mse)

#degree 2 is the best since its mse the lowest


degree1 0.12787838345725336
degree2 0.12496489162542923
degree3 0.6011377159647869
degree4 67.59509961664598
