#### This dataset from [kaggle](https://www.kaggle.com/datasets/dileep070/heart-disease-prediction-using-logistic-regression/data) is used to predict whether a patient has a 10-year risk of coronary heart disease, based on the patient's information and health history.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Read the heart-disease CSV from the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer="framingham.csv")

In [3]:
# Preview the first five rows to get a feel for the columns and value ranges.
dataset.head(n=5)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
# Summarise each column's dtype and non-null count to spot columns with missing data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   education        4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  TenYearCHD       4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 529.9 KB


In [5]:
# Count the missing values in every column (`isna` is the same check as `isnull`).
dataset.isna().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [6]:
# Drop `education`: we assume it has little bearing on a patient's health history.
# Reassigning (instead of dropping in place) and ignoring an already-missing
# column lets this cell be re-run without raising a KeyError.
dataset = dataset.drop(columns="education", errors="ignore")

In [7]:
# List any object-dtype columns; an empty Index means there are none.
dataset.select_dtypes(include=["object"]).columns

Index([], dtype='object')

In [8]:
#For the other columns that contain null values, rather than dropping them we fill the gaps with the mean of each column.

In [9]:
# Separate the features from the target by column name rather than by position,
# so the split stays correct even if the column order of the frame changes.
# `TenYearCHD` is the label; every remaining column is a feature.
TARGET_COLUMN = "TenYearCHD"
x = dataset.drop(columns=TARGET_COLUMN).to_numpy()
y = dataset[TARGET_COLUMN].to_numpy()

In [10]:
# Show the feature matrix; NumPy elides the middle of large arrays, so this is only a preview.
print(x)

[[  1.    39.     0.   ...  26.97  80.    77.  ]
 [  0.    46.     0.   ...  28.73  95.    76.  ]
 [  1.    48.     1.   ...  25.34  75.    70.  ]
 ...
 [  0.    48.     1.   ...  22.    84.    86.  ]
 [  0.    44.     1.   ...  19.16  86.      nan]
 [  0.    52.     0.   ...  21.47  80.   107.  ]]


In [11]:
# Total number of missing entries across the whole feature matrix.
np.sum(np.isnan(x))

540

In [12]:
# Fill every missing feature value with the mean of its column.
# NOTE(review): the imputer is fitted on the full matrix before the train/test
# split that follows, so test rows contribute to the means — consider fitting
# it on the training split only.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")  # missing_values defaults to NaN
imputer.fit(x)
x = imputer.transform(x)

In [13]:
# Re-count the missing entries in the feature matrix.
np.sum(np.isnan(x))

0

In [14]:
#Let's obtain our training and test sets

from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y — consider `stratify=y` if the
# classes are imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=101,
)

In [16]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that same
# transformation to both splits so the scaler never sees the test rows.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [17]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyperparameters; only the seed is pinned.
classifier = LogisticRegression(random_state=0)

In [18]:
# Train on the training split, then predict labels for the held-out rows.
# `fit` returns the estimator itself, so the two steps can be chained.
predictions = classifier.fit(x_train, y_train).predict(x_test)

In [19]:
# Put predicted and actual labels side by side for a quick visual comparison.
compare = dict(y_predictions=predictions, real_y=y_test)
y_comparisons = pd.DataFrame(compare)

In [20]:
# Show the first five prediction/actual pairs.
y_comparisons.head(n=5)

Unnamed: 0,y_predictions,real_y
0,0,0
1,0,1
2,0,0
3,0,0
4,0,0


In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Print the confusion matrix (rows = actual class, columns = predicted class),
# then leave the overall accuracy as the cell's displayed value.
# NOTE(review): in the recorded output only 6 of the 107 actual positives are
# predicted positive, so accuracy alone overstates how useful the model is —
# consider also reporting recall/precision.
print(confusion_matrix(y_true=y_test, y_pred=predictions))
accuracy_score(y_true=y_test, y_pred=predictions)

[[733   8]
 [101   6]]


0.8714622641509434