<a href="https://colab.research.google.com/github/patrick251s/MachineLearning/blob/main/heart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the heart-disease dataset from the CSV file in the working directory,
# then show its dimensions followed by its first rows as a quick sanity check.
csv_path = "heart_2020_cleaned.csv"
data = pd.read_csv(csv_path)
for preview in (data.shape, data.head()):
    print(preview)

(319795, 18)
  HeartDisease    BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0           No  16.60     Yes              No     No             3.0   
1           No  20.34      No              No    Yes             0.0   
2           No  26.58     Yes              No     No            20.0   
3           No  24.21      No              No     No             0.0   
4           No  23.71      No              No     No            28.0   

   MentalHealth DiffWalking     Sex  AgeCategory   Race Diabetic  \
0          30.0          No  Female        55-59  White      Yes   
1           0.0          No  Female  80 or older  White       No   
2          30.0          No    Male        65-69  White      Yes   
3           0.0          No  Female        75-79  White       No   
4           0.0         Yes  Female        40-44  White       No   

  PhysicalActivity  GenHealth  SleepTime Asthma KidneyDisease SkinCancer  
0              Yes  Very good        5.0    Yes            No        Y

In [None]:
# Distinct BMI values present in the dataset.
print(data["BMI"].unique())

# Boolean Series indexed by column name: True where the column holds any NaN.
columns_with_nan = data.isna().any()
print(columns_with_nan)

# For every column, print its name, its distinct values and a separator line.
for column in data.columns:
    print(
        column,
        data[column].unique(),
        "=====================================================================",
        sep="\n",
    )

[16.6  20.34 26.58 ... 62.42 51.46 46.56]
HeartDisease        False
BMI                 False
Smoking             False
AlcoholDrinking     False
Stroke              False
PhysicalHealth      False
MentalHealth        False
DiffWalking         False
Sex                 False
AgeCategory         False
Race                False
Diabetic            False
PhysicalActivity    False
GenHealth           False
SleepTime           False
Asthma              False
KidneyDisease       False
SkinCancer          False
dtype: bool
HeartDisease
['No' 'Yes']
BMI
[16.6  20.34 26.58 ... 62.42 51.46 46.56]
Smoking
['Yes' 'No']
AlcoholDrinking
['No' 'Yes']
Stroke
['No' 'Yes']
PhysicalHealth
[ 3.  0. 20. 28.  6. 15.  5. 30.  7.  1.  2. 21.  4. 10. 14. 18.  8. 25.
 16. 29. 27. 17. 24. 12. 23. 26. 22. 19.  9. 13. 11.]
MentalHealth
[30.  0.  2.  5. 15.  8.  4.  3. 10. 14. 20.  1.  7. 24.  9. 28. 16. 12.
  6. 25. 17. 18. 21. 29. 22. 13. 23. 27. 26. 11. 19.]
DiffWalking
['No' 'Yes']
Sex
['Female' 'Male']
AgeCate

In [None]:
# Columns that are expanded into one indicator column per distinct value.
columnsForOneHot = [
    "Smoking",
    "AlcoholDrinking",
    "Stroke",
    "DiffWalking",
    "Sex",
    "AgeCategory",
    "Race",
    "Diabetic",
    "PhysicalActivity",
    "GenHealth",
    "Asthma",
    "KidneyDisease",
    "SkinCancer",
]
encoded_data = pd.get_dummies(data, columns=columnsForOneHot)

# Original author's note: a NaN appeared, so rows containing any NaN are dropped.
cleaned_data = encoded_data.dropna()

# Preview the cleaned frame, then print the shapes before and after dropping
# rows so the number of removed rows can be read off.
print(cleaned_data.head())
for frame in (encoded_data, cleaned_data):
    print(frame.shape)

  HeartDisease    BMI  PhysicalHealth  MentalHealth  SleepTime  Smoking_No  \
0           No  16.60             3.0          30.0        5.0           0   
1           No  20.34             0.0           0.0        7.0           1   
2           No  26.58            20.0          30.0        8.0           0   
3           No  24.21             0.0           0.0        6.0           1   
4           No  23.71            28.0           0.0        8.0           1   

   Smoking_Yes  AlcoholDrinking_No  AlcoholDrinking_Yes  Stroke_No  ...  \
0            1                   1                    0          1  ...   
1            0                   1                    0          0  ...   
2            1                   1                    0          1  ...   
3            0                   1                    0          1  ...   
4            0                   1                    0          1  ...   

   GenHealth_Fair  GenHealth_Good  GenHealth_Poor  GenHealth_Very good  \
0     

In [None]:
from sklearn.model_selection import train_test_split

# Separate the column we want to predict (the target) from the feature columns.
y = cleaned_data["HeartDisease"]
X = cleaned_data.drop(columns="HeartDisease")

# Split into training and test data: 20% of the rows are held out for testing,
# and the seed is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report, for each column of the cleaned frame, whether it contains any NaN.
columns_with_nan = cleaned_data.isna().any()
print(columns_with_nan)



HeartDisease                           False
BMI                                    False
PhysicalHealth                         False
MentalHealth                           False
SleepTime                              False
Smoking_No                             False
Smoking_Yes                            False
AlcoholDrinking_No                     False
AlcoholDrinking_Yes                    False
Stroke_No                              False
Stroke_Yes                             False
DiffWalking_No                         False
DiffWalking_Yes                        False
Sex_Female                             False
Sex_Male                               False
AgeCategory_18-24                      False
AgeCategory_25-29                      False
AgeCategory_30-34                      False
AgeCategory_35-39                      False
AgeCategory_40-44                      False
AgeCategory_45-49                      False
AgeCategory_50-54                      False
AgeCategor

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialise the logistic-regression model; the iteration cap is raised to 1000.
model = LogisticRegression(max_iter=1000)

# Train the model on the training data.
model.fit(X=X_train, y=y_train)

# Predict labels for the held-out test data.
y_pred = model.predict(X=X_test)


In [None]:
from sklearn.metrics import accuracy_score
# Share of test rows whose predicted label equals the true label.
print(accuracy_score(y_test, y_pred))

# First row of the fitted coefficient matrix, one value per feature column.
feature_importance = model.coef_[0]

# Pair each feature name ("Cecha" = feature) with its coefficient
# ("Wpływ" = influence) and list them from the largest coefficient down.
fi_df = pd.DataFrame(
    {
        "Cecha": X_train.columns,
        "Wpływ": feature_importance,
    }
)
sorted_fi = fi_df.sort_values("Wpływ", ascending=False)
print(sorted_fi)

0.9138666958520302
                                  Cecha     Wpływ
26              AgeCategory_80 or older  1.582037
25                    AgeCategory_75-79  1.325327
24                    AgeCategory_70-74  1.114092
42                       GenHealth_Poor  0.855058
23                    AgeCategory_65-69  0.843587
22                    AgeCategory_60-64  0.597148
40                       GenHealth_Fair  0.471016
21                    AgeCategory_55-59  0.346948
9                            Stroke_Yes  0.338296
35                         Diabetic_Yes  0.200637
13                             Sex_Male  0.183979
27  Race_American Indian/Alaskan Native  0.162777
47                    KidneyDisease_Yes  0.108049
20                    AgeCategory_50-54  0.101904
32                           Race_White  0.067474
31                           Race_Other  0.048258
0                                   BMI  0.008615
2                          MentalHealth  0.003982
1                        Physic

In [None]:
from sklearn import tree

# Initialise the decision-tree model. The seed is fixed (same value as the
# train/test split) because scikit-learn randomly permutes the candidate
# features at each split; without it the fitted tree, and therefore the
# accuracy reported below, can differ from one run to the next.
model2 = tree.DecisionTreeClassifier(random_state=42)

# Train the model on the training data.
model2.fit(X_train, y_train)

# Predict labels for the test set.
predictions = model2.predict(X_test)

# Share of test rows predicted correctly; the bare name on the last line makes
# the notebook display the value as the cell output.
accuracy2 = accuracy_score(y_test, predictions)
accuracy2

0.8639440891821323