<a href="https://colab.research.google.com/github/plavez/Jupyter/blob/ML/heart_attack_risk_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


## Loading Data

In [3]:
# Read the raw heart-attack-risk records from the CSV in the working directory.
data_frame = pd.read_csv('heart_attack_risk_dataset.csv')

# Preview the first eight rows to check that the columns were parsed as expected.
data_frame.head(n=8)

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Chest_Pain_Type,Thalassemia,Fasting_Blood_Sugar,ECG_Results,Exercise_Induced_Angina,Max_Heart_Rate_Achieved,Heart_Attack_Risk
0,69,Female,1,0,Moderate,34.61,1,0,152.1,171,85,0,Moderate,Non-anginal,Reversible defect,0,Normal,0,114,Low
1,32,Male,0,0,Moderate,22.75,0,0,166.8,126,103,0,Low,Asymptomatic,Normal,0,ST-T abnormality,0,173,Moderate
2,89,Male,0,1,Moderate,35.32,0,0,272.3,123,127,0,Low,Typical,Reversible defect,0,ST-T abnormality,0,109,Low
3,78,Male,0,1,Moderate,18.23,1,0,237.7,144,125,0,Low,Typical,Fixed defect,1,Left Ventricular Hypertrophy,0,129,Low
4,38,Female,1,0,Moderate,19.82,0,0,207.7,123,107,0,High,Asymptomatic,Reversible defect,0,ST-T abnormality,0,124,Moderate
5,41,Male,0,1,Moderate,36.11,0,0,271.2,141,119,0,Low,Atypical,Fixed defect,0,Normal,0,101,High
6,20,Male,1,0,Low,15.12,0,0,164.8,154,67,0,Low,Non-anginal,Normal,0,Normal,0,176,Low
7,39,Female,0,0,Moderate,20.78,0,0,297.0,91,112,0,Low,Atypical,Normal,0,ST-T abnormality,1,134,Moderate


In [4]:
# Show the dtype pandas inferred for every column (object columns still need encoding).
data_frame.dtypes

Unnamed: 0,0
Age,int64
Gender,object
Smoking,int64
Alcohol_Consumption,int64
Physical_Activity_Level,object
BMI,float64
Diabetes,int64
Hypertension,int64
Cholesterol_Level,float64
Resting_BP,int64


## Data Cleaning

In [5]:
# Count missing values per column before any cleaning is applied.
data_frame.isna().sum()

Unnamed: 0,0
Age,0
Gender,0
Smoking,0
Alcohol_Consumption,0
Physical_Activity_Level,0
BMI,0
Diabetes,0
Hypertension,0
Cholesterol_Level,0
Resting_BP,0


In [6]:
# Replace every missing value with 0.0.
# NOTE(review): the null counts displayed above are all zero for the columns
# shown, so this is presumably a no-op; a blanket 0.0 would also be written into
# text columns if they ever contained gaps - confirm this is intended.
data_frame = data_frame.fillna(value=0.0)
data_frame.head()

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Chest_Pain_Type,Thalassemia,Fasting_Blood_Sugar,ECG_Results,Exercise_Induced_Angina,Max_Heart_Rate_Achieved,Heart_Attack_Risk
0,69,Female,1,0,Moderate,34.61,1,0,152.1,171,85,0,Moderate,Non-anginal,Reversible defect,0,Normal,0,114,Low
1,32,Male,0,0,Moderate,22.75,0,0,166.8,126,103,0,Low,Asymptomatic,Normal,0,ST-T abnormality,0,173,Moderate
2,89,Male,0,1,Moderate,35.32,0,0,272.3,123,127,0,Low,Typical,Reversible defect,0,ST-T abnormality,0,109,Low
3,78,Male,0,1,Moderate,18.23,1,0,237.7,144,125,0,Low,Typical,Fixed defect,1,Left Ventricular Hypertrophy,0,129,Low
4,38,Female,1,0,Moderate,19.82,0,0,207.7,123,107,0,High,Asymptomatic,Reversible defect,0,ST-T abnormality,0,124,Moderate


In [7]:
# Re-count missing values per column to confirm none remain after the fill step.
data_frame.isna().sum()

Unnamed: 0,0
Age,0
Gender,0
Smoking,0
Alcohol_Consumption,0
Physical_Activity_Level,0
BMI,0
Diabetes,0
Hypertension,0
Cholesterol_Level,0
Resting_BP,0


## Column Data Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

# Text-valued feature columns that are converted to integer codes.
categorical_columns = [
    'Gender',
    'Physical_Activity_Level',
    'Stress_Level',
    'Chest_Pain_Type',
    'Thalassemia',
    'ECG_Results',
]

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last one, making it impossible to translate
# codes back to labels (label_encoders[col].inverse_transform) later on.
label_encoders = {}
for col in categorical_columns:
    # NOTE: LabelEncoder numbers labels in sorted order, so ordered levels such as
    # Stress_Level come out as High=0, Low=1, Moderate=2 rather than by severity.
    label_encoder = LabelEncoder()
    data_frame[col] = label_encoder.fit_transform(data_frame[col])
    label_encoders[col] = label_encoder
data_frame.head()

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Chest_Pain_Type,Thalassemia,Fasting_Blood_Sugar,ECG_Results,Exercise_Induced_Angina,Max_Heart_Rate_Achieved,Heart_Attack_Risk
0,69,0,1,0,2,34.61,1,0,152.1,171,85,0,2,2,2,0,1,0,114,Low
1,32,1,0,0,2,22.75,0,0,166.8,126,103,0,1,0,1,0,2,0,173,Moderate
2,89,1,0,1,2,35.32,0,0,272.3,123,127,0,1,3,2,0,2,0,109,Low
3,78,1,0,1,2,18.23,1,0,237.7,144,125,0,1,3,0,1,0,0,129,Low
4,38,0,1,0,2,19.82,0,0,207.7,123,107,0,0,0,2,0,2,0,124,Moderate


In [9]:
# Ordinal integer codes for the target classes.
risk_mapping = {'Low': 0, 'Moderate': 1, 'High': 2}

risk_labels = data_frame['Heart_Attack_Risk']
# Only encode when the column is not already made up of the integer codes, so
# re-running this cell does not turn the whole target into NaN.
if not risk_labels.isin(list(risk_mapping.values())).all():
    encoded_risk = risk_labels.map(risk_mapping)
    # Series.map returns NaN for labels missing from the mapping; fail loudly
    # instead of letting NaN targets flow into model training.
    unmapped = encoded_risk.isna()
    if unmapped.any():
        raise ValueError(
            f"Unexpected Heart_Attack_Risk labels: {risk_labels[unmapped].unique().tolist()}"
        )
    data_frame['Heart_Attack_Risk'] = encoded_risk.astype('int64')
data_frame.head()

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Chest_Pain_Type,Thalassemia,Fasting_Blood_Sugar,ECG_Results,Exercise_Induced_Angina,Max_Heart_Rate_Achieved,Heart_Attack_Risk
0,69,0,1,0,2,34.61,1,0,152.1,171,85,0,2,2,2,0,1,0,114,0
1,32,1,0,0,2,22.75,0,0,166.8,126,103,0,1,0,1,0,2,0,173,1
2,89,1,0,1,2,35.32,0,0,272.3,123,127,0,1,3,2,0,2,0,109,0
3,78,1,0,1,2,18.23,1,0,237.7,144,125,0,1,3,0,1,0,0,129,0
4,38,0,1,0,2,19.82,0,0,207.7,123,107,0,0,0,2,0,2,0,124,1


In [10]:
from sklearn.preprocessing import StandardScaler

# Continuous columns that get standardized (zero mean, unit variance).
numerical_columns = [
    'Age',
    'BMI',
    'Cholesterol_Level',
    'Resting_BP',
    'Heart_Rate',
    'Max_Heart_Rate_Achieved',
]

# Learn the per-column mean and standard deviation, then apply the scaling.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook, so test-set statistics leak into the
# features; fitting on the training rows only would avoid that.
scaler = StandardScaler().fit(data_frame[numerical_columns])
data_frame[numerical_columns] = scaler.transform(data_frame[numerical_columns])

In [11]:
# Inspect the first rows after encoding and scaling. A bare last expression uses
# the notebook's rich table rendering instead of the wrapped plain-text output
# produced by print().
data_frame.head()

        Age  Gender  Smoking  Alcohol_Consumption  Physical_Activity_Level  \
0  0.750106       0        1                    0                        2   
1 -1.028843       1        0                    0                        2   
2  1.711700       1        0                    1                        2   
3  1.182823       1        0                    1                        2   
4 -0.740365       0        1                    0                        2   

        BMI  Diabetes  Hypertension  Cholesterol_Level  Resting_BP  \
0  0.981390         1             0          -1.683786    1.411391   
1 -0.660109         0             0          -1.344427   -0.325544   
2  1.079658         0             0           1.091110   -0.441340   
3 -1.285706         1             0           0.292347    0.369230   
4 -1.065640         0             0          -0.400223   -0.441340   

   Heart_Rate  Family_History  Stress_Level  Chest_Pain_Type  Thalassemia  \
0   -0.472112               0    

In [12]:
# Persist the encoded and scaled table; the row index is not written to the file.
data_frame.to_csv(
    path_or_buf='postproces_heart_attack_risk_dataset_original.csv',
    index=False,
)

In [13]:
# Reload the preprocessed table written by the previous cell.
data_processed = pd.read_csv('postproces_heart_attack_risk_dataset_original.csv')

# Preview the first five rows of the reloaded data.
data_processed.head(n=5)

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Chest_Pain_Type,Thalassemia,Fasting_Blood_Sugar,ECG_Results,Exercise_Induced_Angina,Max_Heart_Rate_Achieved,Heart_Attack_Risk
0,0.750106,0,1,0,2,0.98139,1,0,-1.683786,1.411391,-0.472112,0,2,2,2,0,1,0,-1.224884,0
1,-1.028843,1,0,0,2,-0.660109,0,0,-1.344427,-0.325544,0.419555,0,1,0,1,0,2,0,0.822063,1
2,1.7117,1,0,1,2,1.079658,0,0,1.09111,-0.44134,1.608443,0,1,3,2,0,2,0,-1.398354,0
3,1.182823,1,0,1,2,-1.285706,1,0,0.292347,0.36923,1.509369,0,1,3,0,1,0,0,-0.704474,0
4,-0.740365,0,1,0,2,-1.06564,0,0,-0.400223,-0.44134,0.617703,0,0,0,2,0,2,0,-0.877944,1


## Model

In [14]:
# Feature matrix: every column except the target.
X = data_processed.drop(columns=['Heart_Attack_Risk'])
X.head(n=3)

Unnamed: 0,Age,Gender,Smoking,Alcohol_Consumption,Physical_Activity_Level,BMI,Diabetes,Hypertension,Cholesterol_Level,Resting_BP,Heart_Rate,Family_History,Stress_Level,Chest_Pain_Type,Thalassemia,Fasting_Blood_Sugar,ECG_Results,Exercise_Induced_Angina,Max_Heart_Rate_Achieved
0,0.750106,0,1,0,2,0.98139,1,0,-1.683786,1.411391,-0.472112,0,2,2,2,0,1,0,-1.224884
1,-1.028843,1,0,0,2,-0.660109,0,0,-1.344427,-0.325544,0.419555,0,1,0,1,0,2,0,0.822063
2,1.7117,1,0,1,2,1.079658,0,0,1.09111,-0.44134,1.608443,0,1,3,2,0,2,0,-1.398354


In [15]:
# Target vector: the integer-encoded risk class.
y = data_processed.loc[:, 'Heart_Attack_Risk']
y.head(n=3)

Unnamed: 0,Heart_Attack_Risk
0,0
1,1
2,0


## Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    train_test_split,
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Size of the full feature matrix as (rows, columns).
X.shape

(50000, 19)

In [17]:
# Candidate hyperparameter values for the random forest search.
param_grid = dict(
    n_estimators=[50, 100, 150, 200],       # number of trees in the forest
    max_depth=[10, 20, 30, 40, None],       # None places no limit on tree depth
    min_samples_split=[2, 5, 10],           # samples required to split a node
    min_samples_leaf=[1, 2, 4],             # samples required at a leaf
)

In [18]:
# Forest with library-default hyperparameters and a fixed seed.
rf_model = RandomForestClassifier(random_state=42)

# Settings for the randomized hyperparameter search.
search_options = dict(
    n_iter=20,            # evaluate 20 randomly drawn parameter combinations
    cv=3,                 # 3-fold cross-validation per combination
    scoring='accuracy',
    random_state=42,      # makes the drawn combinations repeatable
    n_jobs=-1,            # use all available CPU cores
)

# The search tunes its own, separately constructed forest over `param_grid`.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    **search_options,
)


In [19]:
# Size of the held-out test features as (rows, columns).
X_test.shape

(10000, 19)

In [20]:
# Fit the default-parameter forest on the training split.
# NOTE(review): none of the cells shown here evaluate this fitted model; the
# predictions further down come from the randomized search's best estimator.

rf_model.fit(X_train,y_train)

In [21]:
# Run the randomized search on the training split (fit returns the search object
# itself) and keep the estimator built from the best-scoring parameter combination.
best_model = random_search.fit(X_train, y_train).best_estimator_

In [22]:
# Predict the risk class for every held-out row with the tuned forest.
predictions = best_model.predict(X=X_test)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
# Share of test rows whose predicted risk class matches the true class.
model_score = accuracy_score(y_test, predictions)

# Accuracy reached by always predicting the most frequent class of the test
# split. Accuracy alone is hard to interpret; a model_score close to this
# baseline means the forest adds little over guessing the majority class.
baseline_score = y_test.value_counts(normalize=True).max()
print(f"majority-class baseline accuracy: {baseline_score:.4f}")

model_score

0.5082