In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hearth-disease-recognition/sample_submission.csv
/kaggle/input/hearth-disease-recognition/train.csv
/kaggle/input/hearth-disease-recognition/test.csv


In [2]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the labelled training data from the competition's input directory
train_path = '/kaggle/input/hearth-disease-recognition/train.csv'
train_df = pd.read_csv(train_path)

In [4]:
# Count missing values per column in the raw training frame
missing_per_column = train_df.isna().sum()
print(missing_per_column)

ID                                            0
History of HeartDisease or Attack          1694
High Blood Pressure                           0
Told High Cholesterol                     32186
Cholesterol Checked                           0
Body Mass Index                           11782
Smoked 100+ Cigarettes                        1
Diagnosed Stroke                              0
Diagnosed Diabetes                            3
Leisure Physical Activity                     0
Heavy Alcohol Consumption                     0
Health Care Coverage                          0
Doctor Visit Cost Barrier                     1
General Health                                1
Difficulty Walking                            3
Sex                                           0
Education Level                               0
Income Level                                  0
Age                                           0
Vegetable or Fruit Intake (1+ per Day)        0
dtype: int64


In [5]:
# Keep only rows that have a value in every column.
# NOTE(review): because every row with a gap is removed here, the imputation and
# label-dropping steps further down find no missing values left to handle.
train_df = train_df.dropna(axis=0, how='any')

In [6]:
# Re-check the per-column missing-value counts after dropping incomplete rows
missing_per_column = train_df.isna().sum()
print(missing_per_column)

ID                                        0
History of HeartDisease or Attack         0
High Blood Pressure                       0
Told High Cholesterol                     0
Cholesterol Checked                       0
Body Mass Index                           0
Smoked 100+ Cigarettes                    0
Diagnosed Stroke                          0
Diagnosed Diabetes                        0
Leisure Physical Activity                 0
Heavy Alcohol Consumption                 0
Health Care Coverage                      0
Doctor Visit Cost Barrier                 0
General Health                            0
Difficulty Walking                        0
Sex                                       0
Education Level                           0
Income Level                              0
Age                                       0
Vegetable or Fruit Intake (1+ per Day)    0
dtype: int64


In [7]:
# Fill gaps in 'Told High Cholesterol' in two passes
chol_col = 'Told High Cholesterol'
chol_missing = train_df[chol_col].isnull()
chol_checked = train_df['Cholesterol Checked']

# Pass 1: cholesterol was never checked -> record 'No'
train_df.loc[(chol_checked == 'No') & chol_missing, chol_col] = 'No'

# Pass 2: cholesterol was checked but the answer is missing -> use the most common answer
# (the mode is taken after pass 1, so the pass-1 fills are counted)
most_common_answer = train_df[chol_col].mode()[0]
train_df.loc[(chol_checked == 'Yes') & chol_missing, chol_col] = most_common_answer

In [8]:
cols_to_impute = ['Body Mass Index', 'Smoked 100+ Cigarettes', 'Diagnosed Diabetes', 'Doctor Visit Cost Barrier', 'General Health', 'Difficulty Walking']

# Fill gaps with a deterministic, representative value per column: the median for
# numeric columns and the most frequent value otherwise. Filling with a single randomly
# sampled cell would change from run to run and could itself pick NaN, leaving the gaps
# in place.
for col in cols_to_impute:
    column = train_df[col]
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        most_frequent = column.mode(dropna=True)
        if most_frequent.empty:
            # The column has no observed values at all, so there is nothing to fill with
            continue
        fill_value = most_frequent.iloc[0]
    train_df[col] = column.fillna(fill_value)

In [9]:
# Rows without a target label cannot be used for supervised training, so remove them
target_column = 'History of HeartDisease or Attack'
train_df = train_df.dropna(subset=[target_column])

In [10]:
# Final check: per-column missing-value counts after cleaning
missing_per_column = train_df.isna().sum()
print(missing_per_column)

ID                                        0
History of HeartDisease or Attack         0
High Blood Pressure                       0
Told High Cholesterol                     0
Cholesterol Checked                       0
Body Mass Index                           0
Smoked 100+ Cigarettes                    0
Diagnosed Stroke                          0
Diagnosed Diabetes                        0
Leisure Physical Activity                 0
Heavy Alcohol Consumption                 0
Health Care Coverage                      0
Doctor Visit Cost Barrier                 0
General Health                            0
Difficulty Walking                        0
Sex                                       0
Education Level                           0
Income Level                              0
Age                                       0
Vegetable or Fruit Intake (1+ per Day)    0
dtype: int64


In [11]:
# Display the cleaned training frame (pandas renders a truncated head/tail preview)
train_df

Unnamed: 0,ID,History of HeartDisease or Attack,High Blood Pressure,Told High Cholesterol,Cholesterol Checked,Body Mass Index,Smoked 100+ Cigarettes,Diagnosed Stroke,Diagnosed Diabetes,Leisure Physical Activity,Heavy Alcohol Consumption,Health Care Coverage,Doctor Visit Cost Barrier,General Health,Difficulty Walking,Sex,Education Level,Income Level,Age,Vegetable or Fruit Intake (1+ per Day)
0,train_000001,No,Yes,Yes,Yes,40.68,Yes,No,No,No,No,Yes,No,Very Poor,Yes,Female,High school graduate,"$15,000 to less than $20,000",64,Yes
1,train_000002,No,No,No,No,24.36,Yes,No,No,Yes,No,No,Yes,Fair,No,Female,College graduate,"Less than $10,000",50,No
2,train_000003,No,Yes,Yes,Yes,27.33,No,No,No,No,No,Yes,Yes,Very Poor,Yes,Female,High school graduate,"$75,000 or more",61,Yes
3,train_000004,No,Yes,No,Yes,27.01,No,No,No,Yes,No,Yes,No,Good,No,Female,Some high school,"$35,000 to less than $50,000",74,Yes
5,train_000006,No,Yes,Yes,Yes,25.11,Yes,No,No,Yes,No,Yes,No,Good,No,Male,College graduate,"$75,000 or more",67,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223079,train_223080,No,No,No,Yes,28.20,No,No,No,No,No,Yes,No,Excellent,No,Female,College graduate,"$25,000 to less than $35,000",34,Yes
223080,train_223081,No,Yes,Yes,Yes,45.01,No,No,No,No,No,Yes,No,Fair,No,Male,College graduate,"$50,000 to less than $75,000",43,Yes
223081,train_223082,No,Yes,Yes,Yes,18.94,No,No,Yes,No,No,Yes,No,Poor,Yes,Female,Elementary,"$20,000 to less than $25,000",72,No
223082,train_223083,No,No,No,Yes,29.29,No,No,No,Yes,No,Yes,No,Excellent,No,Female,Some college or technical school,"($10,000 to less than $15,000",28,Yes


In [12]:
# Columns holding text categories that are converted to integer codes
categorical_columns = [
    'High Blood Pressure',
    'Told High Cholesterol',
    'Cholesterol Checked',
    'Smoked 100+ Cigarettes',
    'Diagnosed Stroke',
    'Diagnosed Diabetes',
    'Leisure Physical Activity',
    'Heavy Alcohol Consumption',
    'Health Care Coverage',
    'Doctor Visit Cost Barrier',
    'General Health',
    'Difficulty Walking',
    'Sex',
    'Education Level',
    'Income Level',
    'Vegetable or Fruit Intake (1+ per Day)',
]

# A single encoder object is re-fitted on each column in turn, so after the loop
# `le` only remembers the categories of the last column.
le = LabelEncoder()
for col in categorical_columns:
    encoded = le.fit_transform(train_df[col])
    train_df[col] = encoded

In [13]:
# Rescale Body Mass Index to the [0, 1] range using the min/max seen in the training frame
scaler = MinMaxScaler()
bmi_column = ['Body Mass Index']
train_df[bmi_column] = scaler.fit_transform(train_df[bmi_column])

In [14]:
# Separate the feature matrix from the binary target (No -> 0, Yes -> 1)
target_name = 'History of HeartDisease or Attack'
X = train_df.drop(columns=['ID', target_name])
y = train_df[target_name].map({'No': 0, 'Yes': 1})

In [15]:
# Hold out 20% of the rows for validation, stratified on the label
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Oversample the minority class of the training split with SMOTE
# (sampling_strategy=0.7 is the requested minority/majority ratio after resampling)
smote = SMOTE(random_state=42, sampling_strategy=0.7)
resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled

In [17]:
# Inspect the training-split features (this is X_train, not the SMOTE-resampled X_train_smote)
X_train

Unnamed: 0,High Blood Pressure,Told High Cholesterol,Cholesterol Checked,Body Mass Index,Smoked 100+ Cigarettes,Diagnosed Stroke,Diagnosed Diabetes,Leisure Physical Activity,Heavy Alcohol Consumption,Health Care Coverage,Doctor Visit Cost Barrier,General Health,Difficulty Walking,Sex,Education Level,Income Level,Age,Vegetable or Fruit Intake (1+ per Day)
222711,1,1,1,0.103492,1,0,0,1,0,1,1,3,1,0,1,0,47,1
27470,1,0,1,0.270979,1,1,1,1,0,1,0,1,1,0,2,0,90,1
90488,0,1,1,0.179279,0,0,0,1,0,1,0,0,0,0,2,5,51,1
184804,0,1,1,0.067544,0,0,0,1,0,1,0,2,0,0,0,5,48,1
71278,1,1,1,0.168861,0,0,0,1,0,1,0,1,0,1,2,5,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190460,0,1,1,0.089067,0,0,0,1,0,1,0,2,0,0,0,5,60,0
62446,1,0,1,0.340813,1,0,0,1,0,1,0,1,0,0,4,2,69,1
159429,0,0,1,0.212364,1,0,0,1,0,0,0,0,0,0,4,5,44,0
124480,0,1,1,0.207899,0,0,0,1,0,1,0,2,0,1,4,5,41,1


In [18]:
# Load the unlabelled test data that predictions are submitted for
test_path = '/kaggle/input/hearth-disease-recognition/test.csv'
test_df = pd.read_csv(test_path)

In [19]:
# Text-category columns of the test frame that are converted to integer codes
categorical_columns_test = [
    'High Blood Pressure',
    'Told High Cholesterol',
    'Cholesterol Checked',
    'Smoked 100+ Cigarettes',
    'Diagnosed Stroke',
    'Diagnosed Diabetes',
    'Leisure Physical Activity',
    'Heavy Alcohol Consumption',
    'Health Care Coverage',
    'Doctor Visit Cost Barrier',
    'General Health',
    'Difficulty Walking',
    'Sex',
    'Education Level',
    'Income Level',
    'Vegetable or Fruit Intake (1+ per Day)',
]

# NOTE(review): a fresh encoder is fitted on the test values here, so the integer codes
# line up with the training codes only if both frames contain exactly the same set of
# categories in each column -- confirm, or reuse encoders fitted on the training data.
le = LabelEncoder()
for col in categorical_columns_test:
    encoded = le.fit_transform(test_df[col])
    test_df[col] = encoded

In [20]:
# Scale test BMI with the scaler that was already fitted on the training data.
# Fitting a new MinMaxScaler on the test frame would map the same BMI to a different
# value than the model saw during training whenever the two frames have a different
# min or max. Values outside [0, 1] are possible if test BMI lies outside the training range.
test_df[['Body Mass Index']] = scaler.transform(test_df[['Body Mass Index']])

In [21]:
# Feature matrix for the test rows: everything except the row identifier
X_test = test_df.drop(columns=['ID'])

In [22]:
# Inspect the encoded and scaled test features
X_test

Unnamed: 0,High Blood Pressure,Told High Cholesterol,Cholesterol Checked,Body Mass Index,Smoked 100+ Cigarettes,Diagnosed Stroke,Diagnosed Diabetes,Leisure Physical Activity,Heavy Alcohol Consumption,Health Care Coverage,Doctor Visit Cost Barrier,General Health,Difficulty Walking,Sex,Education Level,Income Level,Age,Vegetable or Fruit Intake (1+ per Day)
0,1,1,1,0.156213,0,0,0,1,0,1,0,2,0,0,4,1,71,1
1,1,0,1,0.205827,1,0,0,0,0,1,0,1,0,0,0,4,61,0
2,1,1,1,0.277791,1,0,0,0,0,1,0,1,1,0,4,7,67,1
3,0,0,1,0.155511,1,0,0,0,0,1,0,1,0,0,4,4,50,1
4,0,0,1,0.188158,1,0,0,0,0,1,0,1,0,1,4,2,40,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74356,1,0,1,0.132928,0,0,0,0,0,1,0,1,0,0,4,6,47,1
74357,1,1,1,0.196349,1,0,0,0,0,1,0,1,0,0,2,4,71,1
74358,1,1,1,0.168383,0,0,1,0,0,1,0,4,1,0,0,1,90,1
74359,1,1,1,0.143342,0,1,0,0,0,1,1,3,0,1,5,6,59,0


In [23]:
# Hyper-parameter values to try for the LightGBM classifier (4 options per setting)
param_grid = dict(
    num_leaves=[20, 31, 50, 100],
    max_depth=[-1, 5, 10, 15],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    n_estimators=[100, 300, 500, 1000],
)

In [24]:
# Base LightGBM binary classifier used as the estimator for the grid search
lgbm_settings = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
}
model = lgb.LGBMClassifier(**lgbm_settings)

In [25]:
# F-beta scorer with beta=2, which weights recall more heavily than precision
f2_scorer = make_scorer(
    fbeta_score,
    beta=2,
)

In [None]:
# Exhaustive 5-fold search over param_grid, ranked by the F2 scorer.
# NOTE(review): the folds are cut from data that was already SMOTE-resampled, so
# synthetic rows can sit in the validation folds -- confirm this is intended.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=f2_scorer,
    cv=5,
)
grid_search.fit(X_train_smote, y_train_smote)

[LightGBM] [Info] Number of positive: 72746, number of negative: 103924
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015634 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 384
[LightGBM] [Info] Number of data points in the train set: 176670, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411762 -> initscore=-0.356686
[LightGBM] [Info] Start training from score -0.356686
[LightGBM] [Info] Number of positive: 72746, number of negative: 103924
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 384
[LightGBM] [Info] Number of data points in the train set: 176670, number of used features: 18
[LightGBM] [Info

In [None]:
# Train the final model with the hyper-parameters selected by the grid search.
# Building LGBMClassifier() without arguments would discard best_params and train
# with library defaults, so the winning settings are passed in explicitly, together
# with the same fixed settings the searched estimator used.
best_params = grid_search.best_params_
model = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    **best_params,
)
model.fit(X_train_smote, y_train_smote)

In [None]:
# Predict the label for every test row and store it as 'No' / 'Yes' text
prediction_column = 'History of HeartDisease or Attack'
y_pred = model.predict(X_test)
test_df[prediction_column] = y_pred
test_df[prediction_column] = test_df[prediction_column].map({0: 'No', 1: 'Yes'})

In [None]:
# Keep only the identifier and the predicted label, then write the submission file
submission_columns = ['ID', 'History of HeartDisease or Attack']
submission_df = test_df[submission_columns]
submission_df.to_csv('submissionnn.csv', index=False)
# Short marker printed once the file has been written
print("เสด")

In [None]:
# Read the written submission back and show how many rows got each predicted label
submit = pd.read_csv("/kaggle/working/submissionnn.csv")
label_counts = submit["History of HeartDisease or Attack"].value_counts()
print(label_counts)

In [None]:
# Evaluate on the held-out validation split.
# y_pred holds predictions for X_test (different rows, no labels), so it cannot be
# compared with y_val; predictions for the validation features are computed here.
y_val_pred = model.predict(X_val)
cm = confusion_matrix(y_val, y_val_pred)

# Show the confusion matrix as an annotated heatmap
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No Disease", "Heart Disease"], yticklabels=["No Disease", "Heart Disease"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Additional results: per-class precision, recall and F1
print("🔹 Classification Report:")
print(classification_report(y_val, y_val_pred))