In [1]:
# 모듈로딩

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# 데이터 로드 및 전처리
DATA_FILE = 'diabetes_binary_health_indicators_BRFSS2021.csv'
df = pd.read_csv(DATA_FILE)


In [3]:
# 'HighChol'을 타겟으로 설정하고 나머지를 특성으로 사용
X = df.drop(['Smoker'], axis=1)
#X= df[['Diabetes_binary','HighBP','HeartDiseaseorAttack','GenHlth','Age']]
y = df['Smoker']

In [4]:
# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
# 학습, 검증, 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [7]:
# 커스텀 데이터셋 클래스
class DiabetesDataset(Dataset):
    def __init__(self, features, targets):
        self.features = torch.FloatTensor(features)
        self.targets = torch.FloatTensor(targets.values).reshape(-1, 1)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

In [8]:
# 데이터셋 및 데이터로더 생성
train_dataset = DiabetesDataset(X_train, y_train)
val_dataset = DiabetesDataset(X_val, y_val)
test_dataset = DiabetesDataset(X_test, y_test)

BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [9]:
# 모델 정의
class DiabetesModel(nn.Module):
    def __init__(self, input_dim):
        super(DiabetesModel, self).__init__()
        self.layer1 = nn.Linear(input_dim, 64)
        self.layer2 = nn.Linear(64, 32)
        self.layer3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.sigmoid(self.layer3(x))
        return x

In [10]:
# 모델, 손실 함수, 옵티마이저 설정
input_dim = X_train.shape[1]
model = DiabetesModel(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [17]:
print(model)

DiabetesModel(
  (layer1): Linear(in_features=21, out_features=64, bias=True)
  (layer2): Linear(in_features=64, out_features=32, bias=True)
  (layer3): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)


In [13]:
#학습 함수
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for features, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * features.size(0)

        train_loss = train_loss / len(train_loader.dataset)

        # 검증
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for features, targets in val_loader:
                outputs = model(features)
                loss = criterion(outputs, targets)
                val_loss += loss.item() * features.size(0)

        val_loss = val_loss / len(val_loader.dataset)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')



In [14]:
# 모델 학습
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)


Epoch 1/10, Train Loss: 0.6119, Val Loss: 0.6189
Epoch 2/10, Train Loss: 0.6115, Val Loss: 0.6197
Epoch 3/10, Train Loss: 0.6110, Val Loss: 0.6205
Epoch 4/10, Train Loss: 0.6107, Val Loss: 0.6194
Epoch 5/10, Train Loss: 0.6103, Val Loss: 0.6202
Epoch 6/10, Train Loss: 0.6098, Val Loss: 0.6200
Epoch 7/10, Train Loss: 0.6095, Val Loss: 0.6210
Epoch 8/10, Train Loss: 0.6093, Val Loss: 0.6211
Epoch 9/10, Train Loss: 0.6088, Val Loss: 0.6228
Epoch 10/10, Train Loss: 0.6085, Val Loss: 0.6228


In [15]:
# 테스트 데이터로 모델 평가
model.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for features, targets in test_loader:
        outputs = model(features)
        loss = criterion(outputs, targets)
        test_loss += loss.item() * features.size(0)
        predicted = (outputs > 0.5).float()
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

test_loss = test_loss / len(test_loader.dataset)
accuracy = correct / total

print(f'Test Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}')

Test Loss: 0.6188, Accuracy: 0.6569


In [16]:
torch.save({
    'model_state_dict': model.state_dict(),
    'input_dim': input_dim,
    'scaler': scaler
}, 'model_best_dict.pt')