In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2022-ai-midterm-p3/train.csv
/kaggle/input/2022-ai-midterm-p3/test.csv
/kaggle/input/2022-ai-midterm-p3/submit_sample.csv


In [2]:
# Fix the random seeds of every library in use so that runs are reproducible.
import random
import torch

seed = 1
# Seed Python's RNG, NumPy's global RNG and PyTorch's CPU RNG with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Load the data: the sample submission file, the training set and the test set.
sample = pd.read_csv('/kaggle/input/2022-ai-midterm-p3/submit_sample.csv')
train = pd.read_csv('/kaggle/input/2022-ai-midterm-p3/train.csv')
test = pd.read_csv('/kaggle/input/2022-ai-midterm-p3/test.csv')

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
# Inspect the data: column names, non-null counts and dtypes of both frames.
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   index     242 non-null    int64  
 1   age       242 non-null    int64  
 2   sex       242 non-null    int64  
 3   cp        242 non-null    int64  
 4   trestbps  242 non-null    int64  
 5   chol      242 non-null    int64  
 6   fbs       242 non-null    int64  
 7   restecg   242 non-null    int64  
 8   thalach   242 non-null    int64  
 9   exang     242 non-null    int64  
 10  oldpeak   242 non-null    float64
 11  slope     242 non-null    int64  
 12  ca        242 non-null    object 
 13  thal      242 non-null    object 
 14  target    242 non-null    int64  
dtypes: float64(1), int64(12), object(2)
memory usage: 28.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    -----------

In [6]:
# Check for missing values: show the distinct raw values of the 'ca' and 'thal'
# columns in both frames (train first, then test, for each column).
for column in ('ca', 'thal'):
    for frame in (train, test):
        print(frame[column].unique())

['0' '3' '2' '1' '?']
['0' '2' '1' '3' '?']
['6' '3' '7' '?']
['3' '7' '6' '?']


In [7]:
# Handle missing values: the '?' marker is replaced by -1 and the column is cast to int.
MISSING_MARKER = '?'
MISSING_VALUE = -1


def fill_missing_as_int(column):
    """Return ``column`` with every '?' entry replaced by -1, cast to int.

    Args:
        column: pandas Series whose entries are integer-like strings (or
            integers) plus the '?' missing-value marker.

    Returns:
        A new integer Series; ``column`` itself is not modified.

    Raises:
        ValueError: if an entry other than '?' cannot be parsed as an integer.
    """
    # Normalise to strings first so the replacement is a plain string-for-string
    # swap, and so the function stays a no-op on an already converted int column
    # (which keeps the cell safe to re-run).
    as_text = column.astype(str)
    return as_text.replace(MISSING_MARKER, str(MISSING_VALUE)).astype(int)


# Apply the same conversion to both affected columns in both frames.
for frame in (train, test):
    for name in ('ca', 'thal'):
        frame[name] = fill_missing_as_int(frame[name])

In [8]:
# Re-check the test frame's dtypes after the '?' replacement
# ('ca' and 'thal' should now be integer columns).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   index     61 non-null     int64  
 1   age       61 non-null     int64  
 2   sex       61 non-null     int64  
 3   cp        61 non-null     int64  
 4   trestbps  61 non-null     int64  
 5   chol      61 non-null     int64  
 6   fbs       61 non-null     int64  
 7   restecg   61 non-null     int64  
 8   thalach   61 non-null     int64  
 9   exang     61 non-null     int64  
 10  oldpeak   61 non-null     float64
 11  slope     61 non-null     int64  
 12  ca        61 non-null     int64  
 13  thal      61 non-null     int64  
dtypes: float64(1), int64(13)
memory usage: 6.8 KB


In [9]:
# Preprocessing: separate the label from the features and drop the 'index'
# column from both feature frames.
ytrain = train['target']
xtrain = train.drop(columns=['index', 'target'])
xtest = test.drop(columns=['index'])

In [10]:
# List the distinct label values present in the training target.
ytrain.unique()

array([0, 2, 1, 3, 4])

In [11]:
# The submission only asks whether heart disease is present, so preprocess the
# training labels the same way: every label >= 1 becomes 1, leaving only 0 and 1.
# `mask` returns a new Series, so the 'target' column of `train` is left untouched.
ytrain = ytrain.mask(ytrain >= 1, 1)

In [12]:
def _as_float_tensor(values):
    """Copy array-like ``values`` into a float32 tensor placed on ``device``."""
    return torch.tensor(np.array(values), dtype=torch.float32).to(device)


# Convert the features and labels to float32 tensors on the selected device.
X_train = _as_float_tensor(xtrain)
y_train = _as_float_tensor(ytrain)
X_test = _as_float_tensor(xtest)

In [13]:
# Check the values needed to build the model: feature tensor shape,
# label tensor shape, and the number of distinct label values.
print(X_train.shape)
print(y_train.shape)
print(len(y_train.unique()))

torch.Size([242, 13])
torch.Size([242])
2


In [14]:
#모델 빌드
import torch.nn as nn

class DNN(nn.Module):
    """Fully connected binary classifier with three hidden layers.

    Each hidden layer is ``Linear -> ReLU -> Dropout``; the output layer is a
    single-unit ``Linear`` followed by a sigmoid, so ``forward`` returns values
    in [0, 1] suitable for a probability-based loss such as ``nn.BCELoss``.

    Args:
        in_features: number of input features per sample (default 13).
        hidden_sizes: widths of the three hidden layers (default ``(64, 64, 32)``).
        dropout: dropout probability applied after each hidden activation
            (default 0.2).

    Raises:
        ValueError: if ``hidden_sizes`` does not contain exactly three widths.

    With the default arguments the network is identical to the original
    hard-coded 13-64-64-32-1 architecture, and the layers are created and
    initialised in the same order, so seeded runs produce the same weights.
    """

    def __init__(self, in_features: int = 13,
                 hidden_sizes: tuple = (64, 64, 32),
                 dropout: float = 0.2):
        super().__init__()
        if len(hidden_sizes) != 3:
            raise ValueError(
                f"hidden_sizes must contain exactly 3 widths, got {len(hidden_sizes)}"
            )
        h1, h2, h3 = hidden_sizes

        # Attribute names linear1..linear4 are kept so existing state_dict keys
        # and any external references to the layers keep working.
        self.linear1 = nn.Linear(in_features, h1, bias=True)
        self.linear2 = nn.Linear(h1, h2, bias=True)
        self.linear3 = nn.Linear(h2, h3, bias=True)
        self.linear4 = nn.Linear(h3, 1, bias=True)

        self.active = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout)

        # Xavier-normal initialisation of every weight matrix (biases keep the
        # nn.Linear default initialisation).
        for layer in (self.linear1, self.linear2, self.linear3, self.linear4):
            nn.init.xavier_normal_(layer.weight)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` probabilities."""
        for layer in (self.linear1, self.linear2, self.linear3):
            x = self.dropout(self.active(layer(x)))
        return self.sigmoid(self.linear4(x))

In [15]:
# Define the model: instantiate the network and move it to the selected device.
model = DNN().to(device)

In [16]:
import torch.optim as optim

# Define the optimizer: Adam over all model parameters with learning rate 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Define the loss: binary cross-entropy, applied to the model's sigmoid outputs.
loss = nn.BCELoss()

In [17]:
# Train the model with full-batch gradient steps on the whole training set.
epochs = 10000

# Explicitly switch to training mode (dropout active). Without this, re-running
# the cell after the prediction cell has called model.eval() would silently
# train with dropout disabled.
model.train()

# The model outputs shape (N, 1), so the (N,) labels get a trailing axis.
# This is loop-invariant, so it is computed once.
targets = y_train.unsqueeze(1)

for epoch in range(epochs + 1):  # +1 so the final epoch is logged as well

    optimizer.zero_grad()

    hypothesis = model(X_train)

    cost = loss(hypothesis, targets)

    cost.backward()

    optimizer.step()

    # Log the training loss every 1000 epochs.
    if epoch % 1000 == 0:
        print(epoch, cost.item())

0 17.76585578918457
1000 0.447252094745636
2000 0.24425284564495087
3000 0.15992723405361176
4000 0.07091192156076431
5000 0.06518223136663437
6000 0.07438194006681442
7000 0.054621435701847076
8000 0.03916163742542267
9000 0.06671217828989029
10000 0.04392324760556221


In [18]:
# Predict on the test data with the trained model.
model.eval()  # evaluation mode: dropout is disabled
with torch.no_grad():
    hypothesis = model(X_test)
    # A sample is labelled 1 when its predicted probability is at least 0.45.
    threshold = torch.tensor([0.45], dtype=torch.float32, device=device)
    predict = (hypothesis >= threshold).to(torch.uint8)

In [19]:
# Convert the predictions to a CSV submission file.
# `predict` has shape (N, 1); flatten it to 1-D so the column assignment does
# not depend on pandas implicitly accepting a single-column 2-D array.
sample['target'] = predict.detach().cpu().numpy().reshape(-1)
sample.to_csv('submit.csv',index=False)