In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import bisect
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# Read the train and test tables; the SAMPLE_ID column is removed from both.
train = pd.read_csv('C:/Users/user/python_인공지능/Hyundai_AI_Challenge/train.csv')
train = train.drop(columns=['SAMPLE_ID'])
test = pd.read_csv('C:/Users/user/python_인공지능/Hyundai_AI_Challenge/test.csv')
test = test.drop(columns=['SAMPLE_ID'])

# Expand the ATA timestamp into separate calendar/time columns, then drop the
# raw ATA column from both frames.
ata_parts = ('year', 'month', 'day', 'hour', 'minute', 'weekday')
for frame in (train, test):
    ata = pd.to_datetime(frame['ATA'])
    for part in ata_parts:
        # Each name is an attribute of the Series .dt accessor.
        frame[part] = getattr(ata.dt, part)
    frame.drop(columns='ATA', inplace=True)

# Encode categorical columns as integer codes.
#
# The encoder for each column is fitted ONCE on the train labels plus the
# placeholder '-1' (used for test labels never seen in train), and that single
# fitted mapping is then applied to both frames. Fitting on train only and
# inserting '-1' into classes_ afterwards shifts the code of every class that
# sorts after '-1', so the same label would get different integers in train and
# test.
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}
UNKNOWN_LABEL = '-1'

for feature in tqdm(categorical_features, desc="Encoding features"):
    # Stringify both frames up front so the unseen-label check below compares
    # like with like (e.g. a missing value becomes 'nan' on both sides).
    train_values = train[feature].astype(str)
    test_values = test[feature].astype(str)

    le = LabelEncoder()
    le.fit(list(train_values.unique()) + [UNKNOWN_LABEL])
    known_labels = set(le.classes_)

    train[feature] = le.transform(train_values)
    # Labels absent from train are routed to the placeholder class.
    test_values = test_values.map(lambda s: s if s in known_labels else UNKNOWN_LABEL)
    test[feature] = le.transform(test_values)
    encoders[feature] = le

# Fill missing values in both frames with the training-set column means.
# train.mean() is evaluated again on each pass, exactly as in two separate calls.
for frame in (train, test):
    frame.fillna(train.mean(), inplace=True)

Encoding features: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.22it/s]


In [2]:
# Separate the target column (CI_HOUR) from the feature columns.
y = train['CI_HOUR']
X = train.drop(columns='CI_HOUR')

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,dataloader

In [6]:
# Convert the pandas splits to float32 torch tensors.
def _as_float_tensor(frame):
  """Return the values of a pandas object as a float32 torch tensor."""
  return torch.tensor(frame.values, dtype=torch.float32)

x_train, y_train = _as_float_tensor(x_train), _as_float_tensor(y_train)
x_test, y_test = _as_float_tensor(x_test), _as_float_tensor(y_test)

In [7]:
# Multi-layer perceptron
class MLP(nn.Module):
  """Fully connected regressor: Linear -> ReLU -> Linear -> ReLU -> Linear(1).

  Args:
    in_features: Number of input features. When omitted, falls back to
      ``len(X.columns)`` from the notebook's global feature frame, so
      ``MLP()`` keeps working as before.
    hidden1: Width of the first hidden layer (default 64).
    hidden2: Width of the second hidden layer (default 32).
  """

  def __init__(self, in_features=None, hidden1=64, hidden2=32):
    super().__init__()
    if in_features is None:
      # Backward-compatible default: size the input layer from the global X.
      in_features = len(X.columns)
    self.fc1 = nn.Linear(in_features=in_features, out_features=hidden1)
    self.fc2 = nn.Linear(in_features=hidden1, out_features=hidden2)
    self.fc3 = nn.Linear(in_features=hidden2, out_features=1)
    self.relu = nn.ReLU()

  def forward(self, x):
    """Map a batch of feature rows to one output value per row."""
    x = self.relu(self.fc1(x))
    x = self.relu(self.fc2(x))
    # No activation on the final layer: the output is unbounded.
    return self.fc3(x)
# Instantiate the network that the training and inference cells below use.
model = MLP()

In [8]:
# Display-only preview of the target as a single column; the result is not
# assigned, so y_train keeps its original shape.
y_train.reshape(-1,1)

tensor([[  6.6269],
        [  0.0000],
        [  0.0000],
        ...,
        [126.1011],
        [  3.3358],
        [ 15.0289]])

In [9]:
# Mini-batch loader over the training tensors. The target is reshaped to a
# single column so it has the same shape as the model's one-unit output.
train_dataset = torch.utils.data.TensorDataset(x_train, y_train.reshape(-1, 1))
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

opt = optim.Adam(model.parameters(), lr=1e-3)
# Build the loss once instead of re-creating it for every batch.
criterion = nn.MSELoss()

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.to(device)
model.train()
for epoch in range(10):
  # Named `progress` rather than `iter` so the builtin iter() is not shadowed.
  progress = tqdm(train_loader)
  for data, label in progress:
    opt.zero_grad()
    pred = model(data.to(device))
    loss = criterion(pred, label.to(device))
    loss.backward()
    opt.step()
    # The value shown is the loss of the current batch only, not an epoch average.
    progress.set_description(f"epoch{epoch} loss:{loss.item()}")
# Persist the learned weights so later cells can reload them.
torch.save(model.state_dict(), 'MLP.pth')

epoch0 loss:15292.8115234375: 100%|███████████████████████████████████████████████| 4593/4593 [00:14<00:00, 312.35it/s]
epoch1 loss:56782.00390625: 100%|█████████████████████████████████████████████████| 4593/4593 [00:15<00:00, 303.75it/s]
epoch2 loss:58582.3046875: 100%|██████████████████████████████████████████████████| 4593/4593 [00:15<00:00, 299.91it/s]
epoch3 loss:37901.3671875: 100%|██████████████████████████████████████████████████| 4593/4593 [00:15<00:00, 291.90it/s]
epoch4 loss:12015.0234375: 100%|██████████████████████████████████████████████████| 4593/4593 [00:15<00:00, 303.60it/s]
epoch5 loss:106523.0859375: 100%|█████████████████████████████████████████████████| 4593/4593 [00:14<00:00, 307.87it/s]
epoch6 loss:49907.38671875: 100%|█████████████████████████████████████████████████| 4593/4593 [00:14<00:00, 308.87it/s]
epoch7 loss:39937.828125: 100%|███████████████████████████████████████████████████| 4593/4593 [00:14<00:00, 312.70it/s]
epoch8 loss:5568.81591796875: 100%|█████

In [10]:
# Load the model weights from 'MLP.pth' (the file written by torch.save after
# training); map_location=device is passed so the tensors are loaded onto `device`.
model.load_state_dict(torch.load('MLP.pth', map_location=device))

<All keys matched successfully>

In [11]:
# Select the test columns by name, in the same order as the training features X,
# so each input position carries the same feature the model was trained on.
test_data = torch.tensor(test[X.columns].values, dtype=torch.float32)

In [12]:
# Inference only: evaluation mode, no gradient tracking.
# The model was moved to `device` during training, so the inputs must be moved
# there as well; otherwise the forward pass fails when CUDA is in use.
# Predictions are brought back to the CPU because later cells call pred.numpy().
model.eval()
with torch.no_grad():
  pred = model(test_data.to(device)).cpu()

In [13]:
# Sanity check: display the shape of the predictions as a NumPy array.
pred.numpy().shape

(244989, 1)

In [14]:
# Load the sample submission file, set its CI_HOUR column to the model
# predictions, and save the result without the index.
submit = pd.read_csv('C:/Users/user/python_인공지능/Hyundai_AI_Challenge/sample_submission.csv')
predictions = pred.numpy()
submit['CI_HOUR'] = predictions
submit.to_csv(
    'C:/Users/user/python_인공지능/Hyundai_AI_Challenge/baseline_submit.csv',
    index=False,
)