In [17]:
import pandas as pd
import numpy as np
import torch
from torch import nn, Tensor, optim, cuda
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler

In [18]:
# Read the training CSV into a DataFrame and report its (rows, columns) shape.
train_set = pd.read_csv(
    "./data/train_data.csv",
)
print(train_set.shape)

(39457, 294)


In [19]:
# Pick the compute device: the Apple-Silicon GPU backend (MPS) when it is
# available, otherwise the CPU. `device` is assigned on both branches so the
# later cells that call `.to(device)` also run on machines without MPS.
if torch.backends.mps.is_available():
    device = 'mps'
    print(device)
else:
    device = 'cpu'
    print ("MPS device not found.")

# Kept under its original name for any cell that refers to it; on a machine
# without MPS it simply wraps the CPU device.
mps_device = torch.device(device)

# torch.manual_seed is documented to seed the generators on all devices, so no
# backend-specific seeding call (such as the CUDA one used before) is needed.
torch.manual_seed(777)

mps


<torch._C.Generator at 0x159de9c50>

In [20]:
# Remove uninformative columns: keep only those whose distinct-value count is not 1.
# NOTE: nunique() ignores NaN by default, so a column holding one value plus
# missing entries is dropped as well, while an all-NaN column (0 distinct
# values) is kept.
train_set_cleaned = data = train_set.loc[:, train_set.nunique().ne(1)]

# Check the result
print(train_set_cleaned.shape)

(39457, 204)


In [21]:
# Missing-value imputation from neighbouring rows
def fill_missing_values_vectorized(data):
    """Fill NaNs on even-``id`` rows from the rows directly above and below.

    For every float64/int64 column except ``id``, each NaN cell located on a
    row whose ``id`` is even is replaced by:

    * the mean of the previous and next row's values when both are present,
    * the single present neighbour when only one is,
    * nothing (it stays NaN) when neither neighbour has a value.

    NaNs on odd-``id`` rows are left untouched. Rows are visited top to bottom
    and written in place, so a value filled on one row can serve as the
    neighbour of a later row.

    Neighbours are located by row *position*, not by index label, so the
    function works for any index (for example after rows were filtered out).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric ``id`` column.

    Returns
    -------
    pandas.DataFrame
        A filled copy; the input frame is not modified.
    """
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
    columns_to_process = [col for col in numeric_columns if col != 'id']

    data = data.copy()  # never modify the caller's frame

    # Boolean mask by position: True where the row's id is even.
    even_id_rows = (data['id'] % 2 == 0).to_numpy()
    last_pos = len(data) - 1

    for col in columns_to_process:
        # Explicit copy: `.values` may hand back a read-only array.
        col_data = data[col].to_numpy(copy=True)

        # Positions to fill. Integer columns cannot hold NaN, so they yield none.
        targets = np.flatnonzero(even_id_rows & np.isnan(col_data))
        if targets.size == 0:
            continue

        for pos in targets:
            # A neighbour that falls off either end of the column counts as missing.
            prev_val = col_data[pos - 1] if pos > 0 else np.nan
            next_val = col_data[pos + 1] if pos < last_pos else np.nan
            has_prev = not np.isnan(prev_val)
            has_next = not np.isnan(next_val)

            if has_prev and has_next:
                col_data[pos] = (prev_val + next_val) / 2
            elif has_prev:
                col_data[pos] = prev_val
            elif has_next:
                col_data[pos] = next_val

        data[col] = col_data

    return data

In [22]:
# Impute missing values in the cleaned training frame
filled_trained_set_subset = fill_missing_values_vectorized(data=train_set_cleaned)

In [23]:
# Build the training tensors
data = filled_trained_set_subset  # assumed to be the already-imputed dataset

# Columns from position 3 onward are the features; the column at position 2 is
# the training target. `.copy()` gives x_data its own storage so the scaling
# below does not assign into a slice of `data`.
x_data = data.iloc[:, 3:].copy()
y_data = data.iloc[:, [2]]

# Select numeric columns only
numeric_columns = x_data.select_dtypes(include=[np.number]).columns

# Scale every numeric feature to [0, 1] using the training data's min/max.
scaler = MinMaxScaler()
x_data[numeric_columns] = scaler.fit_transform(x_data[numeric_columns])

# Any value still missing after imputation becomes 0 (the scaled minimum).
x_data = x_data.fillna(0)

# float32 tensors, moved to the selected device
x_train = torch.tensor(x_data[numeric_columns].to_numpy(), dtype=torch.float32).to(device)
y_train = torch.tensor(y_data.to_numpy(), dtype=torch.float32).to(device)
print(x_data.shape)
print(y_data.shape)

(39457, 201)
(39457, 1)


In [24]:
class BatteryOutputPredictionModel(nn.Module):
    """Fully connected regressor: in_features -> 150 -> 100 -> 50 -> 1.

    Each hidden linear layer is followed by batch normalisation and ReLU; the
    final linear layer emits one raw value per sample.

    Parameters
    ----------
    in_features : int, default 201
        Width of the input feature vector. The default matches the value that
        was previously hard-coded, so ``BatteryOutputPredictionModel()`` builds
        the same network as before.
    """

    def __init__(self, in_features=201):
        super().__init__()
        # Layers are created in the same order as before so that, for a given
        # seed, the random initial weights are unchanged.
        self.linear1 = nn.Linear(in_features, 150)
        self.batch_norm1 = nn.BatchNorm1d(150)
        self.linear2 = nn.Linear(150, 100)
        self.batch_norm2 = nn.BatchNorm1d(100)
        self.linear3 = nn.Linear(100, 50)
        self.batch_norm3 = nn.BatchNorm1d(50)
        self.linear4 = nn.Linear(50, 1)

        # Re-initialise the linear weights with Xavier uniform initialisation
        # (biases keep the nn.Linear default).
        for layer in (self.linear1, self.linear2, self.linear3, self.linear4):
            torch.nn.init.xavier_uniform_(layer.weight)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a (batch, in_features) tensor to a (batch, 1) prediction."""
        x = self.relu(self.batch_norm1(self.linear1(x)))
        x = self.relu(self.batch_norm2(self.linear2(x)))
        x = self.relu(self.batch_norm3(self.linear3(x)))
        # No batch norm or activation after the last linear layer
        return self.linear4(x)

In [25]:
# Network on the selected device, mean-squared-error objective, plain SGD.
model = BatteryOutputPredictionModel().to(device)
criterion = nn.MSELoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

In [26]:
# Mini-batch size
batch_size = 64

# Pair features with targets and serve them as reshuffled mini-batches
dataset = TensorDataset(x_train, y_train)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [27]:
def train(dataloader, model, optimizer, criterion, epochs=1000):
    """Train ``model`` in place and print the mean batch loss of every epoch.

    Parameters
    ----------
    dataloader : iterable of (X, Y) batches
        Yields input and target tensors.
    model : torch.nn.Module
        Network to optimise; it is switched to training mode.
    optimizer : torch.optim.Optimizer
        Optimizer bound to ``model``'s parameters.
    criterion : callable
        Loss function called as ``criterion(prediction, Y)``.
    epochs : int, default 1000
        Number of passes over ``dataloader``. The default keeps the previously
        hard-coded value.

    Returns
    -------
    None
    """
    total_batch = len(dataloader)

    # Move batches to wherever the model lives instead of relying on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.train()

    for epoch in range(epochs):
        avg_cost = 0.0

        for X, Y in dataloader:
            if target_device is not None:
                X = X.to(target_device)
                Y = Y.to(target_device)

            optimizer.zero_grad()
            prediction = model(X)
            loss = criterion(prediction, Y)
            loss.backward()
            optimizer.step()

            # .item() yields a plain float; accumulating the loss tensor itself
            # would keep every batch's autograd history alive for the epoch.
            avg_cost += loss.item() / total_batch

        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

In [28]:
# Run the training loop on the training loader
train(dataloader=dataloader, model=model, optimizer=optimizer, criterion=criterion)

Epoch: 0001 cost = 4.581457138
Epoch: 0002 cost = 4.478216648
Epoch: 0003 cost = 4.442858219
Epoch: 0004 cost = 4.419226646
Epoch: 0005 cost = 4.390472889
Epoch: 0006 cost = 4.370916843
Epoch: 0007 cost = 4.346486092
Epoch: 0008 cost = 4.313405514
Epoch: 0009 cost = 4.302952290
Epoch: 0010 cost = 4.256981373
Epoch: 0011 cost = 4.237605572
Epoch: 0012 cost = 4.202495098
Epoch: 0013 cost = 4.166481495
Epoch: 0014 cost = 4.155059814
Epoch: 0015 cost = 4.101372242
Epoch: 0016 cost = 4.080035210
Epoch: 0017 cost = 4.074145317
Epoch: 0018 cost = 4.022490978
Epoch: 0019 cost = 4.013061523
Epoch: 0020 cost = 3.987369776
Epoch: 0021 cost = 3.949738979
Epoch: 0022 cost = 3.937933445
Epoch: 0023 cost = 3.880517483
Epoch: 0024 cost = 3.874110937
Epoch: 0025 cost = 3.873162031
Epoch: 0026 cost = 3.856519699
Epoch: 0027 cost = 3.830402851
Epoch: 0028 cost = 3.829378366
Epoch: 0029 cost = 3.789845943
Epoch: 0030 cost = 3.798535109
Epoch: 0031 cost = 3.731928825
Epoch: 0032 cost = 3.700844765
Epoch: 0

In [29]:
# Read the test CSV into a DataFrame and report its (rows, columns) shape.
test_set = pd.read_csv(
    "./data/test_data.csv",
)
print(test_set.shape)

(9888, 293)


In [30]:
# Remove unneeded columns.
# Keep exactly the columns that survived training-set cleaning (in training
# order) instead of re-deriving the filter from test-set statistics: a column
# that is constant only in one of the two files would otherwise make the test
# features differ from the ones the model was trained on. Columns present only
# in the training file are skipped automatically.
test_set_cleaned = data = test_set.loc[
    :, [col for col in train_set_cleaned.columns if col in test_set.columns]
]

# Check the result
print(test_set_cleaned.shape)

(9888, 203)


In [31]:
# Impute missing values in the cleaned test frame, then confirm shape and type
filled_test_set_subset = fill_missing_values_vectorized(data=test_set_cleaned)
print(filled_test_set_subset.shape)
print(type(filled_test_set_subset))

(9888, 203)
<class 'pandas.core.frame.DataFrame'>


In [32]:
# Build the test feature tensor
data = filled_test_set_subset  # assumed to be the already-imputed dataset

# Reuse `numeric_columns` and the fitted `scaler` from the training feature
# cell: the model must see test features selected by the same names and scaled
# with the same min/max as during training. Fitting a fresh scaler on the test
# set would map the same raw value to a different input. A feature missing from
# the test frame raises a KeyError here rather than silently misaligning columns.
x_data = data.loc[:, numeric_columns].copy()
x_data[numeric_columns] = scaler.transform(x_data[numeric_columns])

# Any value still missing after imputation becomes 0, as in training.
x_data = x_data.fillna(0)

# float32 tensor, moved to the selected device
x_data = torch.tensor(x_data[numeric_columns].to_numpy(), dtype=torch.float32).to(device)
print(x_data.shape)

torch.Size([9888, 201])


In [33]:
# Mini-batch size
batch_size = 64

# Wrap the test features and iterate them in their original order,
# keeping the final partial batch
dataset = TensorDataset(x_data)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
)

In [37]:
def evaluate_and_save(dataloader, model, file_path, start_id=39457):
    """Predict every batch of ``dataloader`` and write an id/prediction CSV.

    Parameters
    ----------
    dataloader : iterable of 1-tuples ``(X,)``
        Yields input batches, in the order the rows should appear in the file.
    model : torch.nn.Module
        Trained network; it is switched to evaluation mode.
    file_path : str or path-like
        Destination CSV. Missing parent directories are created.
    start_id : int, default 39457
        Value of the ``id`` column for the first prediction; later rows count
        up by one. The default keeps the previously hard-coded value.

    Returns
    -------
    None
        The shape of the written table is printed.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    model.eval()  # evaluation mode (batch norm uses its running statistics)

    # Move batches to wherever the model lives instead of relying on a
    # notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    predictions = []
    with torch.no_grad():  # no gradients needed for inference
        for X, in dataloader:
            if target_device is not None:
                X = X.to(target_device)
            output = model(X)
            predictions.append(output.cpu().numpy())

    # Convert the predictions to a DataFrame (empty when there were no batches)
    if predictions:
        predicted_df = pd.DataFrame(
            np.concatenate(predictions, axis=0), columns=['battery_output']
        )
    else:
        predicted_df = pd.DataFrame(columns=['battery_output'])

    # Add the "id" column and make it the first column
    predicted_df["id"] = range(start_id, start_id + len(predicted_df))
    predicted_df = predicted_df[["id", "battery_output"]]
    print(predicted_df.shape)

    # Save the result as CSV, creating the target directory if necessary
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    predicted_df.to_csv(file_path, index=False)


In [38]:
# Run inference on the test loader and write the submission file
file_path = "./submission/Linear_Prediction.csv"
evaluate_and_save(dataloader=dataloader, model=model, file_path=file_path)

(9888, 2)
