In [1953]:
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [1954]:
# gpu
if torch.backends.mps.is_available():
    device = 'mps'
    mps_device = torch.device(device)
    torch.cuda.manual_seed_all(777)
    print(device)
else:
    print ("MPS device not found.")

device = 'cpu'

mps


In [1955]:
# Random seed to make results deterministic and reproducible
torch.manual_seed(0)

<torch._C.Generator at 0x10fe08af0>

In [1956]:
# hyper parameters

seq_length = 10
data_dim = 200 # Exclude label
hidden_dim = 30
output_dim = 1
learning_rate = 0.015
iterations = 20

In [1957]:
# np.min(data, 0)  ->  열(column)별 최소값
# np.max(data, 0)  ->  열(column)별 최대값
# scaling function for input data
def minmax_scaler(data):
    # numerator: 원본 데이터 - 최소값 = 데이터를 0 기준으로 이동
    numerator = data - np.min(data, axis=0)
    # denominator: 최대값과 최소값의 차: 데이터의 범위
    denominator = np.max(data, axis=0) - np.min(data, axis=0)

    # 데이터의 값을 0과 1 사이로 스케일링
    # 1e-7: ZeroDivisionError 방지
    return numerator / (denominator + 1e-7), np.min(data, axis=0), np.max(data, axis=0)

In [1958]:
# inverse scaling function
def inverse_minmax_scaler(data, min_val, max_val):
    return data * (max_val - min_val + 1e-7) + min_val

In [1959]:
# make dataset to input
# 시계열 데이터를 입력으로 받아 학습에 사용할 데이터셋을 생성
def build_dataset(time_series, seq_length):

    dataX = []
    dataY = []

    # 반복문 0~(데이터 길이 - 시퀀스 길이)
    for i in range(0, len(time_series) - seq_length):

        # row: i부터 시퀀스 길이(i+시퀀스 길이 직전) | column: 전부(label 행 포함)
        #_x = time_series[i:i + seq_length, :] # Include the close price column
        _x = time_series[i:i + seq_length, :-1] # Exclude the close price column

        # row: i+시퀀스 길이 열(1개 열) | column: 마지막 행 (label, 종가)
        _y = time_series[i + seq_length, [-1]]  # Next close price

        # print(_x, "->", _y) # 시각화

        dataX.append(_x) # input data set에 추가
        dataY.append(_y) # label set에 추가

    # numpy 배열로 리턴
    return np.array(dataX), np.array(dataY)

In [1960]:
# make dataset to input
# 시계열 데이터를 입력으로 받아 학습에 사용할 데이터셋을 생성
def build_dataset_test(time_series, seq_length):

    dataX = []

    # 반복문 0~(데이터 길이 - 시퀀스 길이)
    for i in range(0, len(time_series) - seq_length):

        # row: i부터 시퀀스 길이(i+시퀀스 길이 직전) | column: 전부(label 행 포함)
        #_x = time_series[i:i + seq_length, :] # Include the close price column
        _x = time_series[i:i + seq_length, :] # Exclude the close price column

        # print(_x, "->  ?") # 시각화

        dataX.append(_x) # input data set에 추가

    # numpy 배열로 리턴
    return np.array(dataX)

In [1961]:
# 결측값 채우기 함수 (벡터화된 방식)
def fill_missing_values_vectorized(data):
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
    columns_to_process = [col for col in numeric_columns if col not in ['id']]
    
    data = data.copy()  # 원본 데이터 수정 방지

    even_id_indices = data.index[data['id'] % 2 == 0]
    
    for col in columns_to_process:
        col_data = data[col].values
        for i in even_id_indices:
            if np.isnan(col_data[i]):
                if i == 0:
                    col_data[i] = col_data[i + 1]
                elif i == len(col_data) - 1:
                    col_data[i] = col_data[i - 1]
                else:
                    prev_val = col_data[i - 1]
                    next_val = col_data[i + 1]
                    if not np.isnan(prev_val) and not np.isnan(next_val):
                        col_data[i] = (prev_val + next_val) / 2
                    elif not np.isnan(prev_val):
                        col_data[i] = prev_val
                    elif not np.isnan(next_val):
                        col_data[i] = next_val
        data[col] = col_data
    
    return data

In [1962]:
# load train data
train_set = pd.read_csv("./data/train_data.csv")
print(train_set.shape)
# print(train_set.head())

# load test data
test_set = pd.read_csv("./data/test_data.csv")
print(test_set.shape)
# print(test_set.head())

(39457, 294)
(9888, 293)


In [1963]:
# 불필요한 열 제거
# column에 모든 value가 같으면 해당 column 제거

train_set = train_set.loc[:, train_set.nunique() != 1]
print(train_set.shape)

test_set = test_set.loc[:, test_set.nunique() != 1]
print(test_set.shape)

(39457, 204)
(9888, 203)


In [1964]:
# 비어있는 셀에 같은 column 앞,뒤 셀의 평균을 채워넣기

filled_train_set = fill_missing_values_vectorized(train_set)
print(filled_train_set.shape)
# print(type(filled_train_set))

filled_test_set = fill_missing_values_vectorized(test_set)
print(filled_test_set.shape)
# print(type(filled_test_set))

(39457, 204)
(9888, 203)


In [1965]:
# id, time column 버리기

PP_train_data = filled_train_set.iloc[:,2:]
print(PP_train_data.shape)
# print(type(filled_train_set))

PP_test_data = filled_test_set.iloc[:,2:]
print(PP_test_data.shape)
# print(type(PP_test_data))

(39457, 202)
(9888, 201)


In [1966]:
# PP_train_data 만 !!!
# 첫 번째 열(battery_output)을 마지막(y_data 위치)으로 이동
cols = PP_train_data.columns.tolist()
cols.append(cols.pop(0))
PP_train_data = PP_train_data[cols]
print(PP_train_data.shape)

(39457, 202)


In [1967]:
# J번째 column 버리기 (값이 유실된 cell 다수)
PP_train_data = PP_train_data.drop(PP_train_data.columns[9], axis=1)
PP_test_data = PP_test_data.drop(PP_test_data.columns[9], axis=1)

# 아직도 빈 곳을 column의 평균으로 채우기
PP_train_data = PP_train_data.apply(lambda col: col.fillna(col.mean()), axis=0)
PP_test_data = PP_test_data.apply(lambda col: col.fillna(col.mean()), axis=0)

# column name 담긴 row 앞에 '#' 추가
PP_train_data = PP_train_data.rename(columns={'East_Midlands_price': '#East_Midlands_price'})
PP_test_data = PP_test_data.rename(columns={'East_Midlands_price': '#East_Midlands_price'})

print(PP_train_data.shape)
print(type(PP_train_data))
print(PP_test_data.shape)
print(type(PP_test_data))

(39457, 201)
<class 'pandas.core.frame.DataFrame'>
(9888, 200)
<class 'pandas.core.frame.DataFrame'>


In [1968]:
# train_data의 마지막 seq_length 만큼의 행을 선택
last_rows = PP_train_data.iloc[-seq_length:].copy()

# test_data의 0번째 행 앞에 last_82_rows를 추가
# 컬럼 수가 다르므로 일치시키기 위해 마지막 열을 제거합니다.
last_rows_trimmed = last_rows.iloc[:, :-1]

# test_data와 동일한 컬럼 수로 맞춘 데이터프레임을 새로 생성
PP_test_data = pd.concat([last_rows_trimmed, PP_test_data], ignore_index=True)

# 결과 출력
print(PP_test_data.shape)
# print(PP_test_data.head())

(9898, 200)


In [1969]:
print(PP_train_data.shape)
print(type(PP_train_data))
print(PP_test_data.shape)
print(type(PP_test_data))

# 전처리 된 data set 추출 
PP_train_data.to_csv('./PP_data/PP_train_data.csv', index=False)
PP_test_data.to_csv('./PP_data/PP_test_data.csv', index=False)

(39457, 201)
<class 'pandas.core.frame.DataFrame'>
(9898, 200)
<class 'pandas.core.frame.DataFrame'>


In [1970]:
# load data
train_set = np.loadtxt("./PP_data/PP_train_data.csv", delimiter=",")
test_set = np.loadtxt("./PP_data/PP_test_data.csv", delimiter=",")

print(train_set.shape)
print(test_set.shape)

(39457, 201)
(9898, 200)


In [1971]:
# make train-test dataset to input
trainX, trainY = build_dataset(train_set, seq_length)
print(trainX.shape)
print(trainY.shape)

print()

testX = build_dataset_test(test_set, seq_length)
print(testX.shape)

(39447, 10, 200)
(39447, 1)

(9888, 10, 200)


In [1972]:
# scaling data (정규화)
trainX, _, _ = minmax_scaler(trainX)
trainY, trainY_min, trainY_max = minmax_scaler(trainY)
print(trainX.shape)
print(trainY.shape)

print()

testX, _, _ = minmax_scaler(testX)
print(testX.shape)

(39447, 10, 200)
(39447, 1)

(9888, 10, 200)


In [1973]:
# convert to tensor
# MSE loss 사용할거라 Y data가 FloatTensor 타입이어도 괜찮음.
# 애초에 주식 종가를 LongTendor 타입으로 변환해서 label로 쓰기에는 무리.
trainX_tensor = torch.FloatTensor(trainX).to(device)
trainY_tensor = torch.FloatTensor(trainY).to(device)
print(trainX_tensor.shape)
print(trainY_tensor.shape)

print()

testX_tensor = torch.FloatTensor(testX).to(device)
print(testX_tensor.shape)

torch.Size([39447, 10, 200])
torch.Size([39447, 1])

torch.Size([9888, 10, 200])


In [1974]:
train_dataset = torch.utils.data.TensorDataset(trainX_tensor, trainY_tensor)

batch_size = 1024
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


In [1975]:
class Net(torch.nn.Module): # RNN 모델
    def __init__(self, input_dim, hidden_dim, output_dim, layers):
        super(Net, self).__init__()
        # LSTM 사용
        self.rnn = torch.nn.LSTM(input_dim, hidden_dim, num_layers=layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_dim, bias=True)
        torch.nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, x):
        x, _status = self.rnn(x)
        x = self.fc(x[:, -1])
        return x


net = Net(data_dim, hidden_dim, output_dim, 1).to(device)

In [1976]:
# loss & optimizer setting
criterion = torch.nn.MSELoss() # MSE 사용
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

In [1977]:
# start training
for i in range(iterations):
    for batch_idx, (batch_X, batch_Y) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = net(batch_X)
        loss = criterion(outputs, batch_Y)
        loss.backward()
        optimizer.step()
    print(i+1, loss.item())

1 0.6184326410293579
1 3.7209739685058594
1 0.7355270385742188
1 0.005181744694709778
1 0.15929530560970306
1 0.3059770464897156
1 0.3721890151500702
1 0.3627510666847229
1 0.32862725853919983
1 0.27913516759872437
1 0.22902272641658783
1 0.1796029806137085
1 0.13378500938415527
1 0.09590889513492584
1 0.0657758042216301
1 0.04218364134430885
1 0.02456279657781124
1 0.01230937521904707
1 0.005033011548221111
1 0.001796352444216609
1 0.001948365243151784
1 0.005659844726324081
1 0.011048361659049988
1 0.015434244647622108
1 0.019203243777155876
1 0.02184596285223961
1 0.021595265716314316
1 0.020551113411784172
1 0.016501668840646744
1 0.012960178777575493
1 0.008890136145055294
1 0.005657113157212734
1 0.0032882920932024717
1 0.001906386110931635
1 0.0015129207167774439
1 0.0017003670800477266
1 0.0023788968101143837
1 0.0024787981528788805
1 0.0033324642572551966
2 0.003783704712986946
2 0.004698282107710838
2 0.004690882749855518
2 0.0047365897335112095
2 0.004140859004110098
2 0.003

In [1978]:
# Predict on test data
net.eval()

#testX_tensor.cpu()
#net.cpu()

with torch.no_grad():
    predicted = net(testX_tensor).detach().cpu().numpy()

# 예측값, 레이블 역정규화 (Kaggle 제출을 위해)
predicted = inverse_minmax_scaler(predicted, trainY_min, trainY_max)
print(predicted.shape)
print(type(predicted))

(9888, 1)
<class 'numpy.ndarray'>


In [1979]:
predicted_df = pd.DataFrame(predicted, columns=["battery_output"])
predicted_df = predicted_df.round(3)
print(predicted_df.shape)
print(predicted_df.head())


(9888, 1)
   battery_output
0          -0.338
1          -0.334
2          -0.331
3          -0.324
4          -0.323


In [1980]:
# "id" 컬럼 추가
predicted_df["id"] = range(39457, 39457 + len(predicted_df))
# "id" 컬럼을 첫 번째 컬럼으로 이동
predicted_df = predicted_df[["id", "battery_output"]]
print(predicted_df.shape)

(9888, 2)


In [1981]:
# DataFrame을 CSV 파일로 저장
csv_file_path = "./submission/Prediction.csv"
predicted_df.to_csv(csv_file_path, index=False)