In [5]:
import os
import pandas as pd

# 1. Data Load

- setting the data path
- load the data with **pandas**

In [6]:
data_dir = '../data/'
k_gas_dir = os.path.join(data_dir, 'k-gas') # k-gas data directory
k_gas_df = pd.read_csv(os.path.join(k_gas_dir, 'Gas sales with temperature.csv'))

## show the data

In [7]:
print('k-gas data shape: ', k_gas_df.shape)
print()
print('k-gas data columns: ', k_gas_df.columns)
print()
print('k-gas data head: ', k_gas_df.head())

k-gas data shape:  (252, 21)

k-gas data columns:  Index(['Year', 'Month', 'Temperature', 'Gangwondo', 'Seoul', 'Gyeonggido',
       'Incheon', 'Gyeongsangnamdo', 'Gyeongsangbukdo', 'Gwangju', 'Daegu',
       'Daejeon', 'Busan', 'Sejong', 'Ulsan', 'Jeollanamdo', 'Jeollabukdo',
       'Jeju', 'Chungcheongnamdo', 'Chungcheongbukdo', 'Sum'],
      dtype='object')

k-gas data head:     Year  Month  Temperature  Gangwondo   Seoul  Gyeonggido  Incheon  \
0  2000      1        -1.20      16219  662424      363014   139454   
1  2000      2        -0.99      16280  689177      375937   145834   
2  2000      3         6.64      14105  566528      323145   121324   
3  2000      4        12.11      10364  396231      237437    91421   
4  2000      5        17.59       6843  262940      169495    67239   

   Gyeongsangnamdo  Gyeongsangbukdo  Gwangju  ...  Daejeon  Busan  Sejong  \
0            42129            55362    39465  ...    52992  85787       0   
1            42604            52863  

# 2. Split the data based on **your purpose**

- This dataset is sequential data, so we need to split the data based on `Year`.

- So, I split the data based on `Year`.

In [8]:
# Create train, validation, and test sets
train_df = k_gas_df[k_gas_df['Year'] < 2018]
val_df = k_gas_df[k_gas_df['Year'].isin([2018, 2019])]
test_df = k_gas_df[k_gas_df['Year'] == 2020]

# 3. Make the Model

- import torch library

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

- We will use **MLP** model.

In [10]:
# Define the MLP model
class My_MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(My_MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x

- set the **hyper-parameters**

In [11]:
features = ['Gangwondo', 'Seoul', 'Gyeonggido', 'Incheon', 'Gyeongsangnamdo', 'Gyeongsangbukdo', 'Gwangju', 'Daegu', 'Daejeon', 'Busan', 'Sejong', 'Ulsan', 'Jeollanamdo', 'Jeollabukdo', 'Jeju', 'Chungcheongnamdo', 'Chungcheongbukdo']

In [12]:
input_size = len(features)  # number of features
hidden_size = 50  # size of hidden state of RNN
output_size = 1  # output size
learning_rate = 0.001
batch_size = 8

- Check your `My_MLP` model

In [13]:
model = My_MLP(input_size, hidden_size, output_size)

In [14]:
print(model)

My_MLP(
  (fc1): Linear(in_features=17, out_features=50, bias=True)
  (bn1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=50, out_features=1, bias=True)
)


- Loss and Optimizer

In [15]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 3. Data Pre-processing (Cusmtom Dataset)

- A Year is 12 months, so I need to split the data based on `Month`.

- I split the data based on `Month`.

In [16]:
from torch.utils.data import TensorDataset, DataLoader

- Each data, we will adapt the `create_sequence` function.

## Convert the data to **torch tensor**.

In [18]:
X_train_tensor = torch.tensor(x_train).float()
y_train_tensor = torch.tensor(y_train).float()

In [19]:
print('X_train_tensor shape: ', X_train_tensor.shape)
print('Y_train_tensor shape: ', y_train_tensor.shape)

X_train_tensor shape:  torch.Size([216, 17])
Y_train_tensor shape:  torch.Size([216])


In [20]:
X_val_tensor = torch.tensor(x_val).float()

In [21]:
print('X_val_tensor shape: ', X_val_tensor.shape)
print('Y_val_tensor shape: ', Y_val_tensor.shape)

X_val_tensor shape:  torch.Size([24, 17])
Y_val_tensor shape:  torch.Size([24])


In [22]:
X_test_tensor = torch.tensor(x_test).float()
Y_test_tensor = torch.tensor(y_test).float()

In [23]:
print('X_test_tensor shape: ', X_test_tensor.shape)
print('Y_test_tensor shape: ', Y_test_tensor.shape)

X_test_tensor shape:  torch.Size([12, 17])
Y_test_tensor shape:  torch.Size([12])


# 5. Data Loader

- Check the data shape.

In [24]:
print('X_train_tensor shape: ', X_train_tensor.shape)
print('X_val_tensor shape: ', X_val_tensor.shape)
print('X_test_tensor shape: ', X_test_tensor.shape)

X_train_tensor shape:  torch.Size([216, 17])
X_val_tensor shape:  torch.Size([24, 17])
X_test_tensor shape:  torch.Size([12, 17])


## Create TensorDatasets

- We will use **TensorDataset** directly. Because, we already converted DataFrame to tensor and for using *mini-Batch*.

- But, In practice, we need to make **Custom Dataset** like 'GasDataset'.

In [25]:
train_data = TensorDataset(X_train_tensor, y_train_tensor)
val_data = TensorDataset(X_val_tensor, Y_val_tensor)
test_data = TensorDataset(X_test_tensor, Y_test_tensor)

## Create DataLoader

In [26]:
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

# 6. Training and Evaluation

In [27]:
# Training
for epoch in range(1000):  # number of epochs
    for inputs, labels in train_loader:
        model.train()
        optimizer.zero_grad()
        y_pred = model(inputs)
        loss = criterion(y_pred.squeeze(), labels)
        loss.backward()
        optimizer.step()

    # Evaluation
    if (epoch+1) % 100 == 0:
        model.eval()
        with torch.no_grad():
            val_losses = []
            for inputs, labels in val_loader:
                y_pred_val = model(inputs)
                val_loss = criterion(y_pred_val.squeeze(), labels)
                val_losses.append(val_loss.item())
        print(f'Epoch {epoch+1} | Train Loss: {loss.item()} | Validation Loss: {sum(val_losses) / len(val_losses)}')



Epoch 100 | Train Loss: 2.3675618171691895 | Validation Loss: 13.211014032363892
Epoch 200 | Train Loss: 10.703320503234863 | Validation Loss: 6.323081175486247
Epoch 300 | Train Loss: 3.464829683303833 | Validation Loss: 5.621517260869344
Epoch 400 | Train Loss: 18.29298973083496 | Validation Loss: 9.035704294840494
Epoch 500 | Train Loss: 3.0756070613861084 | Validation Loss: 11.997018496195475
Epoch 600 | Train Loss: 6.984744071960449 | Validation Loss: 15.576728185017904
Epoch 700 | Train Loss: 17.257841110229492 | Validation Loss: 6.588147163391113
Epoch 800 | Train Loss: 47.46575164794922 | Validation Loss: 10.206048647562662
Epoch 900 | Train Loss: 9.024078369140625 | Validation Loss: 6.076721111933391
Epoch 1000 | Train Loss: 3.566286087036133 | Validation Loss: 8.063810110092163


In [28]:
# Testing
model.eval()
with torch.no_grad():
    test_losses = []
    for inputs, labels in test_loader:
        y_pred_test = model(inputs)
        test_loss = criterion(y_pred_test.squeeze(), labels)
        test_losses.append(test_loss.item())
print(f'Test Loss: {sum(test_losses) / len(test_losses)}')

Test Loss: 37.465203285217285
