#Data preprocessed

In [2]:
import os
import numpy as np
import pandas as pd
import time
import functools

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, BatchSampler
from torch.utils.data import Dataset
from torch.utils.data.sampler import BatchSampler
import torch.optim as optim
from sklearn.model_selection import KFold
import torch.utils.data as data_utils


from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [3]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
df_sub2 = pd.read_csv('/content/sample_submission.csv')

In [None]:
df2 = pd.read_csv('/content/file_name.csv')
#df2=df2.drop('Unnamed: 8',axis=1)
df2.head()

In [None]:
df_test2 = pd.read_csv('/content/test_public.csv')

In [None]:
df_test2.head()

In [None]:
SEQ_LEN = 168

In [None]:
test_list1 = df_test2.groupby('train or test')['time'].first().reset_index()
test_list1 = test_list1['time'].values.tolist()
test_list2 = df_test2.groupby('train or test')['time'].last().reset_index()
test_list2 = test_list2['time'].values.tolist()
test_list1.extend(test_list2)
test_list1.sort()
test_list1

In [None]:
COLUMNS_Y = (["PM2.5","PM10",	"SO2", "NO2",	"CO",	"O3"])
COLUMNS_X = COLUMNS_Y + ['day', 'hour', 'dayofweek']
COLUMNS_X, COLUMNS_Y

In [None]:
COLUMNS_Y = (["PM2.5","PM10",	"SO2", "NO2",	"CO",	"O3"])
COLUMNS_X = COLUMNS_Y + ['day', 'hour', 'dayofweek']
COLUMNS_X, COLUMNS_Y

In [None]:
def add_time_feat(data):
    data['time'] = pd.to_datetime(data['time'])
    data['day'] = data['time'].dt.day
    data['hour'] = data['time'].dt.hour
    data['minute'] = data['time'].dt.minute
    data['dayofweek'] = data['time'].dt.dayofweek
    return data.sort_values('time').reset_index(drop=True)

def add_other_feat(data, columns):
    data['flow_sum'] = data[columns].sum()
    data['flow_median'] = data[columns].median()
    data['flow_mean'] = data[columns].mean()
    return data

In [None]:
df2 = add_time_feat(df2)

In [None]:
df2.head()

In [None]:
df2.shape

In [None]:
class Trans:
    def __init__(self, data, name):
        self.min = max(0, np.percentile(data, 1))
        self.max = np.percentile(data, 99)
        self.base = self.max-self.min

    def transform(self, data, scale=True):
        _data = np.clip(data, self.min, self.max)
        if not scale:
            return _data
        return (_data-self.min)/self.base

class TransUtil:
    def __init__(self, data, exclude_cols=None):
        self.columns = data.columns
        self.exclude_cols = exclude_cols
        self.trans = {}
        for c in self.columns:
            if data[c].dtype not in [int, float]:
                print('column "{}" not init trans...'.format(c))
                continue

            if exclude_cols is None or (exclude_cols is not None and c not in exclude_cols):
                print('init trans column...', c)
                self.trans[c] = Trans(data[c].fillna(method='backfill').fillna(method='ffill'), c)

    def transform(self, data, col_name, scale=True):
        if self.exclude_cols is not None and col_name in self.exclude_cols:
            return data

        for t in self.trans:
            if t.startswith(col_name):
                return self.trans[t].transform(data, scale=scale)
        
        return data

In [None]:
trans_util = TransUtil(df2, exclude_cols=None) # data standardization

In [None]:
def generate_xy_pair(final_df, seq_len, trans_util, columns_x, columns_y):
    data_x = pd.DataFrame()
    for c in columns_x:
        data_x[c] = trans_util.transform(final_df[c].fillna(final_df[c].median()), c)

    data_y = pd.DataFrame()
    for c in columns_y:
        data_y[c] = trans_util.transform(final_df[c].fillna(final_df[c].median()), c, scale=False)

    data_x = data_x.values
    data_y = data_y.values
    
    print(data_x.shape, data_y.shape)

    d_x = []
    d_y = []
    for i in range(len(data_x)-seq_len*2+1):
        _x = data_x[i:i+seq_len]
        _y = data_y[i+seq_len:i+seq_len+seq_len]

        assert len(_x) == len(_y) == seq_len, (_x, _y, _x.shape, _y.shape, i, len(data_x))

        d_x.append(_x.T)
        d_y.append(_y.T)

    return np.asarray(d_x).transpose((0, 2, 1)), np.asarray(d_y).transpose((0, 2, 1))

In [None]:
data_x, data_y = generate_xy_pair(df2, seq_len=SEQ_LEN, trans_util=trans_util, columns_x=COLUMNS_X, columns_y=COLUMNS_Y)

In [None]:
data_x.shape, data_y.shape

In [None]:
data_x[0], data_y[0]

In [None]:
# Extract the idx of the corresponding training data/test data according to each test set
_train_idx_1 = df2[df2['time']<test_list1[0]].index.values.tolist()
_train_idx_2 = df2[(df2['time']>test_list1[1])&(df2['time']<test_list1[2])].index.values.tolist()
_train_idx_3 = df2[(df2['time']>test_list1[3])&(df2['time']<test_list1[4])].index.values.tolist()

# Define the three training data periods
train_idx_1 = _train_idx_1[:-SEQ_LEN*2]
train_idx_2 = train_idx_1 + _train_idx_2[:-SEQ_LEN*2]
train_idx_3 = train_idx_2 + _train_idx_3[:-SEQ_LEN*2]

#Define the three test data periods
test_idx_1 = _train_idx_1[-SEQ_LEN]
test_idx_2 = _train_idx_2[-SEQ_LEN]
test_idx_3 = _train_idx_3[-SEQ_LEN]

In [None]:
len(_train_idx_1), len(_train_idx_2), len(_train_idx_3)#, len(_train_idx_4)

In [None]:
len(train_idx_1), len(train_idx_2), len(train_idx_3)#, len(train_idx_4)

In [None]:
test_idx_1, test_idx_2, test_idx_3#, test_idx_4

In [None]:
train_x_1 = data_x[train_idx_1]
train_y_1 = data_y[train_idx_1]
train_x_2 = data_x[train_idx_2]
train_y_2 = data_y[train_idx_2]
train_x_3 = data_x[train_idx_3]
train_y_3 = data_y[train_idx_3]

test_x_1 = data_x[test_idx_1]
test_x_2 = data_x[test_idx_2]
test_x_3 = data_x[test_idx_3]

FEATURE_SIZE = train_x_1.shape[-1]
OUTPUT_SIZE = train_y_1.shape[-1]

In [None]:
class Tt(nn.Module):
    def __init__(self,
                 seq_len,
                 feature_size,
                 output_size,
                 device,
                 use_model='lstm',
                 hidden_size=576,
                 num_hidden_layers=6,
                 num_attention_heads=6,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 ):
        super(Tt, self).__init__()

        self.device = device
        self.use_model = use_model
        self.feature_size = feature_size

        # location code
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size).to(self.device)
        self.layer_norm = nn.LayerNorm(hidden_size).to(self.device)
        self.fc_inputs = nn.Linear(feature_size, hidden_size).to(self.device)

        encoder_layer = nn.TransformerEncoderLayer(
            hidden_size,
            num_attention_heads,
            intermediate_size,
            dropout=hidden_dropout_prob,
            activation=hidden_act)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers).to(self.device)

        self.lstm = torch.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=2).to(self.device)

        self.fc_output_1 = nn.Linear(hidden_size, hidden_size).to(self.device)
        self.fc_output_2 = nn.Linear(hidden_size, hidden_size).to(self.device)
        self.fc_output_3 = nn.Linear(hidden_size, output_size).to(self.device)

    def forward(self,
                inputs,
                position_ids=None,
                attention_mask=None):

        if position_ids is None:
            # print(inputs.shape[:2])
            ones = torch.ones(inputs.size()[:2], dtype=torch.long, device=self.device)
            seq_length = torch.cumsum(ones, axis=1)
            # seq_length = torch.mean(seq_length, axis=1)
            position_ids = seq_length - ones
            position_ids.stop_gradient = True
        
        # print("positionids",position_ids.size())
        position_embeddings = self.position_embeddings(position_ids)

        # print(self.fc_inputs.weight.dtype)
        inputs = self.fc_inputs(inputs)
        inputs = nn.Tanh()(inputs)

        #print(position_embeddings.size())
        inputs = inputs + position_embeddings

        inputs = self.layer_norm(inputs)

        # Choose to use LSTM or Transformer
        if self.use_model == 'lstm':
            encoder_outputs, (h, c) = self.lstm(inputs)
        elif self.use_model == 'transformer':
            if attention_mask is None:
                attention_mask = torch.unsqueeze(
                    (torch.zeros(inputs.shape[:2])).astype(
                        self.fc_inputs.weight.dtype) * -1e4,
                    axis=[1, 2])

            encoder_outputs = self.encoder(
                inputs,
                src_mask=attention_mask)

        output = self.fc_output_1(encoder_outputs)
        output = nn.ReLU()(output)
        output = self.fc_output_2(output)
        output = self.fc_output_3(output)

        return output


In [None]:
SEQ_LEN = 168
FEATURE_SIZE = 9
OUTPUT_SIZE = 6
model = Tt(seq_len=SEQ_LEN, feature_size=FEATURE_SIZE, output_size=OUTPUT_SIZE, device=DEVICE)

In [None]:
def calc_score(y_true, y_pred):
    y_true = np.nan_to_num(y_true)
    y_pred = np.nan_to_num(y_pred)
    return 1/ (1+msle(np.clip(np.reshape(y_true, -1), 0, None), np.clip(np.reshape(y_pred, -1), 0, None)))

def eval_model(model, data_loader):
    model.eval()

    y_pred = []
    y_true = []
    for step, (data, label) in enumerate(data_loader, start=1):
        data = data.to(torch.float32).to(DEVICE)
        label = label.to(torch.float32).to(DEVICE)

        # Computational model output
        output = model(inputs=data)
        y_pred.extend(output.cpu().detach().numpy())
        y_true.extend(label.cpu().detach().numpy())
    
    score = calc_score(y_true, y_pred)
    model.train()
    return score

In [None]:
# class CustomDataset(Dataset):
#     def __init__(self, data_x, index, data_y=None):
#         self.data_x = data_x
#         self.index = index
#         self.data_y = data_y

#     def __len__(self):
#         return len(self.index)

#     def __getitem__(self, idx):
#         x = self.data_x[self.index[idx]]
#         if self.data_y is not None:
#             y = self.data_y[self.index[idx]]
#         else:
#             y = None
#         return x, y

In [None]:
# def create_dataloader(data_x, index, batch_size, data_y=None, shuffle=True):
#     data = [{
#         'data': data_x[i], 
#         'label': 0 if data_y is None else data_y[i]} 
#         for i in idx]
#     if data_y is not None:
#         dataset = CustomDataset(data_x, index, data_y)
#     else:
#         dataset = CustomDataset(data_x, index, None)
#     dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
#     return dataloader

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data_x, index, data_y):
        self.data_x = data_x
        self.index = index
        self.data_y = data_y

    def __len__(self):
        return len(self.index)

    def __getitem__(self, idx):
        x = self.data_x[self.index[idx]]
        if self.data_y is not None:
            y = self.data_y[self.index[idx]]
            return x, y
        else:
            return x

def create_dataloader(data_x, index, batch_size, data_y=None, shuffle=True):
    data = [{
        'data': data_x[i], 
        'label': 0 if data_y is None else data_y[i]} 
        for i in index]
    dataset = CustomDataset(data_x, index, data_y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader


In [None]:
EPOCHS = 30 # 30 epochs they used
BATCH_SIZE = 256
CKPT_DIR = 'work/output'
K_FOLD = 5
epoch_base = 0
step_eval = 1
step_log = 1

def do_train(train_x, train_y, prefix):
    print('-'*5)
    print('training ...', prefix)
    print('train x:', train_x.shape, 'train y:', train_y.shape)

    torch.manual_seed(2022)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for kfold, tv_idx in enumerate(KFold(n_splits=K_FOLD, shuffle=True, random_state=2022).split(train_x)):
        print('training fold...', kfold)

        train_idx, valid_idx = tv_idx

        model = Tt(seq_len=SEQ_LEN, feature_size=FEATURE_SIZE, output_size=OUTPUT_SIZE, device=DEVICE)
        model = model.to(device)

        train_data_loader = create_dataloader(
            train_x, train_idx, BATCH_SIZE, data_y=train_y, shuffle=True)
        valid_data_loader = create_dataloader(
            train_x, valid_idx, BATCH_SIZE, data_y=train_y, shuffle=False)

        optimizer = optim.AdamW(model.parameters(), lr=1e-4)
        criterion = nn.MSELoss()

        epochs = EPOCHS # training rounds
        save_dir = CKPT_DIR #Folder to save model parameters during training
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        global_step = 0 #iterations
        tic_train = time.time()

        model.train()

        best_score = 0
        for epoch in range(1+epoch_base, epochs+epoch_base+1):
            for step, (data, label) in enumerate(train_data_loader, start=1):
                data = data.to(torch.float32).to(device)
                label = label.to(torch.float32).to(device)

                # Computational model output
                # print(data.dtype)
                output = model(inputs=data)
                loss = criterion(output, label)
                # print(loss)

                # Print loss function value, accuracy rate, calculation speed
                global_step += 1
                if global_step % step_eval == 0:
                    score = eval_model(model, valid_data_loader)            
                    if score > best_score:
                        # print('saving best model...', score)
                        save_path = os.path.join(save_dir, f'{prefix}_kfold_{kfold}_best_model.pth')
                        torch.save(model.state_dict(), save_path)
                        best_score = score
                    if global_step % step_log == 0:
                        print(
                            'global step %d, epoch: %d, batch: %d, loss: %.5f, valid score: %.5f, speed: %.2f step/s'
                            % (global_step, epoch, step, loss.item(), score,
                                10 / (time.time() - tic_train)))
                        tic_train = time.time()

                # Reverse gradient return, update parameters
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

In [None]:
def do_pred(test_x, prefix):
    print('-'*6)
    print('predict ...', prefix)
    print('predict x:', test_x.shape)

    # predict
    test_data_loader = create_dataloader(
            [test_x], [0], BATCH_SIZE, data_y=None, shuffle=False)

    sub_df = []
    save_dir = CKPT_DIR

    for kfold in range(K_FOLD):
        print('predict kfold...', kfold)
        model = Tt(seq_len=SEQ_LEN, feature_size=FEATURE_SIZE, output_size=OUTPUT_SIZE, device=DEVICE)
        model.load_state_dict(torch.load(os.path.join(save_dir, '{}_kfold_{}_best_model.pth'.format(prefix, kfold))))
        model = model.to(DEVICE)
        model.eval()

        y_pred = []
        with torch.no_grad():
            for step, (data, label) in enumerate(test_data_loader, start=1):
                data = data.to(torch.float).to(DEVICE)
                label = label.to(torch.float).to(DEVICE)

                # Computational model output
                output = model(inputs=data)
                y_pred.extend(output.cpu().numpy())

        sub_df.append(np.clip(y_pred, 0, None))

    return sub_df

In [None]:
# Train the model corresponding to each test set in turn
do_train(train_x_1, train_y_1, 'm1')
do_train(train_x_2, train_y_2, 'm2')
do_train(train_x_3, train_y_3, 'm3')

-----
training ... m1
train x: (2084, 168, 9) train y: (2084, 168, 6)
training fold... 0
global step 1, epoch: 1, batch: 1, loss: 5725.63330, valid score: 0.08200, speed: 11.29 step/s
global step 2, epoch: 1, batch: 2, loss: 5907.46777, valid score: 0.08233, speed: 7.54 step/s
global step 3, epoch: 1, batch: 3, loss: 6074.21045, valid score: 0.08280, speed: 7.59 step/s
global step 4, epoch: 1, batch: 4, loss: 5916.11133, valid score: 0.08329, speed: 7.65 step/s
global step 5, epoch: 1, batch: 5, loss: 5814.62354, valid score: 0.08382, speed: 7.59 step/s
global step 6, epoch: 1, batch: 6, loss: 5775.87646, valid score: 0.08441, speed: 7.17 step/s
global step 7, epoch: 1, batch: 7, loss: 6083.91309, valid score: 0.08508, speed: 7.47 step/s
global step 8, epoch: 2, batch: 1, loss: 6010.03320, valid score: 0.08586, speed: 8.32 step/s
global step 9, epoch: 2, batch: 2, loss: 5869.51611, valid score: 0.08680, speed: 6.47 step/s
global step 10, epoch: 2, batch: 3, loss: 5794.95654, valid scor

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data_x, index, data_y):
        self.data_x = data_x
        self.index = index
        self.data_y = data_y

    def __len__(self):
        return len(self.index)

    def __getitem__(self, idx):
        x = self.data_x[self.index[idx]]
        y = self.data_y[self.index[idx]]
        return x, y

def create_dataloader(data_x, index, batch_size, data_y=None, shuffle=True):
    data = [{
        'data': data_x[i], 
        'label': 0 if data_y is None else data_y[i]} 
        for i in index]
    dataset = CustomDataset(data_x, index, [data[0]['label']])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader

In [None]:
# predict the data
pred_1 = do_pred(test_x_1, 'm1')
pred_2 = do_pred(test_x_2, 'm2')
pred_3 = do_pred(test_x_3, 'm3')

------
predict ... m1
predict x: (168, 9)
predict kfold... 0
predict kfold... 1
predict kfold... 2
predict kfold... 3
predict kfold... 4
------
predict ... m2
predict x: (168, 9)
predict kfold... 0
predict kfold... 1
predict kfold... 2
predict kfold... 3
predict kfold... 4
------
predict ... m3
predict x: (168, 9)
predict kfold... 0
predict kfold... 1
predict kfold... 2
predict kfold... 3
predict kfold... 4


In [None]:
pred_1

In [None]:
pred_2

[array([[[ 5.7126174 , 11.432822  ,  0.74037045,  2.012467  ,
           0.1133861 ,  2.83134   ],
         [ 7.512301  , 15.097921  ,  0.9574025 ,  2.6407466 ,
           0.13524534,  3.7198822 ],
         [ 5.7220187 , 11.4822855 ,  0.7426857 ,  2.015801  ,
           0.11239999,  2.847542  ],
         ...,
         [12.260164  , 24.71205   ,  1.539854  ,  4.317631  ,
           0.1793435 ,  6.0706224 ],
         [15.031891  , 30.293568  ,  1.8744619 ,  5.289918  ,
           0.22183798,  7.427077  ],
         [ 8.806782  , 17.700756  ,  1.1224313 ,  3.1013658 ,
           0.15126386,  4.3564687 ]]], dtype=float32),
 array([[[ 8.67934   , 17.02969   ,  1.1043988 ,  3.0831711 ,
           0.12959509,  4.272644  ],
         [ 9.806995  , 19.239462  ,  1.2361174 ,  3.4675603 ,
           0.13783985,  4.82109   ],
         [ 8.548341  , 16.771044  ,  1.0922241 ,  3.0246313 ,
           0.12258688,  4.208567  ],
         ...,
         [17.577496  , 34.469547  ,  2.1572995 ,  6.2058973 ,
 

In [None]:
pred_3

[array([[[ 7.2866135 , 14.600386  ,  0.9471701 ,  2.586327  ,
           0.1274373 ,  3.6428785 ],
         [ 9.54134   , 19.182636  ,  1.2247956 ,  3.3812757 ,
           0.1540589 ,  4.766201  ],
         [ 7.1971927 , 14.450498  ,  0.93819463,  2.5538664 ,
           0.12619999,  3.6092045 ],
         ...,
         [11.033541  , 22.209675  ,  1.411948  ,  3.917984  ,
           0.16225484,  5.52308   ],
         [13.393649  , 26.952156  ,  1.7012949 ,  4.7516913 ,
           0.19811055,  6.6881866 ],
         [ 9.680764  , 19.448484  ,  1.2498126 ,  3.4359608 ,
           0.1581724 ,  4.834304  ]]], dtype=float32),
 array([[[11.982237  , 23.59045   ,  1.3857777 ,  3.7512968 ,
           0.11408469,  5.598055  ],
         [11.994546  , 23.607393  ,  1.3816817 ,  3.750216  ,
           0.11051969,  5.6097593 ],
         [14.546179  , 28.645458  ,  1.6695065 ,  4.547466  ,
           0.13951106,  6.801162  ],
         ...,
         [18.129778  , 35.708668  ,  2.072081  ,  5.6653743 ,
 

In [None]:
result = np.vstack((
    np.mean(pred_1, axis=0).squeeze(),
    np.mean(pred_2, axis=0).squeeze(),
    np.mean(pred_3, axis=0).squeeze()))

result[result<0] = 0
result = pd.concat([df_sub2['time'], pd.DataFrame(result)], axis=1)
result.columns = df_sub2.columns
result.to_csv('/content/sample_data/result_0929_1.csv', index=False, encoding='utf-8')
result

Unnamed: 0,time,PM2.5,PM10,SO2,NO2,CO,O3
0,2020-05-01 01:00,12.544618,25.014267,1.424340,4.664826,0.192623,6.490225
1,2020-05-01 02:00,13.804285,27.509327,1.566731,5.134905,0.206922,7.134004
2,2020-05-01 03:00,13.611275,27.106724,1.523612,5.000238,0.201798,6.969096
3,2020-05-01 04:00,14.629138,29.265095,1.663809,5.417943,0.217221,7.560815
4,2020-05-01 05:00,12.811696,25.552176,1.460434,4.723420,0.191951,6.567334
...,...,...,...,...,...,...,...
499,2020-07-07 20:00,14.988207,30.180338,1.745675,5.104405,0.188626,7.432242
500,2020-07-07 21:00,13.750424,27.757803,1.608932,4.705729,0.176000,6.858172
501,2020-07-07 22:00,15.496849,31.198059,1.795986,5.253649,0.194991,7.687603
502,2020-07-07 23:00,13.126920,26.449692,1.525295,4.480589,0.169580,6.526965


In [None]:
result.to_csv('/content/drive/MyDrive/Data Science/prediction_0929_1.csv', index=False, encoding='utf-8')

In [None]:
from google.colab import drive
drive.mount('/content/drive')