In [None]:
import os
import numpy as np
import pandas as pd
import time
import functools

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, BatchSampler
from torch.utils.data.dataset import Dataset
import torch.optim as optim
from sklearn.model_selection import KFold
import torch.utils.data as data_utils

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
df1 = pd.read_csv('/content/Jodhpur_1.csv')
df1 = df1[['PM2.5','PM10','SO2','NO2','CO','O3']]
df1.head()

Unnamed: 0,PM2.5,PM10,SO2,NO2,CO,O3
0,41.96,96.4,4.37,17.9,0.17,75.1
1,33.22,67.0,2.62,17.79,0.18,103.08
2,23.38,51.1,0.74,19.69,0.47,48.62
3,21.06,44.3,2.83,15.98,0.34,35.02
4,22.89,43.8,3.38,18.25,0.21,16.57


In [None]:
df2 = pd.read_csv('/content/Jodhpur_2.csv')
df2 = df2[['PM2.5','PM10','SO2','NO2','CO','O3']]
df3 = pd.read_csv('/content/Jodhpur_3.csv')
df3 = df3[['PM2.5','PM10','SO2','NO2','CO','O3']]
df4 = pd.read_csv('/content/Jodhpur_4.csv')
df4 = df4[['PM2.5','PM10','SO2','NO2','CO','O3']]

In [None]:
final_df = pd.concat([df1, df2, df3, df4], axis=0)

In [None]:
final_df.shape

(26425, 6)

In [None]:
class Trans:
    def __init__(self, data, name):
        self.min = max(0, np.percentile(data, 1))
        self.max = np.percentile(data, 99)
        self.base = self.max-self.min

    def transform(self, data, scale=True):
        _data = np.clip(data, self.min, self.max)
        if not scale:
            return _data
        return (_data-self.min)/self.base

class TransUtil:
    def __init__(self, data, exclude_cols=None):
        self.columns = data.columns
        self.exclude_cols = exclude_cols
        self.trans = {}
        for c in self.columns:
            if data[c].dtype not in [int, float]:
                print('column "{}" not init trans...'.format(c))
                continue

            if exclude_cols is None or (exclude_cols is not None and c not in exclude_cols):
                print('init trans column...', c)
                self.trans[c] = Trans(data[c].fillna(method='backfill').fillna(method='ffill'), c)

    def transform(self, data, col_name, scale=True):
        if self.exclude_cols is not None and col_name in self.exclude_cols:
            return data

        for t in self.trans:
            if t.startswith(col_name):
                return self.trans[t].transform(data, scale=scale)
        
        return data

In [None]:
trans_util = TransUtil(final_df, exclude_cols=None) # data standardization

init trans column... PM2.5
init trans column... PM10
init trans column... SO2
init trans column... NO2
init trans column... CO
init trans column... O3


In [None]:
def generate_xy_pair(final_df, seq_len, trans_util, columns_x, columns_y):
    data_x = pd.DataFrame()
    for c in columns_x:
        data_x[c] = trans_util.transform(final_df[c].fillna(final_df[c].median()), c)

    data_y = pd.DataFrame()
    for c in columns_y:
        data_y[c] = trans_util.transform(final_df[c].fillna(final_df[c].median()), c, scale=False)

    data_x = data_x.values
    data_y = data_y.values
    
    print(data_x.shape, data_y.shape)

    d_x = []
    d_y = []
    for i in range(len(data_x)-seq_len*2+1):
        _x = data_x[i:i+seq_len]
        _y = data_y[i+seq_len:i+seq_len+seq_len]

        assert len(_x) == len(_y) == seq_len, (_x, _y, _x.shape, _y.shape, i, len(data_x))

        d_x.append(_x.T)
        d_y.append(_y.T)

    return np.asarray(d_x).transpose((0, 2, 1)), np.asarray(d_y).transpose((0, 2, 1))

In [None]:
SEQ_LEN =168

In [None]:
COLUMNS_Y = (["PM2.5","PM10",	"SO2", "NO2",	"CO",	"O3"])
COLUMNS_X = COLUMNS_Y
COLUMNS_X, COLUMNS_Y

(['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3'],
 ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3'])

In [None]:
data_x, data_y = generate_xy_pair(final_df, seq_len=SEQ_LEN, trans_util=trans_util, columns_x=COLUMNS_Y, columns_y=COLUMNS_Y)

(26425, 6) (26425, 6)


In [None]:
data_x.shape, data_y.shape

((26090, 168, 6), (26090, 168, 6))

In [None]:
data_x[0], data_y[0]

(array([[0.12481404, 0.14591875, 0.14976392, 0.13948847, 0.03088803,
         0.54196938],
        [0.09712546, 0.09898208, 0.08791597, 0.1384102 , 0.03281853,
         0.75513799],
        [0.06595204, 0.07359797, 0.02147361, 0.15703481, 0.08880309,
         0.34022868],
        ...,
        [0.17943087, 0.26820951, 0.12785207, 0.11429624, 0.06756757,
         0.22511154],
        [0.20715113, 0.27762877, 0.1094744 , 0.08596724, 0.03281853,
         0.14961115],
        [0.20328613, 0.27778842, 0.08826939, 0.0878297 , 0.05019305,
         0.18343776]]),
 array([[5.503e+01, 1.458e+02, 7.500e-01, 1.152e+01, 7.200e-01, 2.063e+01],
        [5.728e+01, 1.508e+02, 2.350e+00, 9.920e+00, 6.100e-01, 1.747e+01],
        [4.825e+01, 1.535e+02, 3.890e+00, 1.318e+01, 5.400e-01, 3.069e+01],
        ...,
        [2.152e+01, 7.910e+01, 5.270e+00, 2.408e+01, 4.000e-01, 1.971e+01],
        [4.687e+01, 9.200e+01, 3.750e+00, 2.919e+01, 1.120e+00, 2.070e+01],
        [6.275e+01, 1.018e+02, 1.324e-01, 3.00

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.33, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17480, 168, 6), (8610, 168, 6), (17480, 168, 6), (8610, 168, 6))

In [None]:
X_train[0].shape

(168, 6)

In [None]:
X_train = np.array(X_train, dtype=np.float)
y_train = np.array(y_train, dtype=np.float)
X_test = np.array(X_test, dtype=np.float)
y_test = np.array(y_test, dtype=np.float)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_train = np.array(X_train, dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_train = np.array(y_train, dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_test = np.array(X_test, dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_test = np.array(y_test, dtype=np.float)


In [None]:
X_train[0], X_test[1], y_train[0], y_test[1]

(array([[0.08819161, 0.08780668, 0.37595069, 0.34867216, 0.02702703,
         0.39828246],
        [0.0737454 , 0.13570124, 0.41305946, 0.33308631, 0.02702703,
         0.39173046],
        [0.10004004, 0.19205717, 0.23423054, 0.16154392, 0.17567568,
         0.41367205],
        ...,
        [0.22628603, 0.24458153, 0.25578897, 0.2112422 , 0.15830116,
         0.26289982],
        [0.22628603, 0.24458153, 0.25578897, 0.2112422 , 0.15830116,
         0.26289982],
        [0.22628603, 0.24458153, 0.25578897, 0.2112422 , 0.15830116,
         0.26289982]]),
 array([[0.85894157, 0.8565143 , 0.57951879, 0.2112422 , 0.05984556,
         0.14214491],
        [0.61402119, 0.57281888, 0.44628064, 0.2112422 , 0.01544402,
         0.20903628],
        [0.48197771, 0.41604404, 0.16990868, 0.2112422 , 0.01544402,
         0.30464979],
        ...,
        [0.17265129, 0.22430617, 0.49752608, 0.58040143, 0.3030888 ,
         0.36323687],
        [0.2205519 , 0.27922526, 0.52120501, 0.5997122 , 0.330

In [None]:
class Tt(nn.Module):
    def __init__(self,
                 seq_len,
                 feature_size,
                 output_size,
                 device,
                 use_model='lstm',
                 hidden_size=576,
                 num_hidden_layers=6,
                 num_attention_heads=6,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                #  max_1=25,
                #  max_2=61,
                #  max_3=8,
                #  max_4=1441
                 ):
        super(Tt, self).__init__()

        self.device = device
        self.use_model = use_model
        self.feature_size = feature_size

        # If there is a corresponding time embedding, it can be used
        # self.th_embeddings = nn.Embedding(max_hour, hidden_size)
        # self.tm_embeddings = nn.Embedding(max_min, hidden_size)
        # self.td_embeddings = nn.Embedding(max_dow, hidden_size)
        # self.tt_embeddings = nn.Embedding(max_ts, hidden_size)

        # location code
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size).to(self.device)
        self.layer_norm = nn.LayerNorm(hidden_size).to(self.device)
        self.fc_inputs = nn.Linear(feature_size, hidden_size).to(self.device)

        encoder_layer = nn.TransformerEncoderLayer(
            hidden_size,
            num_attention_heads,
            intermediate_size,
            dropout=hidden_dropout_prob,
            activation=hidden_act)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers).to(self.device)

        self.lstm = torch.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=2).to(self.device)

        self.fc_output_1 = nn.Linear(hidden_size, hidden_size).to(self.device)
        self.fc_output_2 = nn.Linear(hidden_size, hidden_size).to(self.device)
        self.fc_output_3 = nn.Linear(hidden_size, output_size).to(self.device)

    def forward(self,
                inputs,
                # inputs_th=None,
                # inputs_tm=None,
                # inputs_td=None,
                # inputs_tt=None,
                position_ids=None,
                attention_mask=None):

        if position_ids is None:
            # print(inputs.shape[:2])
            ones = torch.ones(inputs.size()[:2], dtype=torch.long, device=self.device)
            seq_length = torch.cumsum(ones, axis=1)
            # seq_length = torch.mean(seq_length, axis=1)
            position_ids = seq_length - ones
            position_ids.stop_gradient = True
        
        # print("positionids",position_ids.size())
        position_embeddings = self.position_embeddings(position_ids)

        # print(self.fc_inputs.weight.dtype)
        inputs = self.fc_inputs(inputs)
        inputs = nn.Tanh()(inputs)

        #print(position_embeddings.size())
        inputs = inputs + position_embeddings

        # # If there is a corresponding time embedding, it can be used
        # if inputs_th is not None:
        #     inputs += self.th_embeddings(inputs_th)
        
        # if inputs_tm is not None:
        #     inputs += self.tm_embeddings(inputs_tm)

        # if inputs_td is not None:
        #     inputs += self.td_embeddings(inputs_td)

        # if inputs_tt is not None:
        #     inputs += self.tt_embeddings(inputs_tt)

        inputs = self.layer_norm(inputs)

        # Choose to use LSTM or Transformer
        if self.use_model == 'lstm':
            encoder_outputs, (h, c) = self.lstm(inputs)
        elif self.use_model == 'transformer':
            if attention_mask is None:
                attention_mask = torch.unsqueeze(
                    (torch.zeros(inputs.shape[:2])).astype(
                        self.fc_inputs.weight.dtype) * -1e4,
                    axis=[1, 2])

            encoder_outputs = self.encoder(
                inputs,
                src_mask=attention_mask)

        output = self.fc_output_1(encoder_outputs)
        output = nn.ReLU()(output)
        output = self.fc_output_2(output)
        output = self.fc_output_3(output)

        return output


In [None]:
SEQ_LEN = 168
FEATURE_SIZE = 6
OUTPUT_SIZE = 6
model = Tt(seq_len=SEQ_LEN, feature_size=FEATURE_SIZE, output_size=OUTPUT_SIZE, device=DEVICE)

In [None]:
x = torch.randn((64, 168, 6))
# x_numpy = x.detach().numpy()
y = model(inputs=x.to(DEVICE))
y.size()

torch.Size([64, 168, 6])

In [None]:
# xt = torch.ones(x_numpy.shape[:2], dtype=torch.long)
y.size()

torch.Size([64, 168, 6])

In [None]:
def calc_score(y_true, y_pred):
    y_true = np.nan_to_num(y_true)
    y_pred = np.nan_to_num(y_pred)
    return 1/ (1+msle(np.clip(np.reshape(y_true, -1), 0, None), np.clip(np.reshape(y_pred, -1), 0, None)))

def eval_model(model, data_loader):
    model.eval()

    y_pred = []
    y_true = []
    for step, (data, label) in enumerate(data_loader, start=1):
        # print(batch)
        # print(batch[0].shape)
        data = data.to(torch.float32).to(DEVICE)
        label = label.to(torch.float32).to(DEVICE)

        # Computational model output
        output = model(inputs=data)
        y_pred.extend(output.cpu().detach().numpy())
        y_true.extend(label.cpu().detach().numpy())
    
    score = calc_score(y_true, y_pred)
    model.train()
    return score

# def make_data_loader(data_x, idx, batch_size, data_y=None, shuffle=False):
#     data = [{'data': torch.Tensor(data_x[i]), 'label': 0 if data_y is None else data_y[i]} for i in idx]
#     ds = MapDataset(data)
#     batch_sampler = BatchSampler(ds, batch_size=batch_size, shuffle=shuffle)
#     return DataLoader(dataset=ds, batch_sampler=batch_sampler)

In [None]:
from torch.utils.data import Dataset
import torchvision.transforms as T

class CustomDataset(Dataset):
  def __init__(self, X=X_train, y=y_train):
    self.X = X
    self.y = y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, index):
      return torch.tensor(self.X[index], requires_grad=True).type(torch.float), torch.tensor(self.y[index], requires_grad=True).type(torch.float)

In [None]:
len(y_train)

17480

In [None]:
dataset = CustomDataset()
x, y = dataset[0]
x

tensor([[0.0882, 0.0878, 0.3760, 0.3487, 0.0270, 0.3983],
        [0.0737, 0.1357, 0.4131, 0.3331, 0.0270, 0.3917],
        [0.1000, 0.1921, 0.2342, 0.1615, 0.1757, 0.4137],
        ...,
        [0.2263, 0.2446, 0.2558, 0.2112, 0.1583, 0.2629],
        [0.2263, 0.2446, 0.2558, 0.2112, 0.1583, 0.2629],
        [0.2263, 0.2446, 0.2558, 0.2112, 0.1583, 0.2629]],
       grad_fn=<ToCopyBackward0>)

In [None]:
EPOCHS = 10 # 30 epochs they used
BATCH_SIZE = 256
CKPT_DIR = 'work/output'
K_FOLD = 5
epoch_base = 0
step_eval = 1
step_log = 1

def do_train(X_train, y_train, prefix):
    print('-'*5)
    print('training ...', prefix)
    print('train x:', X_train.shape, 'train y:', y_train.shape)

    torch.manual_seed(2022)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for kfold, tv_idx in enumerate(KFold(n_splits=K_FOLD, shuffle=True, random_state=2022).split(X_train)):
        print('training fold...', kfold)

        train_idx, valid_idx = tv_idx

        model = Tt(seq_len=SEQ_LEN, feature_size=FEATURE_SIZE, output_size=OUTPUT_SIZE, device=DEVICE)
        model = model.to(device)

        # Assuming BATCH_SIZE is defined somewhere else in the code
        train_dataset = CustomDataset(X_train, y_train)
        valid_dataset = CustomDataset(X_test, y_test)        

        # train_dataset = MyDataset(train_x[train_idx], train_y[train_idx])
        # valid_dataset = MyDataset(train_x[valid_idx], train_y[valid_idx])

        train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        valid_data_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

        optimizer = optim.AdamW(model.parameters(), lr=1e-4)
        criterion = nn.MSELoss()

        epochs = EPOCHS # training rounds
        save_dir = CKPT_DIR #Folder to save model parameters during training
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        global_step = 0 #iterations
        tic_train = time.time()

        model.train()

        best_score = 0
        for epoch in range(1+epoch_base, epochs+epoch_base+1):
            for step, (data, label) in enumerate(train_data_loader, start=1):
                data = data.to(torch.float32).to(device)
                label = label.to(device)

                # Computational model output
                # print(data.dtype)
                output = model(inputs=data)
                loss = criterion(output, label)
                # print(loss)

                # Print loss function value, accuracy rate, calculation speed
                global_step += 1
                if global_step % step_eval == 0:
                    score = eval_model(model, valid_data_loader)            
                    if score > best_score:
                        # print('saving best model...', score)
                        save_path = os.path.join(save_dir, f'{prefix}_kfold_{kfold}_best_model.pth')
                        torch.save(model.state_dict(), save_path)
                        best_score = score
                    if global_step % step_log == 0:
                        print(
                            'global step %d, epoch: %d, batch: %d, loss: %.5f, valid score: %.5f, speed: %.2f step/s'
                            % (global_step, epoch, step, loss.item(), score,
                                10 / (time.time() - tic_train)))
                        tic_train = time.time()

                # Reverse gradient return, update parameters
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

# class Tt(nn.Module):
#     def __init__(self, seq_len, feature_size, output_size):
#         super(Tt, self).__init__()
#         self.seq_len = seq_len
#         self.feature_size = feature_size
#         self.output_size = output_size

#         self.fc1 = nn.Linear(seq_len * feature_size, 512)
#         self.fc2 = nn.Linear(512, 256)
#         self.fc3 = nn.Linear(256, output_size)

#     def forward(self, inputs):
#         x = inputs.view(-1, self.seq_len * self.feature_size)


In [None]:
def do_pred(test_x, prefix):
    print('-'*6)
    print('predict ...', prefix)
    print('predict x:', test_x.shape)

    # predict
    test_dataset = CustomDataset(X_test,y_test)
    test_data_loader = DataLoader(test_dataset, BATCH_SIZE, pin_memory=True, num_workers = 2)

    sub_df = []
    save_dir = CKPT_DIR

    for kfold in range(K_FOLD):
        print('predict kfold...', kfold)
        model = Tt(seq_len=SEQ_LEN, feature_size=FEATURE_SIZE, output_size=OUTPUT_SIZE, device=DEVICE)
        model.load_state_dict(torch.load(os.path.join(save_dir, '{}_kfold_{}_best_model.pth'.format(prefix, kfold))))
        model = model.to(DEVICE)
        model.eval()

        y_pred = []
        with torch.no_grad():
            for step, (data, label) in enumerate(test_data_loader, start=1):
                data = data.to(torch.float).to(DEVICE)
                label = label.to(torch.float).to(DEVICE)

                # Computational model output
                output = model(inputs=data)
                y_pred.extend(output.cpu().numpy())

        sub_df.append(np.clip(y_pred, 0, None))

    return sub_df


In [None]:
# Train the model corresponding to each test set in turn
do_train(X_train, y_train, 'm1')

-----
training ... m1
train x: (17480, 168, 6) train y: (17480, 168, 6)
training fold... 0
global step 1, epoch: 1, batch: 1, loss: 9461.26465, valid score: 0.07532, speed: 1.51 step/s
global step 2, epoch: 1, batch: 2, loss: 9205.20508, valid score: 0.07555, speed: 1.37 step/s
global step 3, epoch: 1, batch: 3, loss: 9730.97266, valid score: 0.07590, speed: 1.43 step/s
global step 4, epoch: 1, batch: 4, loss: 9878.49121, valid score: 0.07629, speed: 1.45 step/s
global step 5, epoch: 1, batch: 5, loss: 9693.14941, valid score: 0.07670, speed: 1.40 step/s
global step 6, epoch: 1, batch: 6, loss: 9772.27051, valid score: 0.07717, speed: 1.35 step/s
global step 7, epoch: 1, batch: 7, loss: 9179.75879, valid score: 0.07770, speed: 1.37 step/s
global step 8, epoch: 1, batch: 8, loss: 9684.18066, valid score: 0.07834, speed: 1.40 step/s
global step 9, epoch: 1, batch: 9, loss: 9526.07812, valid score: 0.07912, speed: 1.36 step/s
global step 10, epoch: 1, batch: 10, loss: 9872.28223, valid sc

In [1]:
pred_1 = do_pred(X_test, 'm1')

NameError: ignored

In [None]:
pred_1