# Air pollution GRU and LSTM model

In [180]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.preprocessing import MinMaxScaler

In [166]:
pollution_df  = pd.read_csv("data/LSTM-Multivariate_pollution.csv")

In [114]:
pollution_df.head()

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
0,2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
1,2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2,2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
3,2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
4,2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0


In [117]:
pollution_df.shape

(43800, 9)

In [116]:
pollution_df.dtypes

date          object
pollution    float64
dew            int64
temp         float64
press        float64
wnd_dir       object
wnd_spd      float64
snow           int64
rain           int64
dtype: object

In [121]:
pollution_df.describe()

Unnamed: 0,pollution,dew,temp,press,wnd_spd,snow,rain
count,43800.0,43800.0,43800.0,43800.0,43800.0,43800.0,43800.0
mean,94.013516,1.828516,12.459041,1016.447306,23.894307,0.052763,0.195023
std,92.252276,14.429326,12.193384,10.271411,50.022729,0.760582,1.416247
min,0.0,-40.0,-19.0,991.0,0.45,0.0,0.0
25%,24.0,-10.0,2.0,1008.0,1.79,0.0,0.0
50%,68.0,2.0,14.0,1016.0,5.37,0.0,0.0
75%,132.25,15.0,23.0,1025.0,21.91,0.0,0.0
max,994.0,28.0,42.0,1046.0,585.6,27.0,36.0


In [167]:
pollution_df.wnd_dir.unique()

array(['SE', 'cv', 'NW', 'NE'], dtype=object)

In [124]:
pollution_df.isna().sum()

date         0
pollution    0
dew          0
temp         0
press        0
wnd_dir      0
wnd_spd      0
snow         0
rain         0
dtype: int64

## Pytorch custom dataset class

In [179]:
class PollutionData(Dataset):
    def __init__(self, csv_dir, seq_len, test=False):
        self.df = pd.read_csv(csv_dir)

        if test:
            self.X = self.df.drop(['pollution'], axis = 1)
        else:
            self.X = self.df.drop(['date', 'pollution'], axis = 1)
        
        self.y = self.df.pollution
        self.seq_len = seq_len

        mapping = {'NE': 1, 'NW': 2, 'SE': 3, 'cv': 4}

        self.X['wnd_dir'] = self.X.wnd_dir.map(mapping)

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):

        self.X = pd.concat([self.X, self.y], axis=1)
        
        if idx >= self.seq_len - 1:
            sample = self.X.iloc[(idx-self.seq_len+1):idx].values       
        else:
            sample = self.X.iloc[:idx].values
            padding = np.zeros((self.seq_len + 1 - idx, sample.shape[1]))
            sample = np.concatenate([padding, sample], axis = 0)
        
        return sample, self.y[idx]

In [181]:
class LSTMModel(nn.Module):
    def __init__(self, in_size, h_size, num_l):
        super().__init__()
        self.input_size = in_size
        self.hidden_size = h_size
        self.num_layers = num_l

        self.lstm = nn.LSTM(input_size = self.input_size, hidden_size = self.hidden_size, num_layers = self.num_layers)
        self.linear1 = nn.Linear(in_features = self.hidden_size, out_features = 1)
    
    def forward(self, X):
        X = self.lstm(X)
        X = self.linear1(X)

        return X

In [176]:
train_df = PollutionData(csv_dir="data/LSTM-Multivariate_pollution.csv", seq_len=5)
test_df = PollutionData(csv_dir="data/pollution_test_data1.csv", seq_len=5, test = True)

In [177]:
train_df[2]

(array([[   0.  ,    0.  ,    0.  ,    0.  ,    0.  ,    0.  ,    0.  ,
            0.  ],
        [   0.  ,    0.  ,    0.  ,    0.  ,    0.  ,    0.  ,    0.  ,
            0.  ],
        [   0.  ,    0.  ,    0.  ,    0.  ,    0.  ,    0.  ,    0.  ,
            0.  ],
        [   0.  ,    0.  ,    0.  ,    0.  ,    0.  ,    0.  ,    0.  ,
            0.  ],
        [ -16.  ,   -4.  , 1020.  ,    3.  ,    1.79,    0.  ,    0.  ,
          129.  ],
        [ -15.  ,   -4.  , 1020.  ,    3.  ,    2.68,    0.  ,    0.  ,
          148.  ]]),
 159.0)