# Air pollution GRU and LSTM model

In [253]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import MinMaxScaler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [166]:
pollution_df  = pd.read_csv("data/LSTM-Multivariate_pollution.csv")

In [114]:
pollution_df.head()

Unnamed: 0,date,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
0,2010-01-02 00:00:00,129.0,-16,-4.0,1020.0,SE,1.79,0,0
1,2010-01-02 01:00:00,148.0,-15,-4.0,1020.0,SE,2.68,0,0
2,2010-01-02 02:00:00,159.0,-11,-5.0,1021.0,SE,3.57,0,0
3,2010-01-02 03:00:00,181.0,-7,-5.0,1022.0,SE,5.36,1,0
4,2010-01-02 04:00:00,138.0,-7,-5.0,1022.0,SE,6.25,2,0


In [117]:
pollution_df.shape

(43800, 9)

In [116]:
pollution_df.dtypes

date          object
pollution    float64
dew            int64
temp         float64
press        float64
wnd_dir       object
wnd_spd      float64
snow           int64
rain           int64
dtype: object

In [121]:
pollution_df.describe()

Unnamed: 0,pollution,dew,temp,press,wnd_spd,snow,rain
count,43800.0,43800.0,43800.0,43800.0,43800.0,43800.0,43800.0
mean,94.013516,1.828516,12.459041,1016.447306,23.894307,0.052763,0.195023
std,92.252276,14.429326,12.193384,10.271411,50.022729,0.760582,1.416247
min,0.0,-40.0,-19.0,991.0,0.45,0.0,0.0
25%,24.0,-10.0,2.0,1008.0,1.79,0.0,0.0
50%,68.0,2.0,14.0,1016.0,5.37,0.0,0.0
75%,132.25,15.0,23.0,1025.0,21.91,0.0,0.0
max,994.0,28.0,42.0,1046.0,585.6,27.0,36.0


In [167]:
pollution_df.wnd_dir.unique()

array(['SE', 'cv', 'NW', 'NE'], dtype=object)

In [124]:
pollution_df.isna().sum()

date         0
pollution    0
dew          0
temp         0
press        0
wnd_dir      0
wnd_spd      0
snow         0
rain         0
dtype: int64

## Pytorch codes

### Custom dataset class

In [289]:
class PollutionData(Dataset):
    def __init__(self, csv_dir, seq_len, test=False, scaler=None, device='cpu'):
        self.df = pd.read_csv(csv_dir)
        self.test = test
        self.device = device

        if scaler:
            scale_columns = ['dew', 'temp', 'press', 'wnd_spd', 'pollution']
            self.scaler = scaler
            if test:
                self.df[scale_columns] = self.scaler.transform(self.df[scale_columns])
            else:
                self.df[scale_columns] = self.scaler.fit_transform(self.df[scale_columns])

        if self.test:
            self.X = self.df.drop(['pollution'], axis = 1)
        else:
            self.X = self.df.drop(['date', 'pollution'], axis = 1)
        
        self.y = self.df.pollution
        self.seq_len = seq_len

        mapping = {'NE': 1, 'NW': 2, 'SE': 3, 'cv': 4}

        self.X['wnd_dir'] = self.X.wnd_dir.map(mapping)

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):

        X_sample = pd.concat([self.X, self.y], axis=1)
        
        if idx >= self.seq_len - 1:
            X_sample = X_sample.iloc[(idx-self.seq_len+1):idx].values       
        else:
            X_sample = X_sample.iloc[:idx].values
            padding = np.zeros((self.seq_len - idx - 1, X_sample.shape[1]))
            X_sample = np.concatenate([padding, X_sample], axis = 0)
        
        return torch.tensor(X_sample).to(torch.float32).to(device), torch.tensor(self.y[idx]).unsqueeze(0).to(torch.float32).to(device)

In [290]:
minmax_scaler = MinMaxScaler()

In [291]:
train_df = PollutionData(csv_dir="data/LSTM-Multivariate_pollution.csv", seq_len=5, scaler=minmax_scaler)
test_df = PollutionData(csv_dir="data/pollution_test_data1.csv", seq_len=5, test = True, scaler = train_df.scaler)

In [292]:
train_loader = DataLoader(train_df, shuffle=True, batch_size=256)

### LSTM model class

In [293]:
class LSTMModel(nn.Module):
    def __init__(self, in_size, h_size, num_l, out_f):
        super().__init__()
        self.input_size = in_size
        self.hidden_size = h_size
        self.num_layers = num_l
        self.out_features = out_f

        self.lstm = nn.LSTM(input_size = self.input_size, 
                            hidden_size = self.hidden_size, 
                            num_layers = self.num_layers, 
                            batch_first = True)
        self.linear1 = nn.Linear(in_features = self.hidden_size, out_features = self.out_features)
    
    def forward(self, X):
        X, _ = self.lstm(X)
        X = self.linear1(X[:,-1,:])

        return X

### Training loop

In [294]:
torch.manual_seed(42)

model = LSTMModel(in_size = 8, h_size = 10, num_l = 2, out_f = 1).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

loss_fn = nn.MSELoss()

epochs = 2

for epoch in range(epochs):

    model.train()

    train_loss = 0.0

    for batch, (X, y) in enumerate(train_loader):
        
        y_pred = model(X)
        
        loss = loss_fn(y_pred, y)
        train_loss += loss.item()
        
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
    
    train_loss = train_loss / len(train_loader)

    print(f"Training loss: {train_loss}")

Training loss: 0.005931827737841495
Training loss: 0.000996644743620066
