In [1]:
import pickle
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
from sklearn.ensemble import RandomTreesEmbedding, RandomForestRegressor
import matplotlib.pyplot as plt
import pandas as pd
import re
from utils import *
from rnn import RNN
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("./data/final_lucknow_with_lat_long_stationwise_split_train_timestamp_station_names.pkl", "rb") as f:
    train_data = pickle.load(f)

with open("./data/final_lucknow_with_lat_long_stationwise_split_test_timestamp_station_names.pkl", "rb") as f:
    test_data = pickle.load(f)

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Selected device: {device}')

Selected device: cuda


# Dataframe with Latlong features included

In [5]:
train_df = create_dataframe(train_data, latlong=True)
test_df = create_dataframe(test_data, latlong=True)

In [6]:
cols = {'Timestamp': np.datetime64, 'Latitude': np.float32, 'Longitude': np.float32, 'PM25': np.float32}

In [12]:
train_df = train_df.astype(cols)
test_df = test_df.astype(cols)

In [13]:
# train_df.groupby(['Latitude', 'Longitude']).size()
train_df.dtypes

Timestamp    datetime64[ns]
Latitude            float32
Longitude           float32
Meteo                object
PM25                float32
dtype: object

In [14]:
station_indexing_train = station_indexing(train_df)
station_indexing_test = station_indexing(test_df)

In [15]:
data_train = create_timeseries_data(train_df, station_indexing_train)
data_test = create_timeseries_data(test_df, station_indexing_test)

### Train RNN without creating Sparse RT Embedding

In [16]:
train_dataset = TimeSeriesDataset(data=data_train)
test_dataset = TimeSeriesDataset(data=data_test)

In [17]:
BATCH_SIZE = 1
LEARNING_RATE = 1e-4
INPUT_DIM = len(data_train[0][0]['Meteo'])
HIDDEN_DIM = 64
LAYER_DIM = 1
NUM_EPOCHS = 20
TYPE = 'GRU'
BIDIRECTIONAL = True

In [18]:
model = RNN(TYPE, INPUT_DIM, LAYER_DIM, HIDDEN_DIM, BIDIRECTIONAL, device)
model.to(device)

RNN(
  (series): GRU(5, 64, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [19]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [20]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [21]:
train_losses, test_losses = [], []

In [22]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    for i, (X, y) in enumerate(train_loader):
        X, y = X.type(torch.float32), y.type(torch.float32)
        X, y = X.to(device), y.to(device)

        y_hat = model(X)
        y_hat = y_hat.squeeze(2)

        train_loss = torch.sqrt(criterion(y, y_hat))
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    train_losses.append(train_loss.item())

    for i, (X, y) in enumerate(test_loader):
        X, y = X.type(torch.float32), y.type(torch.float32)
        X, y = X.to(device), y.to(device)

        with torch.no_grad():
            y_hat = model(X)
            y_hat = y_hat.squeeze(2)

        test_loss = torch.sqrt(criterion(y, y_hat))

    test_losses.append(test_loss.item())

    if (epoch+1)%5 == 0:
        print(f'Epoch: {epoch+1} | {NUM_EPOCHS} \t Train Loss: {train_losses[-1]:.4f} \t  Test Loss: {test_losses[-1]:.4f} \t \
              Time taken: {(time.time()-start_time)/60:.2f} mins')

Epoch: 5 | 20 	 Train Loss: 58.5507 	  Test Loss: 49.7091 	               Time taken: 0.36 mins
Epoch: 10 | 20 	 Train Loss: 63.4682 	  Test Loss: 72.0951 	               Time taken: 0.67 mins
Epoch: 15 | 20 	 Train Loss: 107.5188 	  Test Loss: 51.2263 	               Time taken: 1.00 mins
Epoch: 20 | 20 	 Train Loss: 73.4270 	  Test Loss: 57.8725 	               Time taken: 1.33 mins


### Train RNN after creating Sparse RT Embedding

In [23]:
train_df

Unnamed: 0,Timestamp,Latitude,Longitude,Meteo,PM25
0,2021-12-25,26.901733,80.951874,"[15.845, 77.34499999999998, 754.71, 26.901733,...",147.089996
1,2021-12-27,26.848328,80.923332,"[15.845, 77.34499999999998, 754.71, 26.848328,...",161.360001
2,2021-12-27,26.837669,80.934502,"[15.845, 77.34499999999998, 754.71, 26.83767, ...",141.600006
3,2021-12-27,26.870195,80.904137,"[15.845, 77.34499999999998, 754.71, 26.870196,...",159.160004
4,2021-12-27,26.870195,80.904137,"[15.845, 77.34499999999998, 754.71, 26.870196,...",159.160004
...,...,...,...,...,...
6791,2022-12-12,26.833998,80.891739,"[19.02, 50.82333333333333, 753.7633333333333, ...",87.410004
6792,2022-12-12,26.833998,80.891739,"[19.02, 50.82333333333333, 753.7633333333333, ...",87.410004
6793,2022-12-12,26.833998,80.891739,"[19.02, 50.82333333333333, 753.7633333333333, ...",87.410004
6794,2022-12-14,26.833998,80.891739,"[17.33, 42.76166666666666, 754.3766666666667, ...",55.880001


In [24]:
X = []
X.extend(train_df['Meteo'].to_list())
X.extend(test_df['Meteo'].to_list())

In [25]:
X_transform = random_tree_embedding(X, n_estimators=800, max_depth=2)
train_df['Meteo'] = X_transform.tolist()[:len(train_df)]
test_df['Meteo'] = X_transform.tolist()[len(train_df):]

In [26]:
station_indexing_train = station_indexing(train_df)
station_indexing_test = station_indexing(test_df)
data_train = create_timeseries_data(train_df, station_indexing_train)
data_test = create_timeseries_data(test_df, station_indexing_test)

In [27]:
train_dataset = TimeSeriesDataset(data=data_train)
test_dataset = TimeSeriesDataset(data=data_test)

In [28]:
BATCH_SIZE = 1
LEARNING_RATE = 1e-4
INPUT_DIM = len(data_train[0][0]['Meteo'])
HIDDEN_DIM = 64
LAYER_DIM = 1
NUM_EPOCHS = 20
TYPE = 'LSTM'
BIDIRECTIONAL = False

In [29]:
model = RNN(TYPE, INPUT_DIM, LAYER_DIM, HIDDEN_DIM, BIDIRECTIONAL, device)
model.to(device)

RNN(
  (series): LSTM(3196, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)

In [30]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [31]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [32]:
train_losses, test_losses = [], []

In [33]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    for i, (X, y) in enumerate(train_loader):
        X, y = X.type(torch.float32), y.type(torch.float32)
        X, y = X.to(device), y.to(device)

        y_hat = model(X)
        y_hat = y_hat.squeeze(2)

        train_loss = torch.sqrt(criterion(y, y_hat))
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    train_losses.append(train_loss.item())

    for i, (X, y) in enumerate(test_loader):
        X, y = X.type(torch.float32), y.type(torch.float32)
        X, y = X.to(device), y.to(device)

        with torch.no_grad():
            y_hat = model(X)
            y_hat = y_hat.squeeze(2)

        test_loss = torch.sqrt(criterion(y, y_hat))

    test_losses.append(test_loss.item())

    if (epoch+1)%5 == 0:
        print(f'Epoch: {epoch+1} | {NUM_EPOCHS} \t Train Loss: {train_losses[-1]:.4f} \t  Test Loss: {test_losses[-1]:.4f} \t \
              Time taken: {(time.time()-start_time)/60:.2f} mins')

Epoch: 5 | 20 	 Train Loss: 57.3131 	  Test Loss: 62.8755 	               Time taken: 0.73 mins
Epoch: 10 | 20 	 Train Loss: 49.7904 	  Test Loss: 100.6442 	               Time taken: 1.36 mins
Epoch: 15 | 20 	 Train Loss: 65.3982 	  Test Loss: 53.4580 	               Time taken: 2.00 mins
Epoch: 20 | 20 	 Train Loss: 73.4249 	  Test Loss: 68.2473 	               Time taken: 2.63 mins


# Dataframe without latlong features

In [34]:
train_df = create_dataframe(train_data, latlong=False)
test_df = create_dataframe(test_data, latlong=False)

In [35]:
train_df

Unnamed: 0,Timestamp,Latitude,Longitude,Meteo,PM25
0,2021-12-25,26.901733,80.951876,"[15.845, 77.34499999999998, 754.71]",147.09
1,2021-12-27,26.848328,80.923331,"[15.845, 77.34499999999998, 754.71]",161.36
2,2021-12-27,26.837670,80.934498,"[15.845, 77.34499999999998, 754.71]",141.60
3,2021-12-27,26.870196,80.904134,"[15.845, 77.34499999999998, 754.71]",159.16
4,2021-12-27,26.870196,80.904134,"[15.845, 77.34499999999998, 754.71]",159.16
...,...,...,...,...,...
6791,2022-12-12,26.833997,80.891736,"[19.02, 50.82333333333333, 753.7633333333333]",87.41
6792,2022-12-12,26.833997,80.891736,"[19.02, 50.82333333333333, 753.7633333333333]",87.41
6793,2022-12-12,26.833997,80.891736,"[19.02, 50.82333333333333, 753.7633333333333]",87.41
6794,2022-12-14,26.833997,80.891736,"[17.33, 42.76166666666666, 754.3766666666667]",55.88


In [36]:
station_indexing_train = station_indexing(train_df)
station_indexing_test = station_indexing(test_df)

In [37]:
data_train = create_timeseries_data(train_df, station_indexing_train)
data_test = create_timeseries_data(test_df, station_indexing_test)

In [38]:
train_dataset = TimeSeriesDataset(data=data_train)
test_dataset = TimeSeriesDataset(data=data_test)

In [39]:
BATCH_SIZE = 1
LEARNING_RATE = 1e-4
INPUT_DIM = len(data_train[0][0]['Meteo'])
HIDDEN_DIM = 64
LAYER_DIM = 1
NUM_EPOCHS = 20
TYPE = 'LSTM'
BIDIRECTIONAL = False

In [40]:
model = RNN(TYPE, INPUT_DIM, LAYER_DIM, HIDDEN_DIM, BIDIRECTIONAL, device)
model.to(device)

RNN(
  (series): LSTM(3, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)

In [41]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [42]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [43]:
train_losses, test_losses = [], []

In [44]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    for i, (X, y) in enumerate(train_loader):
        X, y = X.type(torch.float32), y.type(torch.float32)
        X, y = X.to(device), y.to(device)

        y_hat = model(X)
        y_hat = y_hat.squeeze(2)

        train_loss = torch.sqrt(criterion(y, y_hat))
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    train_losses.append(train_loss.item())

    for i, (X, y) in enumerate(test_loader):
        X, y = X.type(torch.float32), y.type(torch.float32)
        X, y = X.to(device), y.to(device)

        with torch.no_grad():
            y_hat = model(X)
            y_hat = y_hat.squeeze(2)

        test_loss = torch.sqrt(criterion(y, y_hat))

    test_losses.append(test_loss.item())

    if (epoch+1)%5 == 0:
        print(f'Epoch: {epoch+1} | {NUM_EPOCHS} \t Train Loss: {train_losses[-1]:.4f} \t  Test Loss: {test_losses[-1]:.4f} \t \
              Time taken: {(time.time()-start_time)/60:.2f} mins')

Epoch: 5 | 20 	 Train Loss: 72.9755 	  Test Loss: 64.8247 	               Time taken: 0.32 mins
Epoch: 10 | 20 	 Train Loss: 46.2666 	  Test Loss: 74.0853 	               Time taken: 0.65 mins
Epoch: 15 | 20 	 Train Loss: 47.4359 	  Test Loss: 73.7782 	               Time taken: 0.94 mins
Epoch: 20 | 20 	 Train Loss: 45.5161 	  Test Loss: 57.4790 	               Time taken: 1.23 mins


### Train RNN after creating Sparse RT Embedding

In [45]:
X = []
X.extend(train_df['Meteo'].to_list())
X.extend(test_df['Meteo'].to_list())

In [46]:
X_transform = random_tree_embedding(X, n_estimators=800, max_depth=2)
train_df['Meteo'] = X_transform.tolist()[:len(train_df)]
test_df['Meteo'] = X_transform.tolist()[len(train_df):]

In [47]:
station_indexing_train = station_indexing(train_df)
station_indexing_test = station_indexing(test_df)
data_train = create_timeseries_data(train_df, station_indexing_train)
data_test = create_timeseries_data(test_df, station_indexing_test)

In [48]:
train_dataset = TimeSeriesDataset(data=data_train)
test_dataset = TimeSeriesDataset(data=data_test)

In [49]:
BATCH_SIZE = 1
LEARNING_RATE = 1e-4
INPUT_DIM = len(data_train[0][0]['Meteo'])
HIDDEN_DIM = 64
LAYER_DIM = 1
NUM_EPOCHS = 20
TYPE = 'LSTM'
BIDIRECTIONAL = False

In [50]:
model = RNN(TYPE, INPUT_DIM, LAYER_DIM, HIDDEN_DIM, BIDIRECTIONAL, device)
model.to(device)

RNN(
  (series): LSTM(3175, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)

In [51]:
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [52]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [53]:
train_losses, test_losses = [], []

In [54]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):

    for i, (X, y) in enumerate(train_loader):
        X, y = X.type(torch.float32), y.type(torch.float32)
        X, y = X.to(device), y.to(device)

        y_hat = model(X)
        y_hat = y_hat.squeeze(2)

        train_loss = torch.sqrt(criterion(y, y_hat))
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    train_losses.append(train_loss.item())

    for i, (X, y) in enumerate(test_loader):
        X, y = X.type(torch.float32), y.type(torch.float32)
        X, y = X.to(device), y.to(device)

        with torch.no_grad():
            y_hat = model(X)
            y_hat = y_hat.squeeze(2)

        test_loss = torch.sqrt(criterion(y, y_hat))

    test_losses.append(test_loss.item())

    if (epoch+1)%5 == 0:
        print(f'Epoch: {epoch+1} | {NUM_EPOCHS} \t Train Loss: {train_losses[-1]:.4f} \t  Test Loss: {test_losses[-1]:.4f} \t \
              Time taken: {(time.time()-start_time)/60:.2f} mins')

Epoch: 5 | 20 	 Train Loss: 75.8622 	  Test Loss: 58.3734 	               Time taken: 0.65 mins
Epoch: 10 | 20 	 Train Loss: 72.7588 	  Test Loss: 56.8021 	               Time taken: 1.29 mins
Epoch: 15 | 20 	 Train Loss: 59.7223 	  Test Loss: 68.5785 	               Time taken: 1.92 mins
Epoch: 20 | 20 	 Train Loss: 40.2820 	  Test Loss: 52.7256 	               Time taken: 2.57 mins
