In [1]:
import time
import numpy as np
import time
from pathlib import Path
import math
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
from utils import *
from constants import *
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from models.LSTM import LSTM, CustomDataset, FrobeniusNorm

import argparse
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation/runtime warnings from numpy, pandas and torch.
warnings.filterwarnings('ignore')

In [28]:
def data_processing(df, test_locs, WS, FW):
    """Build sliding-window test samples for the first test location found in ``df``.

    ``df`` is grouped by ``(latitude, longitude)``. The first group (in groupby
    iteration order) whose key is in ``test_locs`` is windowed and the loop
    then stops, so at most ONE location contributes samples.

    Column layout used by the slicing below: the first three columns are
    skipped (timestamp, latitude, longitude per the original comment), the
    last column is the target stored under the ``'pm25'`` key, and everything
    in between is treated as input features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``latitude`` and ``longitude`` columns and the layout above.
    test_locs : container of tuple
        ``(latitude, longitude)`` keys that count as test locations.
    WS : int
        Number of consecutive time steps in each sample (window size).
    FW : int
        Number of consecutive target values attached to each time step.

    Returns
    -------
    list of dict
        One dict per window: ``'meteo'`` is a float32 array of shape
        ``(WS, n_features)`` and ``'pm25'`` a float32 array of shape
        ``(WS, FW)``. Empty when no group key is in ``test_locs``.

    Raises
    ------
    ValueError
        Propagated from ``sliding_window_view`` when the selected group has
        too few rows to form a single window.
    """
    df_grouped = df.groupby(['latitude', 'longitude'])
    test_data = []

    start_time = time.time()
    print(f"---------\t Dataset processing started; FORECAST WINDOW = {FW}\t---------")

    for loc, group in df_grouped:
        if loc not in test_locs:
            continue

        print(loc)

        data = group.to_numpy()
        # Since first three columns are timestamp, latitude and longitude respectively
        X, y = data[:, 3:-1], data[:, -1]

        # Vectorized windowing.
        # Step 1: row t of y becomes the FW targets starting at t -> (M, FW).
        y = np.lib.stride_tricks.sliding_window_view(y, (FW,))
        # Drop trailing feature rows that have no complete FW-long target.
        X = X[:y.shape[0], :]
        # Step 2: slide a WS-row window over both arrays. Because the window
        # spans every column, the result has a dummy axis of length 1 at
        # position 1: (K, 1, WS, C). Index that axis explicitly instead of
        # np.squeeze, which would also drop the batch axis when K == 1, the
        # time axis when WS == 1, or the feature axis for a single feature.
        X = np.lib.stride_tricks.sliding_window_view(X, (WS, X.shape[1]))[:, 0]
        y = np.lib.stride_tricks.sliding_window_view(y, (WS, y.shape[1]))[:, 0]

        X, y = X.astype(np.float32), y.astype(np.float32)

        # Same number of windows and same window length for inputs and targets.
        assert X.shape[0] == y.shape[0] and X.shape[1] == y.shape[1]

        test_data.extend([{'meteo': X_w, 'pm25': y_w} for X_w, y_w in zip(X, y)])
        # Only the first matching location is processed.
        break

    print("---------\t Dataset processing completed \t---------")
    print(f'Time taken: {(time.time()-start_time)/60:.2f} mins')

    return test_data


In [29]:
def get_stats(test_loader, input_size, output_size, hidden_size, num_layers, FW):
    """Evaluate the saved bidirectional LSTM for forecast window ``FW`` on ``test_loader``.

    Builds an ``LSTM`` with the given sizes, loads weights from
    ``{model_dir}/BLSTM_{FW}.pth.tar`` when that file exists, runs inference
    over every batch, prints the tuple returned by ``eval_stat`` and returns
    the flattened targets and predictions.

    Parameters
    ----------
    test_loader : iterable
        Yields ``(inputs, labels)`` tensor batches.
        NOTE(review): labels are indexed as ``labels[:, -1, :]``, so they are
        presumably ``(batch, time, FW)`` - confirm against ``CustomDataset``.
    input_size, output_size, hidden_size, num_layers : int
        Forwarded to the ``LSTM`` constructor.
    FW : int
        Forecast window; selects which checkpoint file is loaded.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``y_test`` and ``y_test_pred``, each flattened to 1-D.
    """
    model = LSTM(input_size, hidden_size, num_layers, output_size, bidirectional=True)
    model.to(device)

    model_path = f'{model_dir}/BLSTM_{FW}.pth.tar'
    model_file = Path(model_path)

    if model_file.is_file():
        # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
        state = torch.load(model_path, map_location=device)
        model.load_state_dict(state['state'])
    else:
        # NOTE(review): execution continues with randomly initialised weights,
        # so the statistics printed below are meaningless in this case.
        print("No pth file found")

    # Inference mode: disables dropout and other train-only behaviour.
    model.eval()

    print(f"---------\t Stats \t---------")
    start_time = time.time()
    y_test, y_test_pred = [], []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            # Keep only the targets attached to the last time step of each window.
            labels = labels[:, -1, :]

            preds = model(inputs)
            # Restrict predictions to the [LOWER_BOUND, UPPER_BOUND] range.
            preds = torch.clamp(preds, LOWER_BOUND, UPPER_BOUND)

            y_test.extend(labels.cpu().tolist())
            y_test_pred.extend(preds.cpu().tolist())

    y_test, y_test_pred = np.array(y_test), np.array(y_test_pred)
    y_test, y_test_pred = y_test.reshape(-1), y_test_pred.reshape(-1)

    print(f"Stats (RMSE, R_squared, p_value, R_squared_pearson, p_value_pearson)")
    print(eval_stat(y_test, y_test_pred))

    print(f"---------\t Stats Completed\t Time Taken={(time.time()-start_time)/60:.2f} mins\t---------\n")

    return y_test, y_test_pred

In [30]:
# Forecast windows to evaluate (get_stats loads one BLSTM_{FW} checkpoint per
# value) and the batch size used by the test DataLoader.
BATCH_SIZE = 128
FORECAST_WINDOWS = [1, 6, 12, 24]

data_file = f'{data_bihar}/bihar_512_sensor_era5_rnn.pkl'
df = pd.read_pickle(data_file)

# Only the test locations are loaded in this notebook.
test_locs = load_locs_as_tuples(f'{data_bihar}/test_locations.txt')

# Standardize every column except the identifiers and the target, in place.
# NOTE(review): the scaler is fitted on all rows of df (every location), not
# only the test locations - confirm this matches how training data was scaled.
non_feature_cols = {'timestamp', 'latitude', 'longitude', 'pm25'}
feature_cols = [col for col in df.columns if col not in non_feature_cols]

scaler = StandardScaler()
data = scaler.fit_transform(df[feature_cols].to_numpy())
df[feature_cols] = data

In [31]:
for FW in FORECAST_WINDOWS:
    # Window the test series for this forecast window; keep batches in order.
    test_data = data_processing(df, test_locs, WS=168, FW=FW)
    test_dataset = CustomDataset(test_data)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    hidden_size, num_layers = 64, 2
    input_size, output_size = None, None

    # Peek at the first batch to read the feature and target widths.
    first_batch = next(iter(test_loader), None)
    if first_batch is not None:
        inputs, labels = first_batch
        input_shape, output_shape = inputs.shape, labels.shape
        input_size, output_size = input_shape[-1], output_shape[-1]

    y_test, y_test_pred = get_stats(test_loader, input_size, output_size, hidden_size, num_layers, FW)

    # Thin the flattened series for plotting: one point every FW*100 entries.
    sample_idx = range(0, y_test.size - FW + 1, FW * 100)
    y = [y_test[idx] for idx in sample_idx]
    y_pred = [y_test_pred[idx] for idx in sample_idx]

    fig, ax = plt.subplots()
    ax.plot(y, '-.', color='blue', label='True Value')
    ax.plot(y_pred, '-.', color='orange', label='Predicted Value')
    ax.set_xlabel('Time Series')
    ax.set_ylabel('PM2.5 Concentration')
    ax.legend()

    path = f'{plot_dir}/True_vs_Predicted_{FW}'
    fig.savefig(f'{path}.jpg', dpi=400)
    # Release the figure so repeated iterations do not accumulate open figures.
    plt.close(fig)

---------	 Dataset processing started; FORECAST WINDOW = 1	---------
(24.517, 84.668)
---------	 Dataset processing completed 	---------
Time taken: 0.02 mins
---------	 Stats 	---------
Stats (RMSE, R_squared, p_value, R_squared_pearson, p_value_pearson)
(10.093675124579034, 0.940294090697613, 0.0, 0.9832421344399213, 0.0)
---------	 Stats Completed	 Time Taken=0.01 mins	---------

---------	 Dataset processing started; FORECAST WINDOW = 6	---------
(24.517, 84.668)
---------	 Dataset processing completed 	---------
Time taken: 0.02 mins
---------	 Stats 	---------
Stats (RMSE, R_squared, p_value, R_squared_pearson, p_value_pearson)
(14.13640899632664, 0.8835842370861284, 0.0, 0.9469023173687533, 0.0)
---------	 Stats Completed	 Time Taken=0.01 mins	---------

---------	 Dataset processing started; FORECAST WINDOW = 12	---------
(24.517, 84.668)
---------	 Dataset processing completed 	---------
Time taken: 0.03 mins
---------	 Stats 	---------
Stats (RMSE, R_squared, p_value, R_squar