In [1]:
import sys
sys.path.append('../')

In [2]:
import pandas as pd
import numpy as np
from constants import *
import os
from sklearn.ensemble import RandomTreesEmbedding, RandomForestRegressor
from utils import eval_stat
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.model_selection import train_test_split
from eda_utils import *
import dateutil.parser
import impyute
import time
import random

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# os.listdir returns entries in arbitrary, platform-dependent order; sorting makes the
# region order (and therefore the row order of every frame built from it) reproducible.
dir_list = sorted(os.listdir(root_data))
dir_list

['East_Champaran_data',
 'Gopalganj_data',
 'Sheohar_data',
 'Sitamarhi_data',
 'Vaishali_data',
 'West_Champaran_data']

In [4]:
class InvalidImputerException(Exception):
    """Signals an unsupported imputer type.

    The only imputer types considered valid are KNN, Mean and Iterative.
    """

In [5]:
def read(filename):
    """Load the first two sheets of an Excel workbook.

    Parameters
    ----------
    filename : path-like or file-like
        Workbook to open; forwarded unchanged to ``pd.ExcelFile``.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(df_pm25, df_latlong)`` -- the workbook's first and second sheet, in
        that order. Presumably the first holds the sensor readings and the
        second the sensor coordinates; confirm against the data files.

    Raises
    ------
    IndexError
        If the workbook has fewer than two sheets.
    """
    # The context manager closes the workbook handle even when parsing fails;
    # the original left the file open.
    with pd.ExcelFile(filename) as xls:
        first_sheet, second_sheet = xls.sheet_names[0], xls.sheet_names[1]
        df_pm25 = pd.read_excel(xls, first_sheet)
        df_latlong = pd.read_excel(xls, second_sheet)
    return df_pm25, df_latlong

In [6]:
def create_dataframe(region_name, df_pm25, df_latlong):
    """Flatten one region's wide sensor sheet into a list of per-reading records.

    Parameters
    ----------
    region_name : str
        Stored verbatim under the ``'Region'`` key of every record.
    df_pm25 : pandas.DataFrame
        Wide frame with a ``'dt_time'`` column. Starting at column position 1,
        columns are consumed in groups of three: the first column of a group
        supplies ``'PM25'`` and its name is the key looked up in ``df_latlong``;
        the next two columns become the first two ``'Meteo'`` entries.
    df_latlong : pandas.DataFrame
        Its first column is ignored. For every other column, the first row is
        used as latitude and the second row as longitude.

    Returns
    -------
    list of dict
        One dict per (row, known sensor column) with keys ``'Timestamp'``,
        ``'Region'``, ``'Latitude'``, ``'Longitude'``, ``'Meteo'``
        (``[col j+1, col j+2, latitude, longitude]``) and ``'PM25'``. Groups
        whose leading column has no entry in ``df_latlong`` are skipped.

    Raises
    ------
    KeyError
        If ``df_pm25`` has no ``'dt_time'`` column.
    IndexError
        If a known sensor column is not followed by two more columns.
    """
    # Positional access keeps the function independent of the frames' index
    # labels (the original label lookups only worked with a default RangeIndex).
    lat_long_dict = {
        col: [df_latlong[col].iloc[0], df_latlong[col].iloc[1]]
        for col in df_latlong.columns[1:]
    }

    columns = df_pm25.columns
    # Which column groups have coordinates does not depend on the row, so it is
    # resolved once here instead of once per row.
    sensors = [
        (j, lat_long_dict[columns[j]])
        for j in range(1, len(columns), 3)
        if columns[j] in lat_long_dict
    ]

    timestamps = df_pm25['dt_time']
    data = []

    for i in range(len(df_pm25)):
        timestamp = timestamps.iloc[i]
        for j, (latitude, longitude) in sensors:
            data.append({
                'Timestamp': timestamp,
                'Region': region_name,
                'Latitude': latitude,
                'Longitude': longitude,
                'Meteo': [df_pm25.iat[i, j + 1], df_pm25.iat[i, j + 2],
                          latitude, longitude],
                'PM25': df_pm25.iat[i, j],
            })

    return data

In [7]:
# Region folders that this notebook skips.
EXCLUDED_REGIONS = {'Sheohar_data', 'Sitamarhi_data', 'Vaishali_data'}

data = []

# `region_dir` rather than `dir`, so the builtin dir() is not shadowed for the rest of the session.
for region_dir in dir_list:
    if region_dir in EXCLUDED_REGIONS:
        continue
    region_path = os.path.join(root_data, region_dir)
    # Stray files next to the region folders (e.g. OS metadata) would make os.listdir raise.
    if not os.path.isdir(region_path):
        continue
    print(region_dir)
    # Sorted so the record order does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(region_path)):
        if 'sensor_data' in filename:
            df_pm25, df_latlong = read(os.path.join(region_path, filename))
            data.extend(create_dataframe(region_dir, df_pm25, df_latlong))

East_Champaran_data
Gopalganj_data
West_Champaran_data


In [8]:
# Target dtypes for DataFrame.astype. The unit is spelled out because pandas >= 2.0
# refuses to cast to the unit-less np.datetime64 dtype.
cols = {'Timestamp': 'datetime64[ns]'}

In [9]:
# Build the frame from the collected records and apply the dtype mapping in one chain.
df = pd.DataFrame(data).astype(cols)

In [10]:
# Inspect the assembled frame: one row per (timestamp, sensor column) record built by create_dataframe.
df

Unnamed: 0,Timestamp,Region,Latitude,Longitude,Meteo,PM25
0,2023-05-05 14:00:00,East_Champaran_data,26.672,85.157,"[nan, nan, 26.672, 85.157]",
1,2023-05-05 14:00:00,East_Champaran_data,26.837,85.067,"[nan, nan, 26.837, 85.067]",
2,2023-05-05 14:00:00,East_Champaran_data,26.558,85.083,"[nan, nan, 26.558, 85.083]",
3,2023-05-05 14:00:00,East_Champaran_data,26.530,85.196,"[nan, nan, 26.53, 85.196]",
4,2023-05-05 14:00:00,East_Champaran_data,26.552,84.933,"[nan, nan, 26.552, 84.933]",
...,...,...,...,...,...,...
62259,2023-06-21 12:00:00,West_Champaran_data,26.994,84.410,"[38.2, 48.6, 26.994, 84.41]",31.0
62260,2023-06-21 12:00:00,West_Champaran_data,27.133,84.060,"[35.0, 59.5, 27.133, 84.06]",26.0
62261,2023-06-21 12:00:00,West_Champaran_data,27.060,84.023,"[35.6, 54.8, 27.06, 84.023]",17.0
62262,2023-06-21 12:00:00,West_Champaran_data,26.907,84.136,"[36.2, 50.5, 26.907, 84.136]",15.0


In [11]:
# Sanity check: the last two Meteo entries must repeat the row's Latitude/Longitude.
# Iterating over the columns directly avoids rebinding `data` (the record list that
# df was built from) and is much cheaper than materialising a Series per row.
for latitude, longitude, meteo in zip(df['Latitude'], df['Longitude'], df['Meteo']):
    assert latitude == meteo[-2], "Latitude doesn't match"
    assert longitude == meteo[-1], "Longitude doesn't match"

In [12]:
# Count missing values per column.
df.isna().sum()

Timestamp        0
Region           0
Latitude         0
Longitude        0
Meteo            0
PM25         12383
dtype: int64

In [13]:
# Non-null counts per region; a PM25 count below the other columns means missing readings in that region.
df.groupby(['Region']).count()

Unnamed: 0_level_0,Timestamp,Latitude,Longitude,Meteo,PM25
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
East_Champaran_data,27072,27072,27072,27072,20133
Gopalganj_data,20538,20538,20538,20538,16471
West_Champaran_data,14654,14654,14654,14654,13277


In [14]:
# Confirm the column dtypes after the astype cast (Meteo holds Python lists, hence object).
df.dtypes

Timestamp    datetime64[ns]
Region               object
Latitude            float64
Longitude           float64
Meteo                object
PM25                float64
dtype: object

## Region Wise Stats (random split)

In [17]:
# region_wise_stat is not defined in this notebook (presumably star-imported from
# eda_utils -- verify). It is handed its own copy of df rather than the frame itself.
stat_df = pd.DataFrame(region_wise_stat(df.copy(deep=True), drop_nan=True))
stat_df

Unnamed: 0,Region,Train_RMSE,Train_Pearson_R,Test_RMSE,Test_Pearson_R
0,East_Champaran_data,22.983977,0.862865,30.55691,0.749165
1,Gopalganj_data,22.022782,0.89892,32.112518,0.815076
2,West_Champaran_data,24.041354,0.854471,31.820413,0.739409


## Region Wise Stats (lat long split)

In [18]:
# Same helper call as the random-split cell, with lat_long_split=True.
# NOTE(review): presumably this holds out whole sensor locations instead of random rows -- confirm in the helper.
stat_df = pd.DataFrame(region_wise_stat(df.copy(deep=True), drop_nan=True, lat_long_split=True))
stat_df

Unnamed: 0,Region,Train_RMSE,Train_Pearson_R,Test_RMSE,Test_Pearson_R
0,East_Champaran_data,23.936072,0.866903,30.462783,0.639842
1,Gopalganj_data,24.514307,0.891329,41.220988,0.753932
2,West_Champaran_data,28.04011,0.83367,29.480427,0.596313


# RNN Framework

In [None]:
from utils import *
from rnn import RNN
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [None]:
# Imputation strategy handed to impute() in the imputation cell below.
method = 'knn'
# Filled by the imputation cell below with one dict per imputed row.
time_data = []

In [None]:
grp = df.groupby('Region')

# Start from an empty list so that re-running this cell does not append a second
# copy of every region's rows to time_data.
time_data = []

for name, group in grp:
    # Row layout handed to the imputer: [epoch seconds, *Meteo, PM25].
    grp_data = []

    for stamp, meteo, pm25 in zip(group['Timestamp'], group['Meteo'], group['PM25']):
        # NOTE(review): the string round-trip produces a naive datetime, so
        # .timestamp() interprets it in the machine's local time zone. Kept
        # unchanged so the resulting values stay the same.
        date = dateutil.parser.parse(stamp.strftime('%Y-%m-%d %X'))
        grp_data.append([date.timestamp(), *meteo, pm25])

    grp_data = np.array(grp_data)
    # impute() is not defined in this notebook (presumably star-imported from utils -- verify).
    imputed_data = impute(grp_data, method=method)

    # Unpack each imputed row back into named fields; Meteo ends with
    # latitude and longitude, hence the -3 / -2 offsets.
    for imputed_row in imputed_data:
        time_data.append({
            'Timestamp': imputed_row[0],
            'Latitude': imputed_row[-3],
            'Longitude': imputed_row[-2],
            'Meteo': imputed_row[1:-1],
            'PM25': imputed_row[-1],
        })

In [None]:
# Collect the imputed rows from every region into a single frame.
time_df = pd.DataFrame(time_data)
time_df

In [None]:
# Store the result under its own name: assigning it to `station_indexing` would
# overwrite the helper function, so this cell would fail the second time it is run.
station_index = station_indexing(time_df)
data = create_timeseries_data(time_df, station_index)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Selected device: {device}')

In [None]:
# Training configuration for the RNN section.
BATCH_SIZE = 1  # batch_size given to the DataLoader
LEARNING_RATE = 1e-4  # lr given to the AdamW optimizer
INPUT_DIM = len(data[0][0]['Meteo'])  # length of the 'Meteo' entry at data[0][0]
HIDDEN_DIM = 10  # forwarded to the RNN constructor
LAYER_DIM = 1  # forwarded to the RNN constructor
NUM_EPOCHS = 30  # number of passes of the training loop over the loader
TYPE = 'LSTM'  # forwarded to the RNN constructor
BIDIRECTIONAL = True  # forwarded to the RNN constructor

In [None]:
# Build the model from the hyperparameters above (RNN comes from the project's rnn module)
# and move it to the selected device.
model = RNN(TYPE, INPUT_DIM, LAYER_DIM, HIDDEN_DIM, BIDIRECTIONAL, device)
model.to(device)

In [None]:
# MSE criterion; the training loop takes its square root, so the optimised quantity is RMSE.
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# TimeSeriesDataset is not defined in this notebook (presumably star-imported from utils -- verify).
dataset = TimeSeriesDataset(data=data)
loader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
# The training loop appends one loss value per epoch to this list.
train_losses = []

In [None]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    # Accumulate the per-batch RMSE so the value logged for the epoch is the mean over
    # all batches, not whatever the last (shuffled) batch happened to produce.
    epoch_loss, num_batches = 0.0, 0

    for X, y in loader:
        X, y = X.type(torch.float32), y.type(torch.float32)
        X, y = X.to(device), y.to(device)

        y_hat = model(X)
        # Presumably drops a trailing singleton axis so y_hat matches y -- confirm against RNN's output.
        y_hat = y_hat.squeeze(2)

        # RMSE objective; arguments follow the (prediction, target) convention.
        train_loss = torch.sqrt(criterion(y_hat, y))
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        epoch_loss += train_loss.item()
        num_batches += 1

    # An empty loader records NaN instead of raising NameError on an undefined train_loss.
    train_losses.append(epoch_loss / num_batches if num_batches else float('nan'))

    if (epoch+1)%5 == 0:
        print(f'Epoch: {epoch+1} | {NUM_EPOCHS} \t Train Loss: {train_losses[-1]:.4f} \t\
              Time taken: {(time.time()-start_time)/60:.2f} mins')

# XGBoost

In [23]:
# region_wise_stat_xgboost is not defined in this notebook (presumably star-imported from
# eda_utils -- verify). It is handed its own copy of df rather than the frame itself.
stat_df = pd.DataFrame(region_wise_stat_xgboost(df.copy(deep=True), drop_nan=True))
stat_df

Unnamed: 0,Region,Train_RMSE,Train_Pearson_R,Test_RMSE,Test_Pearson_R
0,East_Champaran_data,12.176655,0.937062,27.922323,0.878341
1,Gopalganj_data,12.372637,0.959304,22.777086,0.910517
2,West_Champaran_data,11.225264,0.937827,28.950167,0.860086


# XGBoost (Lat Long split)

In [24]:
# Same helper call as the previous XGBoost cell, with lat_long_split=True.
# NOTE(review): presumably this holds out whole sensor locations instead of random rows -- confirm in the helper.
stat_df = pd.DataFrame(region_wise_stat_xgboost(df.copy(deep=True), lat_long_split=True, drop_nan=True))
stat_df

Unnamed: 0,Region,Train_RMSE,Train_Pearson_R,Test_RMSE,Test_Pearson_R
0,East_Champaran_data,13.275788,0.930743,30.954658,0.758559
1,Gopalganj_data,11.13785,0.961407,31.002945,0.883527
2,West_Champaran_data,11.323953,0.94452,27.493831,0.78921


## Comparison between different imputation methods (XGBoost)

In [29]:
# Imputation strategies to compare, each evaluated with lat_long_split=True.
methods = ['knn', 'mean', 'iterative']

# NOTE(review): drop_nan=True is passed together with an imputation method;
# confirm in the helper which of the two actually takes effect.
for method in methods:
    stat_df = pd.DataFrame(
        region_wise_stat_xgboost(
            df.copy(deep=True),
            method=method,
            lat_long_split=True,
            drop_nan=True,
        )
    )
    print(f'{method} ->\n{stat_df}\n')

knn ->
                Region  Train_RMSE  Train_Pearson_R  Test_RMSE  Test_Pearson_R
0  East_Champaran_data   13.772074         0.934199  30.545550        0.819352
1       Gopalganj_data   11.029887         0.962536  30.055302        0.865333
2  West_Champaran_data   11.692090         0.945268  28.873359        0.713565

mean ->
                Region  Train_RMSE  Train_Pearson_R  Test_RMSE  Test_Pearson_R
0  East_Champaran_data   14.058468         0.934432  28.524244        0.747981
1       Gopalganj_data    9.310026         0.969016  31.908014        0.854678
2  West_Champaran_data   12.796801         0.928651  28.838875        0.786296

iterative ->
                Region  Train_RMSE  Train_Pearson_R  Test_RMSE  Test_Pearson_R
0  East_Champaran_data   11.727901         0.944860  30.764285        0.735061
1       Gopalganj_data   13.181589         0.954377  22.308682        0.929529
2  West_Champaran_data    9.133890         0.951985  43.909119        0.705334



## Comparison between different imputation methods (RT RF)

In [30]:
# Imputation strategies to compare, each evaluated with lat_long_split=True.
methods = ['knn', 'mean', 'iterative']

# NOTE(review): drop_nan=True is passed together with an imputation method;
# confirm in the helper which of the two actually takes effect.
for method in methods:
    stat_df = pd.DataFrame(
        region_wise_stat(
            df.copy(deep=True),
            method=method,
            lat_long_split=True,
            drop_nan=True,
        )
    )
    print(f'{method} ->\n{stat_df}\n')

knn ->
                Region  Train_RMSE  Train_Pearson_R  Test_RMSE  Test_Pearson_R
0  East_Champaran_data   23.297102         0.863648  33.693879        0.649882
1       Gopalganj_data   21.976894         0.896714  42.212212        0.692839
2  West_Champaran_data   24.796727         0.852689  32.331552        0.662770

mean ->
                Region  Train_RMSE  Train_Pearson_R  Test_RMSE  Test_Pearson_R
0  East_Champaran_data   24.032744         0.836817  35.426646        0.628723
1       Gopalganj_data   22.823778         0.898750  43.087513        0.734462
2  West_Champaran_data   27.897564         0.845297  26.999185        0.653882

iterative ->
                Region  Train_RMSE  Train_Pearson_R  Test_RMSE  Test_Pearson_R
0  East_Champaran_data   25.965812         0.860293  27.264527        0.616792
1       Gopalganj_data   22.695508         0.893984  38.443309        0.773636
2  West_Champaran_data   22.681374         0.847762  37.853471        0.632867

