In [1]:
import sys
sys.path.append('data/')
import time
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from constants import *
from utils import *

In [2]:
import warnings

# Install a catch-all 'ignore' warning filter (it applies to every warning category).
# NOTE(review): a blanket filter also hides warnings that may matter
# (deprecations, dtype issues) — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

# Original Data Analysis (without any additional parameters)

In [3]:
# Baseline experiment inputs: the column list passed to train_test_split in the
# cells below, and the imputed sensor table read from under `data_bihar`.
cols = [
    'timestamp', 'latitude', 'longitude',
    'rh', 'temp',
    'pm25',
]
data_file = f'{data_bihar}/bihar_512_sensor_data_imputed.pkl'
df = pd.read_pickle(data_file)

# Cast pm25 to float, then limit it to the [LOWER_BOUND, UPPER_BOUND] range.
df['pm25'] = df['pm25'].astype(float).clip(lower=LOWER_BOUND, upper=UPPER_BOUND)

In [4]:
# Accumulator for the metrics returned by each train_XGBoost run below;
# re-running this cell resets it to empty.
stats = list()

## lat_long split

In [5]:
# Baseline columns, 'lat_long' split: run the split twice (normalize=False, then
# normalize=True), train with train_XGBoost each time, and append the returned
# metrics to `stats`.
# NOTE(review): only the first run passes load_locs=False; the second relies on
# train_test_split's default for load_locs — confirm that asymmetry is intended.
lat_long_runs = (
    ('******\t\t Performance on original \t\t******',
     {'normalize': False, 'load_locs': False}),
    ('\n\n******\t\t Performance on normalized \t\t******',
     {'normalize': True}),
)
for banner, split_kwargs in lat_long_runs:
    print(banner)
    X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(
        df, cols, split_ratio=[0.4, 0.1, 0.5], split_type='lat_long', **split_kwargs
    )
    run_metrics = train_XGBoost(X_train, y_train, X_val, y_val, X_test, y_test)
    stats.append(run_metrics)
    print(stats[-1])

******		 Performance on original 		******
{'Train_RMSE': 34.6417789927141, 'Train_Pearson_R': 0.9062731690731304, 'Val_RMSE': 59.46430601140271, 'Val_Pearson_R': 0.8095528238365539, 'Test_RMSE': 54.14890506762033, 'Test_Pearson_R': 0.8352584306438354}


******		 Performance on normalized 		******
{'Train_RMSE': 33.11580804278895, 'Train_Pearson_R': 0.9062923696478449, 'Val_RMSE': 60.82679876426093, 'Val_Pearson_R': 0.8194090040770168, 'Test_RMSE': 54.482271376405166, 'Test_Pearson_R': 0.8341910174462247}


## timestamp split

In [6]:
# Baseline columns, 'timestamp' split: one run per normalize setting
# (False first, then True). Each run trains with train_XGBoost and appends the
# returned metrics to `stats`.
for banner, use_normalized in (
    ('******\t\t Performance on original \t\t******', False),
    ('\n\n******\t\t Performance on normalized \t\t******', True),
):
    print(banner)
    X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(
        df, cols, split_ratio=[0.4, 0.1, 0.5], split_type='timestamp', normalize=use_normalized
    )
    run_metrics = train_XGBoost(X_train, y_train, X_val, y_val, X_test, y_test)
    stats.append(run_metrics)
    print(stats[-1])

******		 Performance on original 		******
{'Train_RMSE': 13.839895128298009, 'Train_Pearson_R': 0.7938019881025533, 'Val_RMSE': 40.149439162053234, 'Val_Pearson_R': 0.21073631940813978, 'Test_RMSE': 144.2630658332126, 'Test_Pearson_R': 0.016780180696473895}


******		 Performance on normalized 		******
{'Train_RMSE': 13.898842376096407, 'Train_Pearson_R': 0.7928614143650572, 'Val_RMSE': 40.62459161979755, 'Val_Pearson_R': 0.19959304233823727, 'Test_RMSE': 141.90734932110928, 'Test_Pearson_R': 0.05833593153202376}


# New Parameters Analysis (with additional ERA5 parameters)

In [7]:
# Extended experiment inputs: the baseline columns plus blh, u10, v10, kx, sp
# and tp, read from the sensor + ERA5 pickle under `data_bihar`.
cols = [
    'timestamp', 'latitude', 'longitude',
    'rh', 'temp',
    'blh', 'u10', 'v10', 'kx', 'sp', 'tp',
    'pm25',
]
data_file = f'{data_bihar}/bihar_512_sensor_era5_image_imputed.pkl'
df = pd.read_pickle(data_file)

# Cast pm25 to float before clipping, exactly as the baseline data-loading cell
# does, so both experiments train on an identically preprocessed target.
df['pm25'] = df['pm25'].astype(float)
df['pm25'] = df['pm25'].clip(LOWER_BOUND, UPPER_BOUND)

## lat_long split

In [8]:
# Extended columns, 'lat_long' split: one run per normalize setting
# (False first, then True). Each run trains with train_XGBoost and appends the
# returned metrics to `stats`.
for banner, use_normalized in (
    ('******\t\t Performance on original \t\t******', False),
    ('\n\n******\t\t Performance on normalized \t\t******', True),
):
    print(banner)
    X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(
        df, cols, split_ratio=[0.4, 0.1, 0.5], split_type='lat_long', normalize=use_normalized
    )
    run_metrics = train_XGBoost(X_train, y_train, X_val, y_val, X_test, y_test)
    stats.append(run_metrics)
    print(stats[-1])

# Disabled variant that passes model_name on the normalized split left in scope above.
# NOTE(review): presumably model_name makes train_XGBoost persist the model — confirm before enabling.
# train_XGBoost(X_train, y_train, X_val, y_val, X_test, y_test, model_name='bihar_xgb_iterative_lat_long')

******		 Performance on original 		******
{'Train_RMSE': 10.948289329955117, 'Train_Pearson_R': 0.9668287702884067, 'Val_RMSE': 15.696270242799962, 'Val_Pearson_R': 0.9440875554624639, 'Test_RMSE': 15.12868583838661, 'Test_Pearson_R': 0.9410306878525788}


******		 Performance on normalized 		******
{'Train_RMSE': 11.02514546366582, 'Train_Pearson_R': 0.9666565975575047, 'Val_RMSE': 16.77258049244069, 'Val_Pearson_R': 0.9461533496055617, 'Test_RMSE': 15.269466584658954, 'Test_Pearson_R': 0.9422396846662804}


## timestamp split

In [9]:
# Extended columns, 'timestamp' split: one run per normalize setting
# (False first, then True). Each run trains with train_XGBoost and appends the
# returned metrics to `stats`.
for banner, use_normalized in (
    ('******\t\t Performance on original \t\t******', False),
    ('\n\n******\t\t Performance on normalized \t\t******', True),
):
    print(banner)
    X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(
        df, cols, split_ratio=[0.4, 0.1, 0.5], split_type='timestamp', normalize=use_normalized
    )
    run_metrics = train_XGBoost(X_train, y_train, X_val, y_val, X_test, y_test)
    stats.append(run_metrics)
    print(stats[-1])

******		 Performance on original 		******


{'Train_RMSE': 8.428116197904476, 'Train_Pearson_R': 0.8988324958575408, 'Val_RMSE': 15.308572098030307, 'Val_Pearson_R': 0.8370906337991709, 'Test_RMSE': 89.81912022749167, 'Test_Pearson_R': 0.7641775346871197}


******		 Performance on normalized 		******
{'Train_RMSE': 8.473763041454523, 'Train_Pearson_R': 0.8987748785534025, 'Val_RMSE': 14.819578384809024, 'Val_Pearson_R': 0.836465629583798, 'Test_RMSE': 94.15923232632954, 'Test_Pearson_R': 0.7733359257106727}


In [10]:
# Gather the per-run metric dicts collected in `stats` into one table, rounded
# to 4 decimals; rows follow the order in which the runs were appended.
# The table gets its own name so the sensor DataFrame `df` is not overwritten:
# the split cells above can then be re-run after this cell without reloading.
stats_df = pd.DataFrame(data=stats).round(decimals=4)
stats_df

Unnamed: 0,Train_RMSE,Train_Pearson_R,Val_RMSE,Val_Pearson_R,Test_RMSE,Test_Pearson_R
0,34.6418,0.9063,59.4643,0.8096,54.1489,0.8353
1,33.1158,0.9063,60.8268,0.8194,54.4823,0.8342
2,13.8399,0.7938,40.1494,0.2107,144.2631,0.0168
3,13.8988,0.7929,40.6246,0.1996,141.9073,0.0583
4,10.9483,0.9668,15.6963,0.9441,15.1287,0.941
5,11.0251,0.9667,16.7726,0.9462,15.2695,0.9422
6,8.4281,0.8988,15.3086,0.8371,89.8191,0.7642
7,8.4738,0.8988,14.8196,0.8365,94.1592,0.7733
