In [1]:
import sys
sys.path.append('data/')
import time
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from constants import *
from utils import *

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Original Data Analysis (without any additional parameters)

In [3]:
# Baseline dataset. NOTE: pickle files can execute arbitrary code when loaded,
# so only read files from a trusted source.
data_file = f'{data_bihar}/bihar_512_sensor_data_imputed.pkl'
df = pd.read_pickle(data_file)

# Columns handed to train_test_split for the baseline runs.
# pm25 is listed last -- presumably the prediction target; confirm in utils.
cols = [
    'timestamp', 'latitude', 'longitude',
    'rh', 'temp',
    'pm25',
]

In [4]:
# Accumulator for the result of each train_XGBoost run, in execution order.
stats = list()

## lat_long split

In [5]:
# lat_long split on the currently loaded dataset: run once on the original
# values and once with normalize=True, appending each result to `stats`.
for banner, use_norm in (
    ('******\t\t Performance on original \t\t******', False),
    ('\n\n******\t\t Performance on normalized \t\t******', True),
):
    print(banner)
    X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(
        df, cols, split_ratio=[0.4, 0.1, 0.5], split_type='lat_long', normalize=use_norm
    )
    run_metrics = train_XGBoost(X_train, y_train, X_val, y_val, X_test, y_test)
    stats.append(run_metrics)
    print(run_metrics)

******		 Performance on original 		******
{'Train_RMSE': 36.45631852464035, 'Train_Pearson_R': 0.9054327827962774, 'Val_RMSE': 59.400774952222605, 'Val_Pearson_R': 0.8309238997768421, 'Test_RMSE': 57.61797507552839, 'Test_Pearson_R': 0.8418961828804797}


******		 Performance on normalized 		******
{'Train_RMSE': 35.495487014769154, 'Train_Pearson_R': 0.9017676497788707, 'Val_RMSE': 62.943166222565395, 'Val_Pearson_R': 0.8043000131234359, 'Test_RMSE': 58.52429240088282, 'Test_Pearson_R': 0.8426461701852536}


## timestamp split

In [6]:
# timestamp split on the currently loaded dataset: run once on the original
# values and once with normalize=True, appending each result to `stats`.
for banner, use_norm in (
    ('******\t\t Performance on original \t\t******', False),
    ('\n\n******\t\t Performance on normalized \t\t******', True),
):
    print(banner)
    X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(
        df, cols, split_ratio=[0.4, 0.1, 0.5], split_type='timestamp', normalize=use_norm
    )
    run_metrics = train_XGBoost(X_train, y_train, X_val, y_val, X_test, y_test)
    stats.append(run_metrics)
    print(run_metrics)

******		 Performance on original 		******
{'Train_RMSE': 14.031789359262207, 'Train_Pearson_R': 0.7943986827577331, 'Val_RMSE': 39.280714970709404, 'Val_Pearson_R': 0.1884204956592611, 'Test_RMSE': 148.23358190062777, 'Test_Pearson_R': 0.013873666472861814}


******		 Performance on normalized 		******
{'Train_RMSE': 14.129193595477801, 'Train_Pearson_R': 0.7940620849345265, 'Val_RMSE': 40.38057875837224, 'Val_Pearson_R': 0.16376241085769452, 'Test_RMSE': 144.45298413795717, 'Test_Pearson_R': 0.059641600092728876}


# Analysis with new parameters (blh, u10, v10, kx, sp, tp)

In [7]:
# Extended dataset. NOTE: pickle files can execute arbitrary code when loaded,
# so only read files from a trusted source.
data_file = f'{data_bihar}/bihar_512_sensor_era5_image_imputed.pkl'
df = pd.read_pickle(data_file)

# Same columns as the baseline runs plus six additional parameters.
# pm25 is listed last -- presumably the prediction target; confirm in utils.
cols = [
    'timestamp', 'latitude', 'longitude',
    'rh', 'temp',
    'blh', 'u10', 'v10', 'kx', 'sp', 'tp',
    'pm25',
]

## lat_long split

In [8]:
# lat_long split on the currently loaded dataset: run once on the original
# values and once with normalize=True, appending each result to `stats`.
for banner, use_norm in (
    ('******\t\t Performance on original \t\t******', False),
    ('\n\n******\t\t Performance on normalized \t\t******', True),
):
    print(banner)
    X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(
        df, cols, split_ratio=[0.4, 0.1, 0.5], split_type='lat_long', normalize=use_norm
    )
    run_metrics = train_XGBoost(X_train, y_train, X_val, y_val, X_test, y_test)
    stats.append(run_metrics)
    print(run_metrics)

# Optional variant kept for reference (presumably model_name saves the fitted
# model under that name -- confirm in utils.train_XGBoost):
# train_XGBoost(X_train, y_train, X_val, y_val, X_test, y_test, model_name='bihar_xgb_iterative_lat_long')

******		 Performance on original 		******
{'Train_RMSE': 10.750272395249572, 'Train_Pearson_R': 0.9701495796407584, 'Val_RMSE': 15.575408290600079, 'Val_Pearson_R': 0.9407868540248423, 'Test_RMSE': 15.981114012234265, 'Test_Pearson_R': 0.9429902032375664}


******		 Performance on normalized 		******
{'Train_RMSE': 10.644265034034742, 'Train_Pearson_R': 0.9693241808126227, 'Val_RMSE': 16.50455189401751, 'Val_Pearson_R': 0.9264216244649265, 'Test_RMSE': 15.704175206652273, 'Test_Pearson_R': 0.9404419198099584}


## timestamp split

In [9]:
# timestamp split on the currently loaded dataset: run once on the original
# values and once with normalize=True, appending each result to `stats`.
for banner, use_norm in (
    ('******\t\t Performance on original \t\t******', False),
    ('\n\n******\t\t Performance on normalized \t\t******', True),
):
    print(banner)
    X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(
        df, cols, split_ratio=[0.4, 0.1, 0.5], split_type='timestamp', normalize=use_norm
    )
    run_metrics = train_XGBoost(X_train, y_train, X_val, y_val, X_test, y_test)
    stats.append(run_metrics)
    print(run_metrics)

******		 Performance on original 		******
{'Train_RMSE': 8.620144182322491, 'Train_Pearson_R': 0.8978269409360554, 'Val_RMSE': 14.84423147616561, 'Val_Pearson_R': 0.8366638501851577, 'Test_RMSE': 77.65170352414937, 'Test_Pearson_R': 0.774643216682175}


******		 Performance on normalized 		******
{'Train_RMSE': 8.489885034730154, 'Train_Pearson_R': 0.8993080901516067, 'Val_RMSE': 14.687889658135905, 'Val_Pearson_R': 0.8412207108762247, 'Test_RMSE': 78.98216334916847, 'Test_Pearson_R': 0.7498279258575047}


In [11]:
# Gather the collected results (one row per run, in the order the runs were
# executed) into a single table, rounded to 4 decimals for display.
#
# The table gets its own name instead of rebinding `df`: the split cells above
# read `df` as the dataset frame, so overwriting it here would make any re-run
# of those cells pass this metrics table to train_test_split.
stats_df = pd.DataFrame(data=stats).round(decimals=4)
stats_df

Unnamed: 0,Train_RMSE,Train_Pearson_R,Val_RMSE,Val_Pearson_R,Test_RMSE,Test_Pearson_R
0,36.4563,0.9054,59.4008,0.8309,57.618,0.8419
1,35.4955,0.9018,62.9432,0.8043,58.5243,0.8426
2,14.0318,0.7944,39.2807,0.1884,148.2336,0.0139
3,14.1292,0.7941,40.3806,0.1638,144.453,0.0596
4,10.7503,0.9701,15.5754,0.9408,15.9811,0.943
5,10.6443,0.9693,16.5046,0.9264,15.7042,0.9404
6,8.6201,0.8978,14.8442,0.8367,77.6517,0.7746
7,8.4899,0.8993,14.6879,0.8412,78.9822,0.7498
