In [1]:
import sys
# Add the parent directory (resolved relative to the current working directory) to the
# module search path; presumably this is where the project-local modules imported in the
# next cell live — TODO confirm.
sys.path.append('..')

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation and dtype warnings from pandas/numpy; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import time
import faiss
import xarray as xr
from geopy.distance import geodesic
import geopandas as gpd
from sklearn.preprocessing import StandardScaler
from shapely.geometry import Point
from sklearn.impute import SimpleImputer, KNNImputer
from MeteoDataset import MeteoDataset
from scipy.spatial import distance
from load_meteo_data import create_meteo_dataframe
from sklearn.neighbors import BallTree
from scipy.spatial.distance import cdist
from pathlib import Path
from constants import *
from load_era5_data import create_era5_dataframe
from eda_utils import impute_data
import arrow

In [3]:
# Sensor CSV exports handed to create_meteo_dataframe. The integer attached to each file
# is passed through as-is; its meaning is not visible in this notebook
# (NOTE(review): possibly a header/skip-row count — confirm in load_meteo_data).
files = {
    'Bihar_536_Sensor_Data_Sep_2023_Screened.csv': 7,
    'Bihar_536_Sensor_Data_Oct_2023_Screened.csv': 7,
    'Bihar_536_Sensor_Data_Nov_2023_Screened.csv': 6,
    'Bihar_536_Sensor_Data_Jan_2024_Screened.csv': 6,
    'Bihar_536_Sensor_Data_Dec_2023_Screened.csv': 6,
    'Bihar_512_Sensor_Data_May_Aug_Screened_Hourly.csv': 6,
}

# All paths hang off `data_bihar`, which is not defined in this notebook
# (presumably supplied by `from constants import *` — verify).

# --- ERA5 / reanalysis inputs ---
nc_files = [
    f'{data_bihar}/era5_may_dec_2023.nc',
    f'{data_bihar}/era5_jan_2024.nc',
]
pbl_file = f'{data_bihar}/PBLH_may_Dec_2023.nc'
other_params_file = f'{data_bihar}/Era5_data_May_Dec_2023.nc'

# --- Location / geometry inputs ---
locations_fp = f'{data_bihar}/bihar_locations.txt'
geojson_file = f'{data_bihar}/bihar.json'

# --- Cached intermediate dataframes (pickles) ---
output_meteo_file = f'{data_bihar}/bihar_meteo_may_jan.pkl'
output_meteo_era5_file = f'{data_bihar}/bihar_meteo_era5_may_jan.pkl'
output_meteo_era5_imputed_file = f'{data_bihar}/bihar_meteo_era5_may_jan_iterative_imputed.pkl'

# --- Other datasets and the final array output ---
pm25_clustering_file = f'{data_bihar}/bihar_clustering_pm25.csv'
knowair_fp = f'{data_bihar}/KnowAir.npy'
bihar_npy_fp = f'{data_bihar}/bihar_may_jan.npy'

# Region geometry loaded from the GeoJSON file.
region = gpd.read_file(geojson_file)

# Time window passed to create_meteo_dataframe.
start_date = pd.Timestamp('2023-05-01 00:00:00')
end_date = pd.Timestamp('2024-02-01 00:00:00')

In [4]:
# Use the pickled sensor dataframe when it already exists on disk; otherwise build it.
# (create_meteo_dataframe receives the output path, so it presumably writes the cache
# itself — verify in load_meteo_data.)
meteo_df = (
    pd.read_pickle(output_meteo_file)
    if Path(output_meteo_file).is_file()
    else create_meteo_dataframe(files, output_meteo_file, start_date, end_date)
)

In [5]:
# Unpack the bounding box of the region geometry. geopandas reports total_bounds as
# (minx, miny, maxx, maxy), i.e. longitude first, then latitude.
min_lon, min_lat, max_lon, max_lat = region.total_bounds
# Display the four bounds as the cell output.
min_lon, min_lat, max_lon, max_lat

(83.320238, 24.286327, 88.29954611201047, 27.521347)

In [6]:
# Show the raw bounds array (same values as the unpacked tuple in the previous cell).
region.total_bounds

array([83.320238  , 24.286327  , 88.29954611, 27.521347  ])

In [7]:
# Use the pickled sensor+ERA5 dataframe when it already exists on disk; otherwise build
# it from the NetCDF files and the sensor dataframe. (create_era5_dataframe receives the
# output path, so it presumably writes the cache itself — verify in load_era5_data.)
era5_df = (
    pd.read_pickle(output_meteo_era5_file)
    if Path(output_meteo_era5_file).is_file()
    else create_era5_dataframe(nc_files, meteo_df, output_meteo_era5_file)
)

In [4]:
# Load the imputed dataframe from its pickle cache when available; otherwise run the
# imputation on era5_df (impute_data receives the output path, so it presumably writes
# the cache itself — verify in eda_utils).
if Path(output_meteo_era5_imputed_file).is_file():
    imputed_df = pd.read_pickle(output_meteo_era5_imputed_file)
else:
    imputed_df = impute_data(era5_df, output_meteo_era5_imputed_file, method='iterative')

# Remove leading/trailing whitespace from the name columns. The vectorized .str accessor
# replaces a per-row Python lambda and leaves missing values as NaN instead of raising
# AttributeError on them. Stripping is idempotent, so re-running this cell is safe.
imputed_df['block'] = imputed_df['block'].str.strip()
imputed_df['district'] = imputed_df['district'].str.strip()

In [5]:
# Display the imputed dataframe (pandas truncates the rendering to the first and last rows).
imputed_df

Unnamed: 0,timestamp,block,district,latitude,longitude,rh,temp,blh,u10,v10,kx,sp,tp,pm25
0,2023-05-01 00:00:00,PATEDHI BELSAR,VAISHALI,25.98455,85.231468,71.577310,35.100352,65.396729,0.008765,-0.914944,33.510487,99355.101562,-3.725290e-09,95.937754
1,2023-05-01 00:00:00,CHAUSA.M,MADHEPURA,25.52800,87.034000,71.577310,35.167571,65.396729,0.008765,-0.914944,33.510487,99355.101562,-3.725290e-09,95.937754
2,2023-05-01 00:00:00,DAGRAUA,PURNIA,25.81000,87.624000,71.577310,35.289755,67.408936,-0.866215,-0.313103,34.728436,100046.187500,3.321469e-05,95.937754
3,2023-05-01 00:00:00,PIRPAINTI,BHAGALPUR,25.29800,87.427000,71.577310,35.090638,61.006836,-0.211810,0.803941,13.025658,66545.390625,7.031001e-04,95.937754
4,2023-05-01 00:00:00,TANKUPPA,GAYA,24.70800,85.142000,36.481667,34.331667,67.408936,-0.866215,-0.313103,34.728436,100046.187500,3.321469e-05,12.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3384859,2024-01-31 23:00:00,SAKRA,MUZAFFARPUR,25.97600,85.561000,79.267332,29.237977,145.891846,-0.865313,0.271649,5.954479,66728.546875,1.022334e-03,154.026599
3384860,2024-01-31 23:00:00,NABINAGAR,AURANGABAD,24.61000,84.147000,76.516667,19.441667,64.378418,-1.515559,1.448338,16.053062,100102.570312,-2.710505e-20,298.600000
3384861,2024-01-31 23:00:00,SAMELI,KATIHAR,25.49000,87.270000,71.577310,29.237977,41.515869,-1.912057,-0.936211,6.145271,100910.531250,-2.710505e-20,165.302997
3384862,2024-01-31 23:00:00,RAJAUN,BANKA,25.00000,86.989000,78.300230,29.237977,27.135986,-1.166061,0.342019,26.191948,97159.726562,-2.710505e-20,126.886332


In [32]:
# imputed_df.count()

timestamp    3384864
block        3384864
district     3384864
latitude     3384864
longitude    3384864
rh           3384864
temp         3384864
blh          3384864
u10          3384864
v10          3384864
kx           3384864
sp           3384864
tp           3384864
pm25         3384864
dtype: int64

In [10]:
# df_g = imputed_df.groupby(['latitude', 'longitude'])

# for loc, group in df_g:
#     print(loc, group.shape)

In [11]:
# era5_df.count()

index        3384864
timestamp    3384864
block        3384864
district     3384864
latitude     3384864
longitude    3384864
rh           2040291
temp         1907280
blh          3384864
u10          3384864
v10          3384864
kx           3384864
sp           3384864
tp           3384864
pm25         2045544
dtype: int64

In [12]:
# pm25_df = pd.read_csv(pm25_clustering_file)
# pm25_df

In [13]:
# df_g = pm25_df.groupby(['latitude', 'longitude'])

# for loc, group in df_g:
#     print(loc, group.shape)

In [14]:
# (rows, columns) of the imputed dataframe.
imputed_df.shape

(3384864, 14)

In [15]:
# imputed_df[imputed_df['pm25'] > imputed_df['pm25'].max()-10]

In [22]:
# Mean and standard deviation of the imputed PM2.5 column
# (pandas' Series.std uses ddof=1 by default, i.e. the sample standard deviation).
mu = imputed_df['pm25'].mean()
sigma = imputed_df['pm25'].std()
mu, sigma

(105.63217922057937, 98.01558999664988)

In [3]:
# Load the KnowAir array from its .npy file to inspect its layout.
d = np.load(knowair_fp)
# Display the array's dimensions.
d.shape

(11688, 184, 18)

In [4]:
# Inspect the vector stored at the first position of the first two axes of the KnowAir
# array (a single tuple index avoids creating the intermediate d[0] view).
d[0, 0]

array([ 1.82038193e+00, -1.31661860e+00,  2.53094174e+02,  2.66073469e+02,
        3.84515500e+01, -4.27048668e+01,  2.09199109e+01,  1.92479888e+01,
        4.33315233e-04,  1.02294539e+05,  2.63243316e+02,  2.64921798e+02,
        0.00000000e+00,  7.02799650e+00, -4.34368872e+00,  6.31098542e-01,
       -6.50030234e-05,  0.00000000e+00])

In [7]:
# locs_df = imputed_df.groupby(['longitude', 'latitude'])
# ts = list(imputed_df['timestamp'].unique())

# locations = []
# for locs, grp in locs_df:
#     locations.append((grp['block'].iloc[0], grp['district'].iloc[0], locs[0], locs[1]))

# loc_df = pd.DataFrame(data=locations)
# loc_df.to_csv(locations_fp, sep='|', header=False, index=True)

In [8]:
# print(len(locations), len(ts))
# assert imputed_df.shape[0] == len(locations) * len(ts)

In [9]:
# df_grp = imputed_df.groupby(['block', 'district', 'longitude', 'latitude'])

# for loc, grp in df_grp:
#     print(grp.shape)

In [26]:
# Read the pipe-separated locations file; it has no header row, so columns are labelled
# 0..N by position. Judging by the commented-out export code earlier in this notebook the
# layout is presumably: 0=index, 1=block, 2=district, 3=longitude, 4=latitude — TODO confirm.
locations = pd.read_csv(locations_fp, sep='|', header=None)

# locs = list(locations.apply(lambda x : (x[3], x[4]), axis=1))
# repeated_locs = set()

# for l in locs:
#     if l in repeated_locs:
#         print(l)
#     else:
#         repeated_locs.add(l)

In [28]:
# One group per distinct (longitude, latitude) pair in the imputed data.
locs_grouped = imputed_df.groupby(['longitude', 'latitude'])

# Map each coordinate pair (columns 3 and 4 of the locations file) to the identifier in
# column 0, which is later used as the position along the location axis of the output
# array. Built column-wise with zip instead of iterating rows with iterrows().
locs_to_index_dict = dict(zip(zip(locations[3], locations[4]), locations[0]))

# T: number of distinct timestamps
# L: number of rows in the locations file
# F: number of columns of imputed_df remaining after the first five columns
T = len(imputed_df['timestamp'].unique())
L = locations.shape[0]
F = imputed_df.shape[-1] - 5
T, L, F

(6624, 511, 9)

In [29]:
# Allocate the (T, L, F) output array, zero-initialised; any location that is never
# filled in later would therefore remain all zeros.
bihar_npy_data = np.zeros((T, L, F))
bihar_npy_data.shape

(6624, 511, 9)

In [30]:
# Fill the (T, L, F) array one location at a time. Each group's rows are ordered by
# timestamp and every column from position 5 onwards is copied in a single vectorized
# assignment, rather than one .iloc row lookup per (timestamp, location) pair.
for loc, group in locs_grouped:
    group = group.sort_values(by='timestamp')
    loc_idx = locs_to_index_dict[loc]

    features = group.iloc[:, 5:].to_numpy(dtype=float)

    # Every location must contribute exactly one row per timestamp; fail loudly instead
    # of silently truncating extra rows or hitting an opaque IndexError on missing ones.
    if features.shape != (T, F):
        raise ValueError(
            f'location {loc} has feature block of shape {features.shape}, expected {(T, F)}'
        )

    bihar_npy_data[:, loc_idx, :] = features

# Persist the assembled array in NumPy's .npy format.
with open(bihar_npy_fp, 'wb') as f:
    np.save(f, bihar_npy_data)