In [2]:
import sys
sys.path.append('..')

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from constants import *
from eda_utils import *
from utils import eval_stat
import seaborn as sns
import pickle as pkl
import math
import netCDF4 as nc

In [None]:
# Screened sensor CSV exports, read in the order listed here.
files = [
    'Bihar_536_Sensor_Data_Sep_2023_Screened.csv',
    'Bihar_536_Sensor_Data_Oct_2023_Screened.csv',
    'Bihar_536_Sensor_Data_Nov_2023_Screened.csv',
    'Bihar_512_Sensor_Data_May_Aug_Screened_Hourly.csv',
]

# One DataFrame per CSV. `data_bihar` is not defined in this notebook;
# presumably it comes from the star imports above — TODO confirm.
dataset = [pd.read_csv(f'{data_bihar}/{f}') for f in files]

In [None]:
# dataset

In [None]:
# Lookup tables keyed by (latitude, longitude). For every sensor column, row
# labels 0-3 are read as district, block, latitude and longitude respectively.
lat_long_to_block = {}
lat_long_to_distr = {}

for frame in dataset:
    # The first column is skipped; only the remaining columns are sensor series.
    for col in frame.columns.values[1:]:
        lat = float(frame.loc[2, col])
        long = float(frame.loc[3, col])

        # Columns without a usable coordinate pair cannot be keyed.
        if math.isnan(lat) or math.isnan(long):
            continue

        location = (lat, long)
        # First occurrence wins; later columns at the same location are ignored.
        if location in lat_long_to_block:
            continue

        lat_long_to_block[location] = frame.loc[1, col]
        lat_long_to_distr[location] = frame.loc[0, col]

print(len(lat_long_to_block), len(lat_long_to_distr))

In [None]:
# One dict per input file:
# (latitude, longitude) -> {'timestamp', 'block', 'district', 'rh', 'temp', 'pm25'}
full_data = []

# The first character of a column label decides which series the column fills.
field_by_prefix = {'P': 'pm25', 'T': 'temp', 'R': 'rh'}

for frame in dataset:
    by_location = {}

    # Row labels 6 onward hold the readings; the first column supplies their timestamps.
    first_column = frame.columns.values[0]
    timestamps = list(frame.loc[6:, first_column])

    for col in frame.columns.values[1:]:
        lat = float(frame.loc[2, col])
        long = float(frame.loc[3, col])

        # Skip columns without a usable coordinate pair.
        if math.isnan(lat) or math.isnan(long):
            continue

        location = (lat, long)
        if location not in by_location:
            by_location[location] = {
                'timestamp': timestamps,
                'block': lat_long_to_block[location],
                'district': lat_long_to_distr[location],
                'rh': [],
                'temp': [],
                'pm25': [],
            }

        # Columns whose label starts with any other character are ignored.
        field = field_by_prefix.get(col[0])
        if field is not None:
            by_location[location][field] = frame.loc[6:, col].to_list()

    full_data.append(by_location)

In [None]:
# full_data[0]

In [None]:
# Column-oriented accumulator: one independent empty list per output column.
f_data = {
    column: []
    for column in ('timestamp', 'block', 'district', 'latitude', 'longitude', 'rh', 'temp', 'pm25')
}

In [None]:
# Flatten every (file, location) entry of `full_data` into one column-oriented
# dict of equal-length lists, ready to be handed to `pd.DataFrame`.
#
# The accumulator is rebuilt from scratch here so that re-running this cell
# does not append every row a second time.
f_data = {
    column: []
    for column in ('timestamp', 'block', 'district', 'latitude', 'longitude', 'rh', 'temp', 'pm25')
}

for per_file in full_data:
    for (lat, long), series in per_file.items():
        sz = len(series['timestamp'])

        # Each measurement series must line up with the timestamps; a location
        # missing one of its rh/temp/pm25 columns would otherwise misalign rows.
        assert sz == len(series['rh']) == len(series['temp']) == len(series['pm25']), "Improper logic"

        f_data['timestamp'].extend(series['timestamp'])
        # Per-location scalars are repeated once per timestamp.
        f_data['block'].extend([series['block']] * sz)
        f_data['district'].extend([series['district']] * sz)
        f_data['latitude'].extend([lat] * sz)
        f_data['longitude'].extend([long] * sz)
        f_data['rh'].extend(series['rh'])
        f_data['temp'].extend(series['temp'])
        f_data['pm25'].extend(series['pm25'])

In [None]:
# df_cols = ['timestamp', 'device_id', 'block', 'district', 'latitude', 'longitude', 'rh', 'temp', 'pm25']
# df_types = [np.datetime64, object, object, object, float, float, float, float, float]

### DataFrame initialization

In [None]:
# Build the long-format frame: parse timestamps (unparseable values become NaT),
# drop rows lacking a timestamp or a pm25 reading, then order chronologically.
df = (
    pd.DataFrame(data=f_data)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], errors='coerce'))
    .dropna(subset=['timestamp', 'pm25'])
    .sort_values(by='timestamp')
)
df

In [None]:
# Persist the cleaned (not yet imputed) frame.
pd.to_pickle(df, f'{data_bihar}/bihar_512_sensor_data.pkl')

In [None]:
# Reload the frame saved by the previous cell. The imputed pickle is only
# written further down, after imputation has run, so reading it here would
# fail on a fresh run or silently feed stale, already-imputed data back into
# the imputation step.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
df = pd.read_pickle(f'{data_bihar}/bihar_512_sensor_data.pkl')
df

In [None]:
# Non-null count per column, to see how many values are missing.
df.count()

In [None]:
# times = list(df['timestamp'].unique())
# unique_dates = sorted(list(set([datetime.strptime(str(ts), "%Y-%m-%d %H:%M:%S").date() for ts in times])))
# unique_dates

In [None]:
# Numeric-only view used for imputation. The timestamp column is replaced by
# the float form of its underlying values so the whole frame is numeric.
df_new = (
    df.loc[:, ['timestamp', 'latitude', 'longitude', 'rh', 'temp', 'pm25']]
    .copy(deep=True)
    .assign(timestamp=lambda frame: frame['timestamp'].values.astype(float))
)
df_new

In [None]:
# Plain NumPy array of the feature frame; columns keep the order of `df_new`.
data = df_new.to_numpy()
data.shape

In [None]:
# `impute` is not defined in this notebook; presumably it comes from one of
# the star imports (`eda_utils` / `constants`) — TODO confirm.
# The result is indexed below as a 2-D array with the same column order as `data`.
imputed_data = impute(data, method='iterative')

In [None]:
# Write the imputed humidity and temperature back onto `df`. Positions 3 and 4
# are where 'rh' and 'temp' sit in the column list used to build `df_new`;
# assumes `impute` preserves row and column order — TODO confirm.
for position, column in ((3, 'rh'), (4, 'temp')):
    df[column] = imputed_data[:, position]
df.count()

In [None]:
# Persist the frame with imputed rh/temp values.
pd.to_pickle(df, f'{data_bihar}/bihar_512_sensor_data_imputed.pkl')

In [None]:
# Train and evaluate one model per splitting strategy and print its result.
# `train_and_eval` and `model_dir` are not defined in this notebook; presumably
# they come from the star imports — TODO confirm.
splits = ['random', 'timestamp', 'lat_long']

for split in splits:
    # Bound to a dedicated name: assigning to `eval` would shadow the builtin
    # for the rest of the kernel session.
    split_metrics = train_and_eval(imputed_data, method='iterative', model_dir=model_dir, split=split, model_type='xgb')
    print(f'{split}: {split_metrics}')

In [None]:
# Reload the imputed frame from disk.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
df = pd.read_pickle(f'{data_bihar}/bihar_512_sensor_data_imputed.pkl')
df

In [None]:
# Distinct (latitude, longitude) pairs present in the frame.
location_groups = df.groupby(['latitude', 'longitude']).groups
locations = set(location_groups)
# locations

In [6]:
# Inspect the two NetCDF inputs: print the dimension sizes of the PBLH file and
# the variable names of the ERA5 file.
# The context manager closes both handles even if one of the prints raises.
with nc.Dataset(f'{data_bihar}/Era5_data_May_Dec_2023.nc', 'r') as netcdf_file, \
        nc.Dataset(f'{data_bihar}/PBLH_may_Dec_2023.nc', 'r') as pbl_file:

    dimensions = pbl_file.dimensions
    print("Dimensions:")
    for dim_name, dim_obj in dimensions.items():
        print(f"{dim_name}: {len(dim_obj)}")

    # Named `variable_names` rather than `vars` so the builtin is not shadowed.
    variable_names = list(netcdf_file.variables.keys())
    print(variable_names)

# Both handles are closed from here on; reopen the files before reading data.

Dimensions:
longitude: 361
latitude: 141
expver: 2
time: 5880
['longitude', 'latitude', 'expver', 'time', 'u10', 'v10', 't2m', 'kx', 'sp', 'tp']


In [None]:
# Show the variable names of both NetCDF files.
# NOTE(review): both handles were closed earlier in the notebook; this only
# reads the `variables` mapping's keys — confirm that still works on a closed Dataset.
for handle in (netcdf_file, pbl_file):
    print(handle.variables.keys())

In [None]:
# Read the whole 'blh' variable into memory. The `pbl_file` handle opened
# earlier in the notebook has already been closed, so the file is reopened
# here and closed again as soon as the data has been copied out.
with nc.Dataset(f'{data_bihar}/PBLH_may_Dec_2023.nc', 'r') as pbl_ds:
    blh = pbl_ds.variables['blh'][:]

# ts = netcdf_file.variables['longitude'][:]
# ts

In [None]:
# Peek at the slice at index 0 along the first two axes of the in-memory array.
blh[0, 0]