In [1]:
import sys
sys.path.append('../')

In [19]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from constants import *
from map_utils import *
from eda_utils import *
from scipy.interpolate import griddata
import pickle as pkl
import datetime

### Load the Bihar GeoJSON file (for topology) and the modified CSV file, which contains at least 300 non-NaN values for each timestamp

In [3]:
# Source CSV: the modified Bihar data file inside the `data_bihar` directory.
data_file = f'{data_bihar}/Bihar_419_data_modified.csv'

# Bihar GeoJSON, used for the region's topology.
bihar = gpd.read_file(r'../data/Bihar/bihar.json')

In [4]:
# Columns the rest of the notebook relies on, in the order they must appear
# (later cells index the NumPy form of this frame by column position).
expected_columns = ['timestamp', 'latitude', 'longitude', 'rh', 'temp', 'pm25']

# Read the CSV directly. The previous version appended it to an empty frame, but
# `DataFrame.append` was removed in pandas 2.0, and the dict passed to `columns=`
# only contributed its keys (the dtypes listed in it were never applied).
raw_df = pd.read_csv(data_file, parse_dates=['timestamp'])

# Keep the expected columns first, followed by any extra CSV columns; this is the
# same column layout the old append produced.
extra_columns = [col for col in raw_df.columns if col not in expected_columns]
df = raw_df.reindex(columns=expected_columns + extra_columns)

In [5]:
# Inspect the column dtypes of the loaded frame.
df.dtypes

timestamp    datetime64[ns]
latitude            float64
longitude           float64
rh                  float64
temp                float64
pm25                float64
dtype: object

In [6]:
# Replace the timestamp column with its float representation so the whole
# frame is numeric before it is converted to a NumPy array.
timestamp_as_float = df['timestamp'].values.astype(float)
df['timestamp'] = timestamp_as_float

In [7]:
# Impute missing values on the NumPy form of the frame using the 'iterative' option.
frame_values = df.to_numpy()
impute_data = impute(frame_values, 'iterative')

In [8]:
file_path = f'{model_dir}/bihar_xgb_iterative.pkl'

# NOTE(review): unpickling can execute arbitrary code from the file; only load
# model files that come from a trusted source.
# The context manager closes the file handle, which the bare `open(...)` call
# previously left open.
with open(file_path, "rb") as model_file:
    model = pkl.load(model_file)

# Predict from every column of the imputed matrix except the last one.
preds = model.predict(impute_data[:, :-1])

In [9]:
# Write the imputed values (third- and second-to-last columns of the imputed
# matrix) and the model predictions back into the frame.
df['rh'] = impute_data[:, -3]
df['temp'] = impute_data[:, -2]
df['pm25'] = preds

# Count the remaining missing values per column.
df.isna().sum()

timestamp    0
latitude     0
longitude    0
rh           0
temp         0
pm25         0
dtype: int64

### The `data` variable initialized below is a list in which each item is a dictionary for a particular timestamp (the cell is currently commented out)

In [10]:
# data = []

# for i in range(0, len(df), 491):
#     row = {}
#     row['lat'] = impute_data[i:i+490, 1]
#     row['long'] = impute_data[i:i+490, 2]
#     row['pm25'] = preds[i:i+490]
#     data.append(row)

In [11]:
# Independent copy of the frame whose timestamp column is converted back from
# nanosecond values to datetimes; `df` itself is left untouched.
df_ts = df.assign(timestamp=pd.to_datetime(df['timestamp'], unit='ns'))
df_ts

Unnamed: 0,timestamp,latitude,longitude,rh,temp,pm25
0,2023-07-21 17:00:00,26.148,87.457,98.000000,30.000000,5.226232
1,2023-07-21 17:00:00,26.359,87.443,80.500000,31.500000,11.814528
2,2023-07-21 17:00:00,26.407,87.549,99.000000,34.000000,6.900817
3,2023-07-21 17:00:00,26.338,87.107,58.180485,34.013976,8.195849
4,2023-07-21 17:00:00,26.089,87.152,80.500000,30.000000,7.211899
...,...,...,...,...,...,...
265626,2023-08-16 12:00:00,26.994,84.410,50.600000,40.500000,33.636784
265627,2023-08-16 12:00:00,27.133,84.060,60.500000,37.500000,29.982538
265628,2023-08-16 12:00:00,26.907,84.136,62.600000,38.600000,29.238592
265629,2023-08-16 12:00:00,27.060,84.023,60.800000,36.200000,32.201900


In [14]:
# Latitude/longitude bounds returned by coordinate_bounds for the Bihar geometry.
bihar_bounds = coordinate_bounds(bihar)
min_lat, max_lat, min_long, max_long = bihar_bounds
print(min_lat, max_lat, min_long, max_long)

24.286327 27.521347 83.320238 88.29954611201047


#### `data_ts_dict` is a dictionary mapping each timestamp to the DataFrame of rows recorded at that timestamp (columns: timestamp, latitude, longitude, rh, temp, pm25)

In [45]:
# Map every distinct timestamp to the sub-frame of rows that share it.
data_ts_dict = {}
for ts_key, ts_frame in df_ts.groupby('timestamp'):
    data_ts_dict[ts_key] = ts_frame
# data_ts_dict

In [13]:
# Look up the rows stored for a single timestamp to inspect the mapping.
data_ts_dict[pd.Timestamp('2023-07-21 18:00:00')]

Unnamed: 0,timestamp,latitude,longitude,rh,temp,pm25
491,2023-07-21 18:00:00,26.148,87.457,98.000000,30.000000,5.226232
492,2023-07-21 18:00:00,26.359,87.443,78.500000,32.000000,10.911686
493,2023-07-21 18:00:00,26.407,87.549,99.000000,33.000000,5.608342
494,2023-07-21 18:00:00,26.338,87.107,58.235216,34.008843,8.195849
495,2023-07-21 18:00:00,26.089,87.152,81.500000,30.500000,11.047097
...,...,...,...,...,...,...
977,2023-07-21 18:00:00,26.994,84.410,54.700000,36.440000,17.983072
978,2023-07-21 18:00:00,27.133,84.060,75.620000,32.450000,50.170486
979,2023-07-21 18:00:00,26.907,84.136,65.740000,34.330000,25.758116
980,2023-07-21 18:00:00,27.060,84.023,60.530000,34.740000,35.868958


#### `data_dt_dict` is a dictionary mapping each date to a list of single-entry dictionaries `{timestamp: DataFrame}`, one for every `data_ts_dict` entry whose timestamp falls on that date

In [40]:
# Group the per-timestamp frames by calendar date. Each date maps to a list of
# single-entry dicts of the form {timestamp: frame}.
data_dt_dict = {}

for ts_key, ts_frame in data_ts_dict.items():
    day = ts_key.date()
    # setdefault creates the list the first time a date is seen.
    data_dt_dict.setdefault(day, []).append({ts_key: ts_frame})

In [41]:
# data_dt_dict

In [42]:
# Number of timestamp entries stored for 2023-08-16.
len(data_dt_dict[datetime.date(2023, 8, 16)])

13

In [43]:
# DATES[0].strftime('%Y-%m-%d')

### Get Mask for points within Bihar region

In [44]:
pm25_values = []

# Evenly spaced longitude/latitude axes spanning the Bihar bounding box.
long_axis = np.linspace(min_long, max_long, GRID_SIZE)
lat_axis = np.linspace(min_lat, max_lat, GRID_SIZE)
grid_long, grid_lat = np.meshgrid(long_axis, lat_axis)

# Mask computed by get_indices from the grid and the Bihar geometry; later
# cells use it to index the grids before plotting.
mask = get_indices(grid_long, grid_lat, bihar)

#### Creating Average plots for each day

In [53]:
# For every date: interpolate each timestamp's PM2.5 values onto the grid,
# average the resulting grids over the day's timestamps, and plot the masked mean.
for date, ts_dict in data_dt_dict.items():
    date_label = date.strftime('%Y-%m-%d')
    tot_values = np.zeros((GRID_SIZE, GRID_SIZE))
    for data in ts_dict:
        # Each entry of ts_dict is a single-item dict {timestamp: DataFrame}.
        for ts, val in data.items():
            grid_values = griddata((val['latitude'], val['longitude']), val['pm25'], (grid_lat, grid_long), method='nearest')
            tot_values = np.add(tot_values, grid_values)

    # Average over the number of timestamps for this date. The previous divisor,
    # len(data), was the size of the last single-entry dict (always 1), so the
    # plotted values were daily sums rather than daily means.
    tot_values = tot_values / len(ts_dict)
    create_plot(grid_long[mask], grid_lat[mask], tot_values[mask], bihar, f'map_orig_{date_label}', 'jpg')

#### EWMA parameters and calculation for each timestep, and creating the plots according to it

In [52]:
# ALPHA = 0.9
# ewma = pm25_values[0]
# ewma.shape

In [55]:
# for i, pm25 in enumerate(pm25_values):
#     ewma = np.add(ALPHA * pm25, (1 - ALPHA) * ewma)
        
# create_plot(grid_long[mask], grid_lat[mask], ewma[mask], bihar, f'map_orig_ewma', 'jpg')

## LCN

In [56]:
# For every date: interpolate each timestamp's PM2.5 values onto the grid, pass
# the grid through LCN, average the results over the day's timestamps, and plot
# the masked mean.
for date, ts_dict in data_dt_dict.items():
    date_label = date.strftime('%Y-%m-%d')
    tot_values = np.zeros((GRID_SIZE, GRID_SIZE))
    for data in ts_dict:
        # Each entry of ts_dict is a single-item dict {timestamp: DataFrame}.
        for ts, val in data.items():
            grid_values = griddata((val['latitude'], val['longitude']), val['pm25'], (grid_lat, grid_long), method='nearest')
            smoothed_values = LCN(grid_long, grid_lat, grid_values)
            tot_values = np.add(tot_values, smoothed_values)

    # Average over the number of timestamps for this date. The previous divisor,
    # len(data), was the size of the last single-entry dict (always 1), so the
    # plotted values were daily sums rather than daily means.
    tot_values = tot_values / len(ts_dict)
    create_plot(grid_long[mask], grid_lat[mask], tot_values[mask], bihar, f'map_LCN_{date_label}', 'jpg')

In [None]:
# ewma = LCN(grid_long, grid_lat, pm25_values[0])

In [54]:
# for i, pm25 in enumerate(pm25_values):
#     smoothed_values = LCN(grid_long, grid_lat, pm25)
#     ewma = np.add(ALPHA * smoothed_values, (1 - ALPHA) * ewma)

# create_plot(grid_long[mask], grid_lat[mask], ewma[mask], bihar, f'map_LCN_ewma', 'jpg')

## Get Resolution for each point in the Image

The favorable points (the grid points that fall inside the Bihar region) are spread evenly over the whole region, and the total area of Bihar is 94,163 $km^{2}$ (publicly available data). The resolution of each image point (i.e. the area covered by each point) is therefore the total area divided by the number of favorable points; the square root of that value gives the side length of the square covered by each point.

In [None]:
# Square root of (area of Bihar in km^2 / number of points selected by the mask).
bihar_area_km2 = 94163
favorable_points = np.sum(mask)
print(np.sqrt(bihar_area_km2 / favorable_points))

Hence, each point covers an area of approximately $529.56 \; m \times 529.56 \; m$.