In [13]:
import os
import math
import yaml
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import lightgbm
import warnings
warnings.filterwarnings('ignore')

In [14]:
dir = os.path.abspath('.')
config_fp = f'{dir}/config.yaml'

with open(config_fp, 'r') as f:
    config = yaml.safe_load(f)

In [15]:
proj_dir = config['filepath']['proj_dir']
data_dir = config['filepath']['data_dir']
loc_fp = data_dir + config['filepath']['loc_fp']
bihar_pkl_fp = data_dir + config['filepath']['bihar_pkl_fp']

In [16]:
bihar_locs = []

with open(loc_fp, 'r') as f:
    for line in f:
        data = line.strip().split('|')
        bihar_locs.append((float(data[-2]), float(data[-1])))

bihar_locs = set(bihar_locs)

In [17]:
files = {'JJAS_2023.txt': [[2023, 6, 1], [2023, 10, 1]],
         'ON_2023.txt': [[2023, 10, 1], [2023, 12, 1]],
         'DJF_2024.txt': [[2023, 12, 1], [2024, 3, 1]],
         'MAM_2024.txt': [[2024, 3, 1], [2024, 5, 1]]}

columns = ['timestamp', 'longitude', 'latitude', 'ventilation_coefficient']

In [18]:
def time_arr(dates):
    curr_date = datetime(*dates[0])
    end_date = datetime(*dates[1])
    
    ts = []

    while curr_date < end_date:
        ts.append(curr_date)
        curr_date = curr_date + timedelta(hours=1)
    
    return ts

In [19]:
df_list = []

for file, dates in files.items():
    ts = time_arr(dates)

    cnt = 0

    with open(f'{data_dir}/airshed/{file}') as f:

        for line in f:
            line = line.strip().split(',')
            lon, lat = float(line[0]), float(line[1])

            if math.isnan(lon) or math.isnan(lat): continue
            if (lon, lat) not in bihar_locs: continue

            cnt += 1
            
            lon_arr, lat_arr = [lon] * len(ts), [lat] * len(ts)

            values = [float(line[i]) for i in range(2, len(line))]

            assert len(ts) == len(lon_arr) == len(lat_arr) == len(values)

            data = {'timestamp': ts,
                    'longitude': lon_arr,
                    'latitude': lat_arr,
                    'ventilation_coeff': values}
            

            df_list.append(pd.DataFrame(data=data))
    
    print(cnt)

df = pd.concat(df_list, ignore_index=True)
df

511
511
511
511


Unnamed: 0,timestamp,longitude,latitude,ventilation_coeff
0,2023-06-01 00:00:00,83.903,25.392,218.301986
1,2023-06-01 01:00:00,83.903,25.392,712.042706
2,2023-06-01 02:00:00,83.903,25.392,2580.330809
3,2023-06-01 03:00:00,83.903,25.392,3882.309261
4,2023-06-01 04:00:00,83.903,25.392,6393.469082
...,...,...,...,...
4108435,2024-04-30 19:00:00,85.797,25.770,13107.508807
4108436,2024-04-30 20:00:00,85.797,25.770,
4108437,2024-04-30 21:00:00,85.797,25.770,11710.304271
4108438,2024-04-30 22:00:00,85.797,25.770,


In [24]:
pm25_files = ['Bihar_Feb_2024_Screened.csv', 'Bihar_Mar_2024_Screened.csv', 'Bihar_Apr_2024_Screened.csv']

df_list = []

for fp in pm25_files:
    df_pm25 = pd.read_csv(f'{data_dir}/{fp}')
    ts = [datetime.strptime(date, "%Y-%m-%d %H:%M:%S") for date in df_pm25.iloc[6:, 0]]

    cnt = 0
    fp_locs = set()

    for col in df_pm25.columns:
        if col[0] != 'P': continue
        loc = (float(df_pm25.loc[3, col]), float(df_pm25.loc[2, col]))
        fp_locs.add(loc)
        if loc not in bihar_locs: continue

        cnt += 1
        
        data = {
            'timestamp': ts,
            'longitude': [float(df_pm25.loc[3, col])] * len(ts),
            'latitude': [float(df_pm25.loc[2, col])] * len(ts),
            'pm25': [float(pm25) for pm25 in df_pm25.loc[6:, col]]
        }

        t_df = pd.DataFrame(data=data)
        t_df['timestamp'] = t_df['timestamp'].dt.floor('H')
        numeric_cols = [col for col in t_df.columns if t_df[col].dtype == float]

        t_df = t_df.groupby('timestamp')[numeric_cols].mean()
        t_df = t_df.reset_index()

        df_list.append(t_df)
    
    cnt_1 = 0
    for loc in bihar_locs:
        if loc in fp_locs: continue
        cnt_1 += 1

        data = {
            'timestamp': ts,
            'longitude': [loc[0]] * len(ts),
            'latitude': [loc[1]] * len(ts),
            'pm25': [float('nan')] * len(ts)
        }

        df_list.append(pd.DataFrame(data=data))

    print(cnt, cnt_1)

df_pm25 = pd.concat(df_list)
df_pm25

511 0
511 0
509 2


Unnamed: 0,timestamp,longitude,latitude,pm25
0,2024-02-01 00:00:00,87.457,26.148,
1,2024-02-01 01:00:00,87.457,26.148,
2,2024-02-01 02:00:00,87.457,26.148,
3,2024-02-01 03:00:00,87.457,26.148,
4,2024-02-01 04:00:00,87.457,26.148,
...,...,...,...,...
715,2024-04-30 19:00:00,84.465,27.098,
716,2024-04-30 20:00:00,84.465,27.098,
717,2024-04-30 21:00:00,84.465,27.098,
718,2024-04-30 22:00:00,84.465,27.098,


In [25]:
df_pm25.count()

timestamp    1103760
longitude    1103760
latitude     1103760
pm25          701469
dtype: int64

In [26]:
df_pm25_prev = pd.read_pickle(bihar_pkl_fp)
df_pm25_prev = df_pm25_prev[['timestamp', 'longitude', 'latitude', 'pm25']]

df_pm25_f = pd.concat([df_pm25, df_pm25_prev])
df_pm25_f = df_pm25_f[df_pm25_f['timestamp'] > pd.Timestamp('2023-05-31 23:00:00')]
df_pm25_f

Unnamed: 0,timestamp,longitude,latitude,pm25
0,2024-02-01 00:00:00,87.457,26.148,
1,2024-02-01 01:00:00,87.457,26.148,
2,2024-02-01 02:00:00,87.457,26.148,
3,2024-02-01 03:00:00,87.457,26.148,
4,2024-02-01 04:00:00,87.457,26.148,
...,...,...,...,...
3384859,2024-01-31 23:00:00,85.561,25.976,154.026599
3384860,2024-01-31 23:00:00,84.147,24.610,298.600000
3384861,2024-01-31 23:00:00,87.270,25.490,165.302997
3384862,2024-01-31 23:00:00,86.989,25.000,126.886332


In [33]:
df_f = pd.merge(df, df_pm25_f, on=['timestamp', 'longitude', 'latitude'], how='left')
df_f = df_f.sort_values(by='timestamp')
df_f

Unnamed: 0,timestamp,longitude,latitude,ventilation_coeff,pm25
0,2023-06-01 00:00:00,83.903,25.392,218.301986,95.937754
1030656,2023-06-01 00:00:00,88.172,26.316,131.792091,95.937754
1130208,2023-06-01 00:00:00,84.663,26.170,245.459916,20.833333
761280,2023-06-01 00:00:00,83.811,24.553,2397.666969,23.701190
453840,2023-06-01 00:00:00,85.363,25.213,754.990785,95.937754
...,...,...,...,...,...
3963503,2024-04-30 23:00:00,86.400,25.641,10260.580147,
3546263,2024-04-30 23:00:00,84.302,25.590,9853.231298,39.500000
3964967,2024-04-30 23:00:00,85.085,24.578,16229.221784,20.930000
3446711,2024-04-30 23:00:00,87.393,25.821,9937.008842,9.250000


In [34]:
df_f.count()

timestamp            4108440
longitude            4108440
latitude             4108440
ventilation_coeff    3366468
pm25                 3706149
dtype: int64

In [35]:
df_f.to_csv(f'{data_dir}/airshed/bihar_june_apr.csv', index=False)