In [1]:
import sys
# Add the parent directory (relative to the working directory) to the module
# search path; presumably this is where the project packages imported by the
# next cell (`constants`, `data`) live — confirm.
sys.path.append('..')

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation / chained-assignment notices raised by the
# processing code below.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import time
# NOTE(review): wildcard import. The cells below rely on `METEO_COLUMNS_DICT`
# and `data_bihar`, which are not defined in this notebook and presumably come
# from here — consider importing them by name.
from constants import *
from data.MeteoDataset import MeteoDataset

In [3]:
# df = pd.read_pickle(f'{data_bihar}/bihar_meteo_may_dec.pkl')
# df['pm25'].min(), df['pm25'].max()

In [4]:
# df = pd.read_pickle(f'{data_bihar}/bihar_meteo_era5_may_dec.pkl')
# df['pm25'].min(), df['pm25'].max()

In [9]:
def data_proc(df, ts, start_index):
    """Reshape one wide sensor sheet into a long, hourly-averaged frame.

    Every column of ``df`` except the first is treated as one sensor series.
    Rows 0-3 of a column hold the station metadata and rows from
    ``start_index`` onwards hold the readings. Columns that share the same
    coordinates (rows 2 and 3) are merged into a single ``MeteoDataset``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet as loaded from CSV. It is only read; unlike earlier versions
        of this function the metadata rows are no longer overwritten in place.
    ts : datetime-like index/array
        Timestamps of the readings, aligned with ``df`` rows from
        ``start_index`` onwards. Passed unchanged to ``MeteoDataset``.
    start_index : int
        Positional row offset of the first reading in every column.

    Returns
    -------
    pandas.DataFrame
        One row per location and hour with the ``METEO_COLUMNS_DICT`` columns;
        ``rh``, ``temp`` and ``pm25`` are the means of the readings in that hour.
    """
    # Empty, typed template: listed first in the final concat so it fixes the
    # column order of the result.
    p_df = pd.DataFrame(columns=list(METEO_COLUMNS_DICT)).astype(METEO_COLUMNS_DICT)

    locs = {}
    for col in df.columns[1:]:  # the first column is skipped (the caller reads timestamps from it)
        series = df[col]

        # Feature selector derived from the first letter of the column name:
        # 'P' -> 1, 'T' -> 2, anything else -> 3.
        # NOTE(review): presumably PM2.5 / temperature / relative humidity —
        # confirm against MeteoDataset.set_features.
        option = 1 if col[0] == 'P' else (2 if col[0] == 'T' else 3)
        values = np.array(series.iloc[start_index:], dtype=np.float64)

        # Read the coordinates into locals instead of writing them back with
        # chained assignment (`df[col][2] = ...`), which mutates the caller's
        # frame and is a silent no-op under pandas copy-on-write.
        lat, lon = float(series.iloc[2]), float(series.iloc[3])
        loc = (lat, lon)

        if loc not in locs:
            # Positional metadata rows 1 and 0 are passed as the 4th and 5th
            # arguments — presumably block and district; verify against the
            # MeteoDataset constructor.
            locs[loc] = MeteoDataset(ts, lat, lon, series.iloc[1], series.iloc[0])
        locs[loc].set_features(values, option)

    numeric_cols = ['rh', 'temp', 'pm25']
    frames = [p_df]

    for obj in locs.values():
        t_df = pd.DataFrame({'timestamp': obj.timestamp})
        t_df['rh'] = obj.rh
        t_df['temp'] = obj.temp
        t_df['pm25'] = obj.pm25

        # 15 min updates -> hourly updates: truncate each timestamp to the hour
        # and average the readings that fall into it. 'h' replaces the
        # deprecated 'H' alias and is accepted by older pandas as well.
        t_df['timestamp'] = t_df['timestamp'].dt.floor('h')
        t_df = t_df.groupby('timestamp')[numeric_cols].mean().reset_index()

        # Station metadata is constant per location; scalars broadcast to all rows.
        t_df['block'] = obj.block
        t_df['district'] = obj.district
        t_df['latitude'] = obj.latitude
        t_df['longitude'] = obj.longitude

        frames.append(t_df)

    # Single concat instead of growing the frame inside the loop (quadratic copying).
    return pd.concat(frames, ignore_index=True)

def create_meteo_dataframe(files, start_date, end_date):
    """Build the hourly meteo frame for all locations with complete coverage.

    Each file is read from ``data_bihar``, reshaped with ``data_proc`` and the
    results are stacked. Rows are then restricted to the half-open window
    ``[start_date, end_date)`` and only locations that have exactly one row per
    hour of that window are kept.

    Parameters
    ----------
    files : dict[str, int]
        Maps a CSV file name (relative to ``data_bihar``) to the positional row
        offset at which the readings start in that file.
    start_date, end_date : pandas.Timestamp
        Bounds of the analysis window; ``end_date`` is exclusive.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``timestamp`` for every (latitude, longitude) pair whose
        row count equals the number of hours in the window.
    """
    # Typed empty template first, so the concatenated result keeps the
    # METEO_COLUMNS_DICT column order even when `files` is empty.
    frames = [pd.DataFrame(columns=list(METEO_COLUMNS_DICT)).astype(METEO_COLUMNS_DICT)]

    for f, start_index in files.items():

        start_time = time.time()

        f_df = pd.read_csv(f'{data_bihar}/{f}')
        # Only the first column is needed; going through to_numpy() keeps the
        # result a DatetimeIndex (not an index-aligned Series) without
        # converting the whole frame to an object array.
        ts = pd.to_datetime(f_df.iloc[start_index:, 0].to_numpy())

        frames.append(data_proc(f_df, ts, start_index))

        print(f'{f} processed \t time_taken: {(time.time()-start_time)/60:.2f} mins')

    # One concat for all files instead of re-copying the accumulated frame per file.
    df = pd.concat(frames, ignore_index=True).sort_values(by='timestamp')

    # Apply both bounds: the completeness check below counts hours between
    # start_date and end_date, so rows before start_date would otherwise make
    # a fully covered location look over-full and get it dropped.
    in_window = (df['timestamp'] >= start_date) & (df['timestamp'] < end_date)
    df = df[in_window]

    # Exact number of hours in the window (`.days * 24` ignored partial days).
    duration = pd.Timedelta(end_date - start_date) // pd.Timedelta(hours=1)

    # Keep only locations with one row for every hour of the window.
    df = df.groupby(['latitude', 'longitude']).filter(lambda g: len(g) == duration)
    # df.to_pickle(file_name, protocol=4)

    return df

In [12]:
# Raw sensor exports -> row offset at which the readings start in each file
# (handed to create_meteo_dataframe as `start_index`).
files = {
    'Bihar_536_Sensor_Data_Sep_2023_Screened.csv': 7,
    'Bihar_536_Sensor_Data_Oct_2023_Screened.csv': 7,
    'Bihar_536_Sensor_Data_Nov_2023_Screened.csv': 6,
    'Bihar_536_Sensor_Data_Jan_2024_Screened.csv': 6,
    'Bihar_536_Sensor_Data_Dec_2023_Screened.csv': 6,
    'Bihar_512_Sensor_Data_May_Aug_Screened_Hourly.csv': 6,
}

# Analysis window; the end bound is exclusive inside create_meteo_dataframe.
start_date = pd.Timestamp('2023-05-01 00:00:00')
end_date = pd.Timestamp('2024-01-01 00:00:00')

df = create_meteo_dataframe(files, start_date, end_date)
df

Bihar_536_Sensor_Data_Sep_2023_Screened.csv processed 	 time_taken: 0.11 mins
Bihar_536_Sensor_Data_Oct_2023_Screened.csv processed 	 time_taken: 0.11 mins
Bihar_536_Sensor_Data_Nov_2023_Screened.csv processed 	 time_taken: 0.15 mins
Bihar_536_Sensor_Data_Jan_2024_Screened.csv processed 	 time_taken: 0.10 mins
Bihar_536_Sensor_Data_Dec_2023_Screened.csv processed 	 time_taken: 0.25 mins
Bihar_512_Sensor_Data_May_Aug_Screened_Hourly.csv processed 	 time_taken: 0.21 mins


Unnamed: 0,timestamp,district,block,latitude,longitude,rh,temp,pm25
2912952,2023-05-01 00:00:00,VAISHALI,PATEDHI BELSAR,25.98455,85.231468,,,
2951328,2023-05-01 00:00:00,GAYA,GURUA,24.66700,84.785000,83.06859,24.583846,52.224359
2328456,2023-05-01 00:00:00,NAWADA,PAKRI BARAWAN,24.95200,85.729000,,,
3119592,2023-05-01 00:00:00,BEGUSARAI,MATIHANI,25.37300,86.160000,85.33375,25.020000,47.920833
2378640,2023-05-01 00:00:00,GAYA,GURARU,24.80800,84.788000,,,
...,...,...,...,...,...,...,...,...
1840727,2023-12-31 23:00:00,BHOJPUR,JAGDISHPUR,25.45900,84.417000,74.79000,15.475641,78.900000
1647287,2023-12-31 23:00:00,AURANGABAD,GOH,24.98300,84.655000,,,
1665887,2023-12-31 23:00:00,PURNIA,JALALGARH,25.96800,87.531000,,,
1656959,2023-12-31 23:00:00,BHAGALPUR,PIRPAINTI,25.29800,87.427000,,,


In [13]:
# Observed PM2.5 range of the assembled frame (NaNs are skipped by min/max).
pm25_values = df['pm25']
pm25_values.min(), pm25_values.max()

(0.0329749103942652, 998.0)