In [1]:
import os
import yaml
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.impute import KNNImputer
import warnings
# NOTE(review): this suppresses every warning for the rest of the session, so any
# pandas / scikit-learn warnings raised by the cells below will not be shown.
warnings.filterwarnings('ignore')

In [2]:
# Resolve the working directory and load the YAML configuration that sits next to it.
curr_dir = os.path.abspath('')
config_path = f'{curr_dir}/config.yaml'

with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

In [3]:
# Pull the path settings out of the loaded configuration.
filepaths = config['filepath']

proj_dir = filepaths['proj_dir']
data_dir = filepaths['data_dir']
bihar_fp = data_dir + '/airshed/' + filepaths['bihar_fp']
# NOTE(review): unlike bihar_fp, no '/' is inserted before loc_fp -- confirm the
# configured value carries its own leading separator.
bihar_loc_fp = data_dir + filepaths['loc_fp']

In [4]:
# Hourly timestamps covering [start_date, end_date): the end bound is exclusive.
start_date, end_date = [2023, 6, 1], [2024, 5, 1]

one_hour = timedelta(hours=1)
first_stamp, stop_stamp = datetime(*start_date), datetime(*end_date)

# Ceil-divide the span by one hour so a partial trailing hour would still be included,
# exactly as a "while current < end" loop would do.
n_stamps = -(-(stop_stamp - first_stamp) // one_hour)
ts = [first_stamp + offset * one_hour for offset in range(n_stamps)]

In [6]:
# Load the raw table and make the timestamp column a proper datetime dtype.
df = pd.read_csv(bihar_fp)
df['timestamp'] = df['timestamp'].astype('datetime64[ns]')

# Work on an independent copy; forward-fill ventilation_coeff within each
# (longitude, latitude) group, following the existing row order of that group.
df_knn = df.copy(deep=True)
per_location = df_knn.groupby(['longitude', 'latitude'])
df_knn['ventilation_coeff'] = per_location['ventilation_coeff'].transform(lambda series: series.ffill())

# Column order is reused later when rebuilding frames from imputed arrays.
columns = df_knn.columns.tolist()
print(df_knn.dtypes, df_knn.count())
df_knn

timestamp            datetime64[ns]
longitude                   float64
latitude                    float64
ventilation_coeff           float64
pm25                        float64
dtype: object timestamp            4108440
longitude            4108440
latitude             4108440
ventilation_coeff    4108440
pm25                 3706149
dtype: int64


Unnamed: 0,timestamp,longitude,latitude,ventilation_coeff,pm25
0,2023-06-01 00:00:00,83.903,25.392,218.301986,95.937754
1,2023-06-01 00:00:00,88.172,26.316,131.792091,95.937754
2,2023-06-01 00:00:00,84.663,26.170,245.459916,20.833333
3,2023-06-01 00:00:00,83.811,24.553,2397.666969,23.701190
4,2023-06-01 00:00:00,85.363,25.213,754.990785,95.937754
...,...,...,...,...,...
4108435,2024-04-30 23:00:00,86.400,25.641,10260.580147,
4108436,2024-04-30 23:00:00,84.302,25.590,9853.231298,39.500000
4108437,2024-04-30 23:00:00,85.085,24.578,16229.221784,20.930000
4108438,2024-04-30 23:00:00,87.393,25.821,9937.008842,9.250000


In [7]:
def KNN_impute(df):
    """Fill missing entries of ``df`` with a 3-nearest-neighbour imputer.

    The frame is converted to a NumPy array, a fresh ``KNNImputer(n_neighbors=3)``
    is fitted on it, and the transformed array is returned (not a DataFrame).
    """
    return KNNImputer(n_neighbors=3).fit_transform(df.to_numpy())

In [22]:
# Impute each hourly snapshot on its own: for every timestamp in `ts`, the rows of
# `df_knn` at that hour are passed through KNN_impute and collected in `df_list`.
df_list = []
origin = datetime(*start_date)
one_hour = pd.Timedelta(hours=1)

# Split the frame by timestamp once. Copying and boolean-masking the whole frame
# inside the loop repeats the same full-table work for every hour.
snapshots = df_knn.groupby('timestamp', sort=False)

for t in ts:
    # get_group raises KeyError if an expected hour is absent from the data.
    # .copy() gives an independent frame so the column assignment below does not
    # write into a slice of df_knn.
    df_t = snapshots.get_group(pd.Timestamp(t)).copy()
    # print(t, df_t.shape)

    # KNN_impute needs an all-numeric array: express timestamps as hours since start_date.
    df_t['timestamp'] = (df_t['timestamp'] - origin) / one_hour

    data = KNN_impute(df_t)
    df_t = pd.DataFrame(data=data, columns=columns)

    # Convert the hour offsets back to timestamps.
    df_t['timestamp'] = origin + pd.to_timedelta(df_t['timestamp'], unit='h')

    # Sanity check: the round trip must give back exactly the one timestamp being processed.
    ts_list = list(df_t['timestamp'].unique())
    assert len(ts_list) == 1 and t == ts_list[0]

    df_list.append(df_t)

In [25]:
# Stack the per-timestamp results and report dtypes / non-null counts.
df_knn_imputed = pd.concat(df_list).reset_index(drop=True)
print(df_knn_imputed.dtypes)
print(df_knn_imputed.count())

# Put rows in a canonical (timestamp, longitude, latitude) order with a fresh index.
df_knn_imputed = (
    df_knn_imputed
    .sort_values(by=['timestamp', 'longitude', 'latitude'])
    .reset_index(drop=True)
)
df_knn_imputed

timestamp            datetime64[ns]
longitude                   float64
latitude                    float64
ventilation_coeff           float64
pm25                        float64
dtype: object
timestamp            4108440
longitude            4108440
latitude             4108440
ventilation_coeff    4108440
pm25                 4108440
dtype: int64


Unnamed: 0,timestamp,longitude,latitude,ventilation_coeff,pm25
0,2023-06-01 00:00:00,83.400,25.112,132.274372,95.937754
1,2023-06-01 00:00:00,83.513,25.043,90.595377,95.937754
2,2023-06-01 00:00:00,83.539,25.206,121.315759,95.937754
3,2023-06-01 00:00:00,83.610,24.962,92.712455,95.937754
4,2023-06-01 00:00:00,83.610,25.040,59.594687,95.937754
...,...,...,...,...,...
4108435,2024-04-30 23:00:00,87.938,26.088,0.000000,69.500000
4108436,2024-04-30 23:00:00,87.941,25.747,1892.137510,45.250000
4108437,2024-04-30 23:00:00,87.952,26.105,0.000000,79.750000
4108438,2024-04-30 23:00:00,88.121,26.426,0.000000,69.140000


In [27]:
# Show the distinct timestamps present in the per-timestamp imputed frame.
df_knn_imputed['timestamp'].unique()

<DatetimeArray>
['2023-06-01 00:00:00', '2023-06-01 01:00:00', '2023-06-01 02:00:00',
 '2023-06-01 03:00:00', '2023-06-01 04:00:00', '2023-06-01 05:00:00',
 '2023-06-01 06:00:00', '2023-06-01 07:00:00', '2023-06-01 08:00:00',
 '2023-06-01 09:00:00',
 ...
 '2024-04-30 14:00:00', '2024-04-30 15:00:00', '2024-04-30 16:00:00',
 '2024-04-30 17:00:00', '2024-04-30 18:00:00', '2024-04-30 19:00:00',
 '2024-04-30 20:00:00', '2024-04-30 21:00:00', '2024-04-30 22:00:00',
 '2024-04-30 23:00:00']
Length: 8040, dtype: datetime64[ns]

In [36]:
def time_series_impute(df):
    """Impute missing values separately for each (longitude, latitude) group.

    For every location group, rows are ordered by timestamp, the timestamp is
    expressed as hours since the module-level ``start_date``, and the whole group
    is passed through ``KNNImputer(n_neighbors=3)``. The hour offsets are then
    converted back to timestamps.

    Relies on the module-level names ``start_date`` and ``ts``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'timestamp' (datetime), 'longitude' and 'latitude' columns.
        All columns are fed to the imputer as one numeric array.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the imputed groups with a fresh 0..n-1 index and the
        same column names as ``df``.

    Raises
    ------
    AssertionError
        If a group's distinct timestamps are not exactly the entries of ``ts``.
    """
    origin = datetime(*start_date)
    one_hour = pd.Timedelta(hours=1)
    df_list = []
    # cols = ['ventilation_coeff', 'pm25']

    for _, group_df in df.groupby(['longitude', 'latitude']):
        # sort_values returns a new frame, so the sub-frame handed out by groupby
        # is never mutated in place.
        g_df = group_df.sort_values(by='timestamp')
        g_df['timestamp'] = (g_df['timestamp'] - origin) / one_hour

        imputer = KNNImputer(n_neighbors=3)
        data = imputer.fit_transform(g_df.to_numpy())

        g_df = pd.DataFrame(data=data, columns=group_df.columns)
        g_df['timestamp'] = origin + pd.to_timedelta(g_df['timestamp'], unit='h')

        ts_list = sorted(g_df['timestamp'].unique())

        # zip() stops at the shorter input, so compare lengths first; otherwise a
        # location with missing or extra hours would pass the element-wise check.
        assert len(ts_list) == len(ts), \
            f'expected {len(ts)} distinct timestamps per location, got {len(ts_list)}'
        for t, t_l in zip(ts, ts_list):
            assert t == t_l

        df_list.append(g_df)

    return pd.concat(df_list).reset_index(drop=True)

In [37]:
# Run the per-location imputation on a copy of the raw frame and report non-null counts.
df_time_series_imputed = time_series_impute(df.copy())
print(df_time_series_imputed.count())

# Same canonical (timestamp, longitude, latitude) ordering as the per-timestamp result.
df_time_series_imputed = (
    df_time_series_imputed
    .sort_values(by=['timestamp', 'longitude', 'latitude'])
    .reset_index(drop=True)
)
df_time_series_imputed

timestamp            4108440
longitude            4108440
latitude             4108440
ventilation_coeff    4108440
pm25                 4108440
dtype: int64


Unnamed: 0,timestamp,longitude,latitude,ventilation_coeff,pm25
0,2023-06-01 00:00:00,83.400,25.112,132.274372,95.937754
1,2023-06-01 00:00:00,83.513,25.043,90.595377,95.937754
2,2023-06-01 00:00:00,83.539,25.206,121.315759,95.937754
3,2023-06-01 00:00:00,83.610,24.962,92.712455,95.937754
4,2023-06-01 00:00:00,83.610,25.040,59.594687,95.937754
...,...,...,...,...,...
4108435,2024-04-30 23:00:00,87.938,26.088,0.000000,69.500000
4108436,2024-04-30 23:00:00,87.941,25.747,1892.137510,45.250000
4108437,2024-04-30 23:00:00,87.952,26.105,0.000000,79.750000
4108438,2024-04-30 23:00:00,88.121,26.426,0.000000,69.140000


In [38]:
# Both imputed frames must carry identical key columns, row for row, before their
# values are combined by position.
assert all(
    (df_knn_imputed[key] == df_time_series_imputed[key]).all()
    for key in ('timestamp', 'longitude', 'latitude')
)

In [41]:
# Key columns are taken from the per-timestamp result; the measured columns are the
# element-wise mean of the two imputation results.
key_cols = ['timestamp', 'longitude', 'latitude']
averaged_cols = ['ventilation_coeff', 'pm25']

data = {col: df_knn_imputed[col] for col in key_cols}
data.update({
    col: (df_knn_imputed[col] + df_time_series_imputed[col])/2
    for col in averaged_cols
})

df_imputed = pd.DataFrame(data=data)
print(df_imputed.count())
df_imputed

timestamp            4108440
longitude            4108440
latitude             4108440
ventilation_coeff    4108440
pm25                 4108440
dtype: int64


Unnamed: 0,timestamp,longitude,latitude,ventilation_coeff,pm25
0,2023-06-01 00:00:00,83.400,25.112,132.274372,95.937754
1,2023-06-01 00:00:00,83.513,25.043,90.595377,95.937754
2,2023-06-01 00:00:00,83.539,25.206,121.315759,95.937754
3,2023-06-01 00:00:00,83.610,24.962,92.712455,95.937754
4,2023-06-01 00:00:00,83.610,25.040,59.594687,95.937754
...,...,...,...,...,...
4108435,2024-04-30 23:00:00,87.938,26.088,0.000000,69.500000
4108436,2024-04-30 23:00:00,87.941,25.747,1892.137510,45.250000
4108437,2024-04-30 23:00:00,87.952,26.105,0.000000,79.750000
4108438,2024-04-30 23:00:00,88.121,26.426,0.000000,69.140000


In [42]:
# Write the combined imputation result to CSV under data_dir/airshed, omitting the row index.
df_imputed.to_csv(f'{data_dir}/airshed/bihar_june_apr_imputed.csv', index=False)