In [21]:
import os
import math
import yaml
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import lightgbm

ModuleNotFoundError: No module named 'lightgbm'

In [2]:
# Locate config.yaml in the notebook's current working directory.
# NOTE: `dir` shadows the built-in dir(); the name is kept so that any cell
# already referring to it keeps working.
dir = os.path.abspath('.')
config_fp = f'{dir}/config.yaml'

# Read the file as UTF-8 explicitly instead of relying on the locale-dependent
# default encoding, so non-ASCII paths in the config decode the same everywhere.
with open(config_fp, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

In [3]:
# Pull the three path settings out of the 'filepath' section of the config.
filepaths = config['filepath']
proj_dir, data_dir, loc_fp = (
    filepaths[key] for key in ('proj_dir', 'data_dir', 'loc_fp')
)

In [4]:
# Seasonal input files (opened under data_dir) mapped to [start, end] dates,
# each written as [year, month, day]. time_arr() treats the end date as exclusive.
# NOTE(review): if 'MAM' stands for March-May, the 2024-05-01 end leaves May out — confirm.
files = {'JJAS_2023.txt': [[2023, 6, 1], [2023, 10, 1]],
         'ON_2023.txt': [[2023, 10, 1], [2023, 12, 1]],
         'DJF_2024.txt': [[2023, 12, 1], [2024, 3, 1]],
         'MAM_2024.txt': [[2024, 3, 1], [2024, 5, 1]]}

# NOTE(review): not referenced by the cells shown here, and the last name differs
# from the 'ventilation_coeff' key used when the frame is built — confirm which is intended.
columns = ['timestamp', 'longitude', 'latitude', 'ventilation_coefficient']

In [5]:
def time_arr(dates):
    """Return hourly datetimes covering the half-open range [start, end).

    Args:
        dates: Sequence whose first two items are the start and end; each item
            is a sequence of positional ``datetime`` arguments such as
            ``[year, month, day]``.

    Returns:
        list[datetime]: ``start``, ``start + 1h``, ... up to but excluding
        ``end``. Empty when ``end`` is not after ``start``.
    """
    start = datetime(*dates[0])
    stop = datetime(*dates[1])
    hour = timedelta(hours=1)

    # Ceiling division in exact integer arithmetic: a trailing partial hour
    # still contributes the timestamp at which it begins.
    n_steps = -((start - stop) // hour)

    # range() of a non-positive count is empty, matching an end <= start input.
    return [start + hour * k for k in range(n_steps)]

In [6]:
# Parse each seasonal file into long format: one row per (location, hour).
# Every line is read as "<lon>,<lat>,<v0>,<v1>,..." and must carry exactly one
# value per hourly timestamp produced by time_arr() for that file.
df_list = []

for file, dates in files.items():
    ts = time_arr(dates)

    with open(f'{data_dir}/{file}') as f:
        for line_no, raw_line in enumerate(f, start=1):
            fields = raw_line.strip().split(',')

            # A blank line (e.g. a trailing newline at end of file) holds no
            # record; without this guard float('') would raise.
            if fields == ['']:
                continue

            lon, lat = float(fields[0]), float(fields[1])
            # Records without a usable coordinate are dropped.
            if math.isnan(lon) or math.isnan(lat):
                continue

            values = [float(v) for v in fields[2:]]

            # Explicit check rather than `assert` (which `python -O` strips),
            # with enough context to find the offending record.
            if len(values) != len(ts):
                raise ValueError(
                    f'{file}:{line_no}: expected {len(ts)} hourly values, '
                    f'found {len(values)}'
                )

            # The scalar lon/lat are broadcast by pandas to the length of `ts`.
            df_list.append(pd.DataFrame({'timestamp': ts,
                                         'longitude': lon,
                                         'latitude': lat,
                                         'ventilation_coeff': values}))

df = pd.concat(df_list, ignore_index=True)

In [7]:
# Show the combined long-format frame (the rich repr elides the middle rows).
df

Unnamed: 0,timestamp,longitude,latitude,ventilation_coeff
0,2023-06-01 00:00:00,83.903,25.392,218.301986
1,2023-06-01 01:00:00,83.903,25.392,712.042706
2,2023-06-01 02:00:00,83.903,25.392,2580.330809
3,2023-06-01 03:00:00,83.903,25.392,3882.309261
4,2023-06-01 04:00:00,83.903,25.392,6393.469082
...,...,...,...,...
4309435,2024-04-30 19:00:00,85.797,25.770,13107.508807
4309436,2024-04-30 20:00:00,85.797,25.770,
4309437,2024-04-30 21:00:00,85.797,25.770,11710.304271
4309438,2024-04-30 22:00:00,85.797,25.770,


In [10]:
# Collect the distinct coordinate pairs listed in the location file.
# Each record is '|'-delimited and its last two fields are parsed as floats.
# NOTE(review): the pair order is assumed to be (longitude, latitude) so that it
# matches the tuples built from df — confirm against the file.
locs = set()

with open(loc_fp, 'r') as f:
    for line in f:
        record = line.strip()
        # Skip blank lines; they have no fields and would fail on fields[-2].
        if not record:
            continue
        fields = record.split('|')
        locs.add((float(fields[-2]), float(fields[-1])))

In [15]:
# Pair each row's coordinates into a hashable (longitude, latitude) tuple so
# rows can be tested for membership in the `locs` set.
# Zipping the two columns yields the same tuples as a row-wise
# df.apply(..., axis=1) without building a Series for every one of the
# ~4.3M rows.
df['locs'] = list(zip(df['longitude'], df['latitude']))

df

Unnamed: 0,timestamp,longitude,latitude,ventilation_coeff,locs
0,2023-06-01 00:00:00,83.903,25.392,218.301986,"(83.903, 25.392)"
1,2023-06-01 01:00:00,83.903,25.392,712.042706,"(83.903, 25.392)"
2,2023-06-01 02:00:00,83.903,25.392,2580.330809,"(83.903, 25.392)"
3,2023-06-01 03:00:00,83.903,25.392,3882.309261,"(83.903, 25.392)"
4,2023-06-01 04:00:00,83.903,25.392,6393.469082,"(83.903, 25.392)"
...,...,...,...,...,...
4309435,2024-04-30 19:00:00,85.797,25.770,13107.508807,"(85.797, 25.77)"
4309436,2024-04-30 20:00:00,85.797,25.770,,"(85.797, 25.77)"
4309437,2024-04-30 21:00:00,85.797,25.770,11710.304271,"(85.797, 25.77)"
4309438,2024-04-30 22:00:00,85.797,25.770,,"(85.797, 25.77)"


In [17]:
# Distinct (longitude, latitude) tuples present in the data, in order of first appearance.
df_locs = pd.unique(df['locs'])

In [20]:
# Count the distinct data locations that also appear in the location file.
cnt = sum(1 for loc in df_locs if loc in locs)

cnt

511