In [1]:
import os
import pandas as pd
import numpy as np


In [2]:
# --- config ---
# Root folder of the thesis data tree. Set the SDOT_BASE_DIR environment variable
# to run this notebook on another machine; when it is unset, the original
# location is used unchanged.
base_dir = os.environ.get("SDOT_BASE_DIR", "D:/Desktop/MSc Thesis - Copy/")
# CSV that is read into the main dataframe by the loading cell.
weather_csv = os.path.join(base_dir, "Data/S-DoT_sensors/S-DoT_NATURE_20230724-20230826_en_preprocessed.csv")


In [3]:
# Load the readings and prepare them in a single chain:
#   1. parse "datetime" as UTC (unparseable values become NaT) and convert to Asia/Seoul
#   2. drop rows that lack a station id ("serial") or a timestamp
#   3. order rows chronologically within each station and renumber the index
#   4. derive calendar columns from the converted timestamp
df = (
    pd.read_csv(weather_csv)
    .assign(
        datetime=lambda d: pd.to_datetime(d["datetime"], errors="coerce", utc=True).dt.tz_convert("Asia/Seoul")
    )
    .dropna(subset=["serial", "datetime"])
    .sort_values(["serial", "datetime"])
    .reset_index(drop=True)
    .assign(
        month=lambda d: d["datetime"].dt.month,
        day=lambda d: d["datetime"].dt.day,
        hour=lambda d: d["datetime"].dt.hour,
        # 0=Mon .. 6=Sun
        day_of_week=lambda d: d["datetime"].dt.dayofweek,
    )
)


In [4]:
# Feature columns for which a "minutes to nearest valid reading" column is computed.
var_cols = ["temperature_mean_C", "humidity_mean_pc", "PM10_mean_microgm3"]

In [5]:
# # --- filter: weekdays only and hours 08..23 inclusive ---
# is_weekday = df["datetime"].dt.weekday <= 4   # Mon=0 .. Fri=4
# in_hours   = df["datetime"].dt.hour.between(8, 23)  # inclusive
# df_filt = df[is_weekday & in_hours].copy().sort_values(["serial", "datetime"]).reset_index(drop=True)

In [6]:
# # --- (disabled) earlier backward-looking helper: minutes since the previous NON-NULL value for a feature (per station); superseded by add_minutes_to_nearest_valid ---
# def add_mins_to_nearest_valid(df_in: pd.DataFrame, feature: str,
#                               id_col: str = "serial", time_col: str = "datetime") -> pd.DataFrame:
#     df_out = df_in.copy()
#         # datetime only when the feature is non-null; NaT otherwise
#     dt_if_valid = df_out[time_col].where(df_out[feature].notna())
#         # last valid datetime carried forward per station
#     last_valid_dt = dt_if_valid.groupby(df_out[id_col]).ffill()
#         # minutes since that last valid
#     mins = (df_out[time_col] - last_valid_dt).dt.total_seconds() / 60.0
#         # round to whole minutes; keep NaN as missing (before first valid)
#     df_out[f"mins_to_nearest_valid__{feature}"] = mins.round().astype("Int64")
#     return df_out

In [7]:
def add_minutes_to_nearest_valid(
    df_in: pd.DataFrame,
    var_cols: "str | list[str]",
    id_col: str = "serial",
    time_col: str = "datetime",
) -> pd.DataFrame:
    """
    Annotate every row with the gap (in minutes) to the nearest valid reading.

    For each feature in var_cols and for each row:
      - if value is non-null -> 0 minutes
      - else -> minutes to the nearest (prev or next) non-null value within the same station
      - if a station has no valid values at all -> stays missing (<NA>)
    Writes one column per feature: mins_to_nearest_valid__<feature> (Int64).

    Parameters
    ----------
    df_in : pd.DataFrame
        Input readings. It is not modified; a sorted copy is returned.
    var_cols : str or list of str
        Feature column name(s). A single name may be given as a plain string.
        Names that are not columns of df_in are skipped without error.
    id_col : str
        Column identifying the station; gaps never cross station boundaries.
    time_col : str
        Datetime column used to measure the gaps.

    Returns
    -------
    pd.DataFrame
        Copy of df_in sorted by [id_col, time_col] (original index labels are
        kept, not reset) with the extra gap columns appended.
    """
    # A bare string would otherwise be iterated character by character, every
    # "feature" would be skipped as missing, and no column would be produced.
    if isinstance(var_cols, str):
        var_cols = [var_cols]

    df = df_in.sort_values([id_col, time_col]).copy()

    # Loop-invariant lookups: the timestamps and the grouping key.
    t = df[time_col]
    station = df[id_col]

    for feature in var_cols:
        if feature not in df.columns:
            continue  # skip missing features gracefully

        is_valid = df[feature].notna()

        # Timestamp where the feature has a value, NaT elsewhere; carried
        # forward / backward within each station to find the neighbouring
        # valid readings.
        valid_time = t.where(is_valid)
        prev_time = valid_time.groupby(station).ffill()
        next_time = valid_time.groupby(station).bfill()

        # distances (minutes) to prev/next valid; NaN when there is none on that side
        dprev = (t - prev_time).dt.total_seconds() / 60.0
        dnext = (next_time - t).dt.total_seconds() / 60.0

        # row-wise minimum of the two distances (a NaN side is ignored)
        nearest = pd.concat([dprev, dnext], axis=1).min(axis=1, skipna=True)

        # rows that already have a value are distance 0
        nearest = nearest.mask(is_valid, 0.0)

        # round to whole minutes; NaN (no valid reading in the station) becomes <NA>
        df[f"mins_to_nearest_valid__{feature}"] = nearest.round().astype("Int64")

    return df

In [8]:
# --- compute the gap columns for all features in one call ---
# The helper receives the whole list of feature names at once, so no per-column
# loop is needed here.
df = add_minutes_to_nearest_valid(df, var_cols)


# quick peek: station id, timestamp and the newly added gap columns (first 12 rows)
cols_to_show = ["serial", "datetime", *(f"mins_to_nearest_valid__{c}" for c in var_cols)]
print(df[cols_to_show].head(12))

         serial                  datetime  \
0   OC3CL200010 2023-09-25 01:08:04+09:00   
1   OC3CL200010 2023-09-25 02:08:06+09:00   
2   OC3CL200010 2023-09-25 03:08:05+09:00   
3   OC3CL200010 2023-09-25 06:08:03+09:00   
4   OC3CL200010 2023-09-25 07:08:03+09:00   
5   OC3CL200010 2023-09-25 08:08:02+09:00   
6   OC3CL200010 2023-09-25 09:08:04+09:00   
7   OC3CL200010 2023-09-25 10:08:06+09:00   
8   OC3CL200010 2023-09-25 11:08:04+09:00   
9   OC3CL200010 2023-09-25 12:08:08+09:00   
10  OC3CL200010 2023-09-25 13:08:07+09:00   
11  OC3CL200010 2023-09-25 14:08:08+09:00   

    mins_to_nearest_valid__temperature_mean_C  \
0                                           0   
1                                           0   
2                                           0   
3                                           0   
4                                           0   
5                                           0   
6                                           0   
7                     