In [102]:
import pandas as pd
import numpy as np
import pickle
import datetime
import os

In [176]:
# Group data into one csv file
file_names = [ 'raw_data/trafficindex2021.csv', 'raw_data/trafficindex2022.csv', 'raw_data/trafficindex2023.csv']
dataframes_list = []

for file in file_names:
    if os.path.exists(file):
        # Read the CSV file
        df = pd.read_csv(file)
        # Add it to our list
        dataframes_list.append(df)
        print(f"Successfully loaded: {file} ({len(df)} rows)")
    else:
        print(f"Warning: File not found - {file}")


merged_df = pd.concat(dataframes_list, ignore_index=True).drop_duplicates()

# Save to a new CSV file
output_filename = 'merged_traffic_indices.csv'
merged_df.to_csv(output_filename, index=False)

print(f"\nSuccess! Merged {len(dataframes_list)} files.")
print(f"Total rows: {len(merged_df)}")
print(f"Saved as: {output_filename}")

Successfully loaded: raw_data/trafficindex2021.csv (104631 rows)
Successfully loaded: raw_data/trafficindex2022.csv (78431 rows)
Successfully loaded: raw_data/trafficindex2023.csv (104305 rows)

Success! Merged 3 files.
Total rows: 287367
Saved as: merged_traffic_indices.csv


In [177]:
merged_df["datetime"] = pd.to_datetime(merged_df["datetime"])

In [178]:
merged_df["hour"] = merged_df["datetime"].dt.hour
merged_df["date"] = merged_df["datetime"].dt.date
merged_df["time_bin"] = pd.cut(
    merged_df["hour"],
    bins=[0, 6, 10, 16, 21, 24],
    labels=["early", "morning", "midday", "evening", "night"],
    right=False
)

In [179]:
merged_df["date"] = pd.to_datetime(merged_df["date"])
merged_df = merged_df.set_index("date")

In [180]:
cutoff = pd.Timestamp("2023-07-04")

filtered = merged_df[
    merged_df.index.get_level_values("date") < cutoff
]


In [181]:
filtered = filtered.reset_index()

In [168]:
# 1. Normalize to midnight so all timestamps compare cleanly
df["date"] = df["date"].dt.normalize()

# 2. Create full expected date range
all_dates = pd.date_range(
    start=df["date"].min(),
    end=df["date"].max(),
    freq="D"
)

# 3. Find missing dates
missing = all_dates.difference(df["date"].unique())

print(len(missing) * 5)
print(len(all_dates) * 5)

460
4570


In [182]:
average_df = filtered.groupby(["date", "time_bin"]).mean()
average_df.describe()

  average_df = filtered.groupby(["date", "time_bin"]).mean()


Unnamed: 0,timestamp,datetime,index,hour
count,4105.0,4105,4105.0,4105.0
mean,1647645000.0,2022-03-19 06:13:59.596083200,3.586178,12.504241
min,1609445000.0,2021-01-01 02:57:30,1.016667,0.428571
25%,1627227000.0,2021-07-25 22:27:30,2.519444,7.5
50%,1647669000.0,2022-03-19 12:57:30,3.391667,12.5
75%,1665401000.0,2022-10-10 18:27:30,4.573611,18.0
max,1688398000.0,2023-07-03 22:27:30,7.456667,22.028571
std,23272650.0,,1.189659,7.007017


In [183]:
congestion_indices = average_df.drop(['datetime', 'hour'], axis=1)

In [184]:
with open("processed_data/bkk_augmented_graph.pickle", "rb") as f:
    G = pickle.load(f)
edges = G.edges

In [233]:
def sample_congestion_df(traffic_df, edges, kappa=20):
    df = traffic_df.copy()

    df["mu"] = np.clip(df["index"] / 10, 1e-6, 1 - 1e-6)

    df["alpha"] = df["mu"] * kappa
    df["beta"]  = (1 - df["mu"]) * kappa

    samples = np.random.beta(
        a=df["alpha"].values[:, None],
        b=df["beta"].values[:, None],
        size=(len(df), len(edges))
    )

    return pd.DataFrame(samples, index=df.index, columns=edges)


In [234]:
result_df = sample_congestion_df(congestion_indices, edges)
result_df.columns = [tuple(col) for col in result_df.columns]
result_df = result_df.copy()
result_df.index = result_df.index.set_levels(
    result_df.index.levels[0].astype(str), level="date"
)

In [235]:
result_df = result_df.reset_index()
result_df["date"] = pd.to_datetime(result_df["date"])
weekdays_df = result_df[result_df["date"].dt.weekday < 5]
weekends_df = result_df[result_df["date"].dt.weekday >= 5]

In [236]:
wd_early   = weekdays_df[weekdays_df["time_bin"] == "early"].set_index('date').drop('time_bin', axis=1)
wd_morning  = weekdays_df[weekdays_df["time_bin"] == "morning"].set_index('date').drop('time_bin', axis=1)
wd_midday = weekdays_df[weekdays_df["time_bin"] == "midday"].set_index('date').drop('time_bin', axis=1)
wd_evening = weekdays_df[weekdays_df["time_bin"] == "evening"].set_index('date').drop('time_bin', axis=1)
wd_night = weekdays_df[weekdays_df["time_bin"] == "night"].set_index('date').drop('time_bin', axis=1)

In [237]:
we_early   = weekends_df[weekends_df["time_bin"] == "early"].set_index('date').drop('time_bin', axis=1)
we_morning  = weekends_df[weekends_df["time_bin"] == "morning"].set_index('date').drop('time_bin', axis=1)
we_midday = weekends_df[weekends_df["time_bin"] == "midday"].set_index('date').drop('time_bin', axis=1)
we_evening = weekends_df[weekends_df["time_bin"] == "evening"].set_index('date').drop('time_bin', axis=1)
we_night = weekends_df[weekends_df["time_bin"] == "night"].set_index('date').drop('time_bin', axis=1)

In [239]:
dfs = {
    "wd_early": wd_early,
    "wd_morning": wd_morning,
    "wd_midday": wd_midday,
    "wd_evening": wd_evening,
    "wd_night": wd_night,
    
    "we_early": we_early,
    "we_morning": we_morning,
    "we_midday": we_midday,
    "we_evening": we_evening,
    "we_night": we_night,
}

for name, df in dfs.items():
    df.to_csv(f"{name}_congestion.csv")
