In [1]:
import pandas as pd
import numpy as np
import pickle
import datetime
import os

In [2]:
# Merge the yearly traffic-index CSV exports into one frame and one CSV file.
file_names = [ 'raw_data/trafficindex2021.csv', 'raw_data/trafficindex2022.csv', 'raw_data/trafficindex2023.csv']
dataframes_list = []

for file in file_names:
    if os.path.exists(file):
        # Read the CSV file and keep it for the final concatenation
        df = pd.read_csv(file)
        dataframes_list.append(df)
        print(f"Successfully loaded: {file} ({len(df)} rows)")
    else:
        # Best-effort: a missing year is reported but does not abort the merge.
        print(f"Warning: File not found - {file}")

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty
# list, so fail early with a message that names the paths that were tried.
if not dataframes_list:
    raise FileNotFoundError(f"None of the input files were found: {file_names}")

# Stack all years with a fresh 0..n-1 index, then drop fully identical rows.
merged_df = pd.concat(dataframes_list, ignore_index=True).drop_duplicates()

# Save to a new CSV file
output_filename = 'merged_traffic_indices.csv'
merged_df.to_csv(output_filename, index=False)

print(f"\nSuccess! Merged {len(dataframes_list)} files.")
print(f"Total rows: {len(merged_df)}")
print(f"Saved as: {output_filename}")

Successfully loaded: raw_data/trafficindex2021.csv (104631 rows)
Successfully loaded: raw_data/trafficindex2022.csv (78431 rows)
Successfully loaded: raw_data/trafficindex2023.csv (104305 rows)

Success! Merged 3 files.
Total rows: 287367
Saved as: merged_traffic_indices.csv


In [3]:
# Parse the "datetime" column into pandas datetimes so the .dt accessor can be used on it.
merged_df = merged_df.assign(datetime=pd.to_datetime(merged_df["datetime"]))

In [4]:
# Derive the hour of day, the calendar date and a coarse time-of-day label.
# Bins are left-closed (right=False): [0,6) early, [6,10) morning,
# [10,16) midday, [16,21) evening, [21,24) night.
merged_df = merged_df.assign(
    hour=lambda frame: frame["datetime"].dt.hour,
    date=lambda frame: frame["datetime"].dt.date,
    time_bin=lambda frame: pd.cut(
        frame["hour"],
        bins=[0, 6, 10, 16, 21, 24],
        labels=["early", "morning", "midday", "evening", "night"],
        right=False,
    ),
)

In [5]:
# Display the merged frame (pandas truncates the rendering to head/tail rows).
merged_df

Unnamed: 0,timestamp,datetime,index,hour,date,time_bin
0,1609434000,2021-01-01 00:00:00,1.0,0,2021-01-01,early
1,1609434300,2021-01-01 00:05:00,1.3,0,2021-01-01,early
2,1609434600,2021-01-01 00:10:00,1.3,0,2021-01-01,early
3,1609434900,2021-01-01 00:15:00,1.5,0,2021-01-01,early
4,1609435200,2021-01-01 00:20:00,1.5,0,2021-01-01,early
...,...,...,...,...,...,...
287362,1704040500,2023-12-31 23:35:00,2.8,23,2023-12-31,night
287363,1704040800,2023-12-31 23:40:00,2.0,23,2023-12-31,night
287364,1704041100,2023-12-31 23:45:00,2.2,23,2023-12-31,night
287365,1704041400,2023-12-31 23:50:00,2.2,23,2023-12-31,night


In [6]:
# Convert "date" to datetime64 and make it the row index.
# Not re-runnable on its own: after this cell "date" is no longer a column.
merged_df = merged_df.assign(
    date=lambda frame: pd.to_datetime(frame["date"])
).set_index("date")

In [7]:
# Keep only the rows whose "date" index is strictly before the cutoff day.
cutoff = pd.Timestamp("2023-07-04")

filtered = merged_df.loc[merged_df.index < cutoff]


In [8]:
# Move the "date" index back into a regular column so it can be used as a group key.
filtered = filtered.reset_index(level="date")

In [9]:
# Average every column per (date, time_bin).
# "time_bin" is categorical (it comes from pd.cut), and recent pandas versions warn
# when `observed` is left implicit. observed=False pins the current behaviour:
# every category is kept for every date, so an empty combination becomes a NaN row.
# NOTE(review): switch to observed=True if empty bins should be dropped instead.
average_df = filtered.groupby(["date", "time_bin"], observed=False).mean()
average_df.describe()

  average_df = filtered.groupby(["date", "time_bin"]).mean()


Unnamed: 0,timestamp,datetime,index,hour
count,4105.0,4105,4105.0,4105.0
mean,1647645000.0,2022-03-19 06:13:59.596083200,3.586178,12.504241
min,1609445000.0,2021-01-01 02:57:30,1.016667,0.428571
25%,1627227000.0,2021-07-25 22:27:30,2.519444,7.5
50%,1647669000.0,2022-03-19 12:57:30,3.391667,12.5
75%,1665401000.0,2022-10-10 18:27:30,4.573611,18.0
max,1688398000.0,2023-07-03 22:27:30,7.456667,22.028571
std,23272650.0,,1.189659,7.007017


In [10]:
# Drop the averaged "datetime" and "hour" helper columns; the remaining columns are kept as-is.
congestion_indices = average_df.drop(columns=["datetime", "hour"])

In [12]:
# Load the pre-built graph object from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles produced by this project or another trusted source.
with open("processed_data/bkk_augmented_graph.pickle", "rb") as f:
    G = pickle.load(f)
# The graph's edge collection is used below for its length and as column labels.
# presumably a networkx-style graph — TODO confirm the concrete type.
edges = G.edges

In [13]:
def sample_congestion_df(traffic_df, edges, kappa=20, rng=None, index_scale=10):
    """Draw one Beta-distributed congestion value per (row, edge).

    For every row of ``traffic_df`` the "index" column is divided by
    ``index_scale`` and clipped to [1e-6, 1 - 1e-6] to give a mean ``mu``.
    Each edge then receives an independent draw from
    Beta(mu * kappa, (1 - mu) * kappa), so a larger ``kappa`` concentrates
    the samples around ``mu``.

    Parameters
    ----------
    traffic_df : pandas.DataFrame
        Must contain an "index" column. The frame is not modified.
    edges : sized iterable
        One output column is created per element; it is passed unchanged to
        ``pandas.DataFrame(columns=...)``.
    kappa : float, default 20
        Concentration of the Beta distribution (alpha + beta).
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global ``np.random``
        state is used (the original behaviour), so ``np.random.seed`` still
        controls the result.
    index_scale : float, default 10
        Value of "index" that maps to a mean of 1.

    Returns
    -------
    pandas.DataFrame
        Shape ``(len(traffic_df), len(edges))``, indexed like ``traffic_df``.
    """
    # Mean of each row's Beta distribution, kept strictly inside (0, 1) so that
    # both shape parameters stay positive.
    mu = np.clip(traffic_df["index"] / index_scale, 1e-6, 1 - 1e-6)

    alpha = (mu * kappa).values
    beta = ((1 - mu) * kappa).values

    # Both the np.random module and Generator/RandomState objects expose
    # beta(a, b, size), so they can be used interchangeably here.
    sampler = np.random if rng is None else rng

    # Column vectors broadcast each row's parameters across all edges.
    samples = sampler.beta(
        a=alpha[:, None],
        b=beta[:, None],
        size=(len(traffic_df), len(edges))
    )

    return pd.DataFrame(samples, index=traffic_df.index, columns=edges)


In [14]:
# Seed NumPy's global RNG so the Beta draws (and the CSVs written from them)
# are identical on every Restart & Run All; change the seed for another realisation.
np.random.seed(42)
result_df = sample_congestion_df(congestion_indices, edges)
# Re-assign the column labels as plain tuples, one per edge.
result_df.columns = [tuple(col) for col in result_df.columns]
result_df = result_df.copy()
# Store the "date" level of the (date, time_bin) index as strings.
result_df.index = result_df.index.set_levels(
    result_df.index.levels[0].astype(str), level="date"
)

In [15]:
# Flatten the (date, time_bin) index into columns and parse "date" back to datetime.
result_df = result_df.reset_index().assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
# dayofweek: Monday=0 ... Sunday=6, so values below 5 are weekdays and 5/6 the weekend.
weekdays_df = result_df.loc[result_df["date"].dt.dayofweek < 5]
weekends_df = result_df.loc[result_df["date"].dt.dayofweek >= 5]

In [16]:
# One table per time-of-day bin for weekdays: rows indexed by date, one column per edge.
wd_early, wd_morning, wd_midday, wd_evening, wd_night = (
    weekdays_df[weekdays_df["time_bin"] == label]
    .set_index("date")
    .drop(columns="time_bin")
    for label in ("early", "morning", "midday", "evening", "night")
)

In [20]:
# Preview the first rows (at most 32) of the weekday-morning table.
wd_morning.head(32)

Unnamed: 0_level_0,"(61703358, 2377639819, 0)","(61703358, 17001, 0)","(61703358, 47386, 0)","(2377639819, 61703358, 0)","(2377639819, 1034495032, 0)","(17001, 61703359, 0)","(47386, 47387, 0)","(1034495032, 2210489615, 0)","(1034495032, 11046, 0)","(61703359, 272206282, 0)",...,"(290, 4427525284, 0)","(291, 53102, 0)","(292, 2209159454, 0)","(293, 53690, 0)","(294, 1147314605, 0)","(295, 7698446505, 0)","(296, 13236154134, 0)","(297, 13175670038, 0)","(298, 12061, 0)","(299, 280388170, 0)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01,0.284687,0.092712,0.188195,0.187058,0.067945,0.163499,0.20211,0.334824,0.225921,0.135692,...,0.282353,0.137777,0.281406,0.173124,0.084238,0.059439,0.207402,0.133001,0.192029,0.317472
2021-01-04,0.367024,0.128275,0.341776,0.399208,0.279182,0.251568,0.271593,0.311528,0.315516,0.283476,...,0.250115,0.230836,0.281919,0.27591,0.275366,0.283039,0.359658,0.231038,0.302072,0.2231
2021-01-05,0.398972,0.311771,0.191511,0.335668,0.254031,0.378345,0.39096,0.372107,0.381956,0.193096,...,0.307498,0.319045,0.289923,0.259247,0.382097,0.263574,0.496482,0.257684,0.436274,0.326036
2021-01-06,0.372332,0.245609,0.427195,0.471551,0.341559,0.310351,0.128033,0.318829,0.263048,0.295644,...,0.488767,0.44155,0.470782,0.411484,0.359863,0.341206,0.394857,0.292987,0.32064,0.36926
2021-01-07,0.297401,0.241269,0.117806,0.260299,0.192043,0.418315,0.582462,0.158207,0.488683,0.265375,...,0.4429,0.524781,0.230768,0.143403,0.433462,0.401158,0.367646,0.203064,0.29191,0.292634
2021-01-08,0.421351,0.337754,0.393871,0.330766,0.298283,0.359007,0.4842,0.402059,0.159704,0.433411,...,0.200826,0.422239,0.304787,0.318669,0.497311,0.383551,0.413066,0.262513,0.324265,0.42484
2021-01-11,0.244233,0.504642,0.188711,0.36312,0.282464,0.315773,0.240067,0.491481,0.255358,0.496933,...,0.376064,0.374517,0.328848,0.487996,0.411576,0.306254,0.262061,0.194983,0.350324,0.437569
2021-01-12,0.349012,0.220622,0.403124,0.339892,0.374146,0.418702,0.477966,0.581874,0.248258,0.175214,...,0.35722,0.1648,0.350169,0.270963,0.148725,0.235931,0.391292,0.483828,0.280463,0.345131
2021-01-13,0.215168,0.370638,0.326865,0.256547,0.251652,0.252825,0.369582,0.211005,0.456777,0.282334,...,0.300792,0.308203,0.340026,0.206335,0.354994,0.316593,0.516908,0.541939,0.380631,0.30701
2021-01-14,0.53346,0.340627,0.162449,0.277077,0.383376,0.323553,0.25761,0.262706,0.296502,0.265842,...,0.374151,0.158993,0.151397,0.373007,0.301199,0.382107,0.219126,0.367566,0.42399,0.427673


In [17]:
# One table per time-of-day bin for weekends: rows indexed by date, one column per edge.
we_early, we_morning, we_midday, we_evening, we_night = (
    weekends_df[weekends_df["time_bin"] == label]
    .set_index("date")
    .drop(columns="time_bin")
    for label in ("early", "morning", "midday", "evening", "night")
)

In [261]:
# Collect the ten weekday/weekend x time-bin tables under the names used for their files.
dfs = dict(
    wd_early=wd_early,
    wd_morning=wd_morning,
    wd_midday=wd_midday,
    wd_evening=wd_evening,
    wd_night=wd_night,
    we_early=we_early,
    we_morning=we_morning,
    we_midday=we_midday,
    we_evening=we_evening,
    we_night=we_night,
)

# Write each table to "<name>_congestion.csv" in the current working directory.
for name, df in dfs.items():
    df.to_csv(f"{name}_congestion.csv")
