In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('data-p3/joined_all.csv')
df.head()


Unnamed: 0,Pass_ID,Sch_Departure,Act_Departure,C0 - S2,C_Start,C_Avg,S2,Wait_Time,Day_of_Week,Period_of_Week,Airfield,Season
0,5368296,2028-09-03 13:12,2028-09-03 13:12,1,1,1.0,2028-09-03 11:26,3.0,7 - SUN,2 - WEEKEND,AUC,3 - SUMMER
1,5480057,2028-09-15 9:36,2028-09-15 9:36,1,1,1.0,2028-09-15 7:45,3.0,5 - FRI,1 - WEEKDAY,AUC,3 - SUMMER
2,5349077,2028-09-01 10:21,2028-09-01 11:09,1,1,1.0,2028-09-01 7:58,5.0,5 - FRI,1 - WEEKDAY,AUC,3 - SUMMER
3,5349207,2028-09-01 10:21,2028-09-01 11:09,1,1,1.0,2028-09-01 8:04,4.0,5 - FRI,1 - WEEKDAY,AUC,3 - SUMMER
4,5349308,2028-09-01 10:21,2028-09-01 11:09,1,1,1.0,2028-09-01 8:13,7.0,5 - FRI,1 - WEEKDAY,AUC,3 - SUMMER


In [2]:
df['S2'] = pd.to_datetime(df['S2'])
# df['wait_time_delta'] = pddf['S2'] = pd.to_datetime(df['S2'])
cutoff_date = pd.Timestamp("2028-10-01")

# Filter the DataFrame
df = df[df['S2'] >= cutoff_date]
df.head()

Unnamed: 0,Pass_ID,Sch_Departure,Act_Departure,C0 - S2,C_Start,C_Avg,S2,Wait_Time,Day_of_Week,Period_of_Week,Airfield,Season
25604,5648611,2028-10-03 9:41,2028-10-03 13:11,2,2,2.0,2028-10-03 08:24:00,2.0,2 - TUE,1 - WEEKDAY,AUC,4 - AUTUMN
25605,5888442,2028-10-28 8:11,2028-10-28 8:11,2,2,2.0,2028-10-28 06:43:00,6.0,6 - SAT,2 - WEEKEND,AUC,4 - AUTUMN
25606,5628779,2028-10-01 11:41,2028-10-01 11:41,1,1,1.0,2028-10-01 08:26:00,18.0,7 - SUN,2 - WEEKEND,AUC,4 - AUTUMN
25607,5629253,2028-10-01 11:41,2028-10-01 11:41,1,1,1.0,2028-10-01 08:35:00,,7 - SUN,2 - WEEKEND,AUC,4 - AUTUMN
25608,5629286,2028-10-01 11:41,2028-10-01 11:41,1,1,1.0,2028-10-01 08:37:00,,7 - SUN,2 - WEEKEND,AUC,4 - AUTUMN


In [3]:
def determine_cluster(timestamp):

    is_weekend = timestamp.weekday() >= 5  
    hour = timestamp.hour
    
    if 0 <= hour < 4:
        cluster = "0:00 - 4:00"
    elif 4 <= hour < 8:
        cluster = "4:00 - 8:00"
    elif 8 <= hour < 12:
        cluster = "8:00 - 12:00"
    elif 12 <= hour < 16:
        cluster = "12:00 - 16:00"
    elif 16 <= hour < 20:
        cluster = "16:00 - 20:00"
    else:
        cluster = "20:00 - 0:00"
    
    return f"{'Weekend' if is_weekend else 'Weekday'} - {cluster}"

df['cluster'] = df['S2'].apply(determine_cluster)

In [4]:
def impute_exponential_by_cluster(df, cluster_col='cluster', target_col='Wait_Time', time_col='S2'):
    df_imputed = df.copy()
    
    # Calculate means for each cluster
    cluster_means = df.groupby(cluster_col)[target_col].mean()
    
    # Function to find an existing wait time or sample from exponential
    def impute_value(row):
        if pd.isna(row[target_col]):  # If value is missing
            cluster = row[cluster_col]
            arrival_time = row[time_col]
            
            # Check if someone else in the same cluster and arrival time has a Wait_Time
            same_time_value = df_imputed[
                (df_imputed[cluster_col] == cluster) &
                (df_imputed[time_col] == arrival_time) &
                (~pd.isna(df_imputed[target_col]))
            ][target_col]
            
            if not same_time_value.empty:  # If a valid Wait_Time exists, use it
                return same_time_value.iloc[0]
            
            # Otherwise, sample from exponential
            scale = cluster_means[cluster]  # Mean of the feature for the cluster
            if scale > 0:  # Ensure the scale is valid
                return np.random.exponential(scale=scale)
            return cluster_means[cluster]
        
        # Return original value if not missing
        return row[target_col]
    
    # Apply the imputation
    df_imputed['wait_time_imp'] = df_imputed.apply(impute_value, axis=1)
    return df_imputed


In [5]:
df = impute_exponential_by_cluster(df)

In [6]:
cluster_ord = [
    "Weekday - 0:00 - 4:00", "Weekday - 4:00 - 8:00", "Weekday - 8:00 - 12:00", 
    "Weekday - 12:00 - 16:00", "Weekday - 16:00 - 20:00", "Weekday - 20:00 - 0:00",
    "Weekend - 0:00 - 4:00", "Weekend - 4:00 - 8:00", "Weekend - 8:00 - 12:00", 
    "Weekend - 12:00 - 16:00", "Weekend - 16:00 - 20:00", "Weekend - 20:00 - 0:00"
]

df['cluster'] = pd.Categorical(df['cluster'], categories=cluster_ord, ordered=True)
df = df.sort_values('cluster')

In [7]:
df["S1_imp"] = df['S2'] - pd.to_timedelta(df['wait_time_imp'])
# df["cluster_S1"] = df['S1_imp'].apply(determine_cluster)

In [8]:
def cluster_hours(timestamp):
    is_weekend = timestamp.weekday() >= 5
    weekday = 260
    weekend = 108

    
    return weekend if is_weekend else weekday

df['cluster_hours'] = df['S2'].apply(cluster_hours)

In [9]:
df[['cluster','cluster_hours']]

Unnamed: 0,cluster,cluster_hours
62649,Weekday - 4:00 - 8:00,260
80362,Weekday - 4:00 - 8:00,260
80365,Weekday - 4:00 - 8:00,260
80366,Weekday - 4:00 - 8:00,260
52801,Weekday - 4:00 - 8:00,260
...,...,...
45697,Weekend - 20:00 - 0:00,108
45698,Weekend - 20:00 - 0:00,108
45699,Weekend - 20:00 - 0:00,108
76980,Weekend - 20:00 - 0:00,108


In [10]:
arrival_counts = df.groupby('cluster').size()
arrival_counts.head()


  arrival_counts = df.groupby('cluster').size()


cluster
Weekday - 0:00 - 4:00          0
Weekday - 4:00 - 8:00      11401
Weekday - 8:00 - 12:00     16583
Weekday - 12:00 - 16:00    16413
Weekday - 16:00 - 20:00    17009
dtype: int64

In [11]:
results = arrival_counts.to_frame(name='count')

results = results.merge(df[['cluster', 'cluster_hours']].drop_duplicates(), on='cluster', how='left')
results


Unnamed: 0,cluster,count,cluster_hours
0,Weekday - 0:00 - 4:00,0,
1,Weekday - 4:00 - 8:00,11401,260.0
2,Weekday - 8:00 - 12:00,16583,260.0
3,Weekday - 12:00 - 16:00,16413,260.0
4,Weekday - 16:00 - 20:00,17009,260.0
5,Weekday - 20:00 - 0:00,1162,260.0
6,Weekend - 0:00 - 4:00,0,
7,Weekend - 4:00 - 8:00,4609,108.0
8,Weekend - 8:00 - 12:00,5746,108.0
9,Weekend - 12:00 - 16:00,6973,108.0


In [12]:
results['avg_arrival_rate'] = results['count'] / results['cluster_hours'] / 60 # per minute
df_ar = results
results

Unnamed: 0,cluster,count,cluster_hours,avg_arrival_rate
0,Weekday - 0:00 - 4:00,0,,
1,Weekday - 4:00 - 8:00,11401,260.0,0.730833
2,Weekday - 8:00 - 12:00,16583,260.0,1.063013
3,Weekday - 12:00 - 16:00,16413,260.0,1.052115
4,Weekday - 16:00 - 20:00,17009,260.0,1.090321
5,Weekday - 20:00 - 0:00,1162,260.0,0.074487
6,Weekend - 0:00 - 4:00,0,,
7,Weekend - 4:00 - 8:00,4609,108.0,0.711265
8,Weekend - 8:00 - 12:00,5746,108.0,0.886728
9,Weekend - 12:00 - 16:00,6973,108.0,1.07608


In [13]:
df['15min_interval'] = df['S2'].dt.floor('15min')

grouped = df.groupby(['cluster', '15min_interval'])
max_servers = grouped['C0 - S2'].max().reset_index()
max_servers = max_servers.rename(columns={'C0 - S2': 'C0_max'})

df = df.merge(max_servers, on=['cluster', '15min_interval'], how='left')
df


  grouped = df.groupby(['cluster', '15min_interval'])


Unnamed: 0,Pass_ID,Sch_Departure,Act_Departure,C0 - S2,C_Start,C_Avg,S2,Wait_Time,Day_of_Week,Period_of_Week,Airfield,Season,cluster,wait_time_imp,S1_imp,cluster_hours,15min_interval,C0_max
0,6138847,2028-11-27 8:43,2028-11-27 8:43,1,1,1.0,2028-11-27 06:55:00,4.0,1 - MON,1 - WEEKDAY,AUC,4 - AUTUMN,Weekday - 4:00 - 8:00,4.000000,2028-11-27 06:54:59.999999996,260,2028-11-27 06:45:00,1.0
1,6155439,2028-11-29 8:41,2028-11-29 8:41,2,1,1.0,2028-11-29 07:48:00,,3 - WED,1 - WEEKDAY,AUC,4 - AUTUMN,Weekday - 4:00 - 8:00,11.631322,2028-11-29 07:47:59.999999989,260,2028-11-29 07:45:00,2.0
2,6155515,2028-11-29 8:41,2028-11-29 9:24,2,2,2.0,2028-11-29 07:57:00,4.0,3 - WED,1 - WEEKDAY,AUC,4 - AUTUMN,Weekday - 4:00 - 8:00,4.000000,2028-11-29 07:56:59.999999996,260,2028-11-29 07:45:00,2.0
3,6155519,2028-11-29 8:41,2028-11-29 9:24,2,2,2.0,2028-11-29 07:57:00,4.0,3 - WED,1 - WEEKDAY,AUC,4 - AUTUMN,Weekday - 4:00 - 8:00,4.000000,2028-11-29 07:56:59.999999996,260,2028-11-29 07:45:00,2.0
4,5671339,2028-10-05 9:11,2028-10-05 9:11,2,2,2.0,2028-10-05 07:55:00,5.0,4 - THU,1 - WEEKDAY,AUC,4 - AUTUMN,Weekday - 4:00 - 8:00,5.000000,2028-10-05 07:54:59.999999995,260,2028-10-05 07:45:00,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88523,5763286,2028-10-14 21:50,2028-10-14 22:41,1,1,1.0,2028-10-14 20:34:00,,6 - SAT,2 - WEEKEND,AUC,4 - AUTUMN,Weekend - 20:00 - 0:00,2.399551,2028-10-14 20:33:59.999999998,108,2028-10-14 20:30:00,1.0
88524,5763366,2028-10-14 21:50,2028-10-14 22:41,1,1,1.0,2028-10-14 20:44:00,,6 - SAT,2 - WEEKEND,AUC,4 - AUTUMN,Weekend - 20:00 - 0:00,0.327538,2028-10-14 20:44:00.000000000,108,2028-10-14 20:30:00,1.0
88525,5763381,2028-10-14 21:50,2028-10-14 22:41,1,1,1.0,2028-10-14 20:44:00,,6 - SAT,2 - WEEKEND,AUC,4 - AUTUMN,Weekend - 20:00 - 0:00,1.233761,2028-10-14 20:43:59.999999999,108,2028-10-14 20:30:00,1.0
88526,6072934,2028-11-18 21:40,2028-11-18 21:40,1,1,1.0,2028-11-18 20:11:00,1.0,6 - SAT,2 - WEEKEND,AUC,4 - AUTUMN,Weekend - 20:00 - 0:00,1.000000,2028-11-18 20:10:59.999999999,108,2028-11-18 20:00:00,1.0


In [14]:
average_C0 = df.groupby('cluster')['C0_max'].mean().reset_index(name='avg_C0')
average_C0

  average_C0 = df.groupby('cluster')['C0_max'].mean().reset_index(name='avg_C0')


Unnamed: 0,cluster,avg_C0
0,Weekday - 0:00 - 4:00,
1,Weekday - 4:00 - 8:00,1.91413
2,Weekday - 8:00 - 12:00,1.354882
3,Weekday - 12:00 - 16:00,1.342838
4,Weekday - 16:00 - 20:00,1.499324
5,Weekday - 20:00 - 0:00,1.030981
6,Weekend - 0:00 - 4:00,
7,Weekend - 4:00 - 8:00,1.899761
8,Weekend - 8:00 - 12:00,1.235468
9,Weekend - 12:00 - 16:00,1.303743


In [15]:
distribution_C0 = pd.crosstab(df['cluster'], df['C0 - S2'], normalize='index') * 100
distribution_C0.columns = [f'{col}_total_servers' for col in distribution_C0.columns]
distribution_C0


Unnamed: 0_level_0,1_total_servers,2_total_servers,3_total_servers
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Weekday - 4:00 - 8:00,19.436892,78.133497,2.429611
Weekday - 8:00 - 12:00,70.144123,28.999578,0.856299
Weekday - 12:00 - 16:00,73.557546,25.67477,0.767684
Weekday - 16:00 - 20:00,57.716503,40.854842,1.428655
Weekday - 20:00 - 0:00,96.987952,3.012048,0.0
Weekend - 4:00 - 8:00,21.414624,75.135604,3.449772
Weekend - 8:00 - 12:00,82.335538,17.664462,0.0
Weekend - 12:00 - 16:00,76.710168,23.289832,0.0
Weekend - 16:00 - 20:00,51.79531,43.307279,4.897411
Weekend - 20:00 - 0:00,100.0,0.0,0.0


In [16]:
results = average_C0.merge(distribution_C0, on='cluster', how='left')
results

Unnamed: 0,cluster,avg_C0,1_total_servers,2_total_servers,3_total_servers
0,Weekday - 0:00 - 4:00,,,,
1,Weekday - 4:00 - 8:00,1.91413,19.436892,78.133497,2.429611
2,Weekday - 8:00 - 12:00,1.354882,70.144123,28.999578,0.856299
3,Weekday - 12:00 - 16:00,1.342838,73.557546,25.67477,0.767684
4,Weekday - 16:00 - 20:00,1.499324,57.716503,40.854842,1.428655
5,Weekday - 20:00 - 0:00,1.030981,96.987952,3.012048,0.0
6,Weekend - 0:00 - 4:00,,,,
7,Weekend - 4:00 - 8:00,1.899761,21.414624,75.135604,3.449772
8,Weekend - 8:00 - 12:00,1.235468,82.335538,17.664462,0.0
9,Weekend - 12:00 - 16:00,1.303743,76.710168,23.289832,0.0


In [17]:
#df['wait_time_imp'] = df['wait_time_imp'] / pd.Timedelta(minutes=1) # makes integer

thresholds = [5, 10, 15, 20, 25, 30]

results = {}

for cluster, group in df.groupby('cluster'):
    results[cluster] = {f"<= {threshold} mins": (group['wait_time_imp'] <= threshold).mean() * 100
                        for threshold in thresholds}

percentage_table_by_cluster = pd.DataFrame(results).transpose()
percentage_table_by_cluster.index.name = 'cluster'
percentage_table_by_cluster.reset_index(inplace=True)
percentage_table_by_cluster


  for cluster, group in df.groupby('cluster'):


Unnamed: 0,cluster,<= 5 mins,<= 10 mins,<= 15 mins,<= 20 mins,<= 25 mins,<= 30 mins
0,Weekday - 0:00 - 4:00,,,,,,
1,Weekday - 4:00 - 8:00,47.864222,76.081046,91.184984,98.044031,99.684238,99.815806
2,Weekday - 8:00 - 12:00,56.9921,81.046855,91.780739,96.725562,98.607007,99.674365
3,Weekday - 12:00 - 16:00,68.262962,89.20368,96.393103,98.842381,99.317614,99.701456
4,Weekday - 16:00 - 20:00,77.129755,94.608737,98.712446,99.576695,99.84714,99.941208
5,Weekday - 20:00 - 0:00,91.480207,99.053356,99.655766,100.0,100.0,100.0
6,Weekend - 0:00 - 4:00,,,,,,
7,Weekend - 4:00 - 8:00,56.389672,85.463224,97.157735,99.197223,99.761337,99.891517
8,Weekend - 8:00 - 12:00,67.107553,89.662374,96.606335,98.973199,99.547511,99.965193
9,Weekend - 12:00 - 16:00,68.005163,89.6888,97.77714,99.670156,99.956977,99.985659


In [18]:
cluster_metrics = df.groupby('cluster').apply(
    lambda group: pd.Series({
        'arrival_rate': len(group) / (group['cluster_hours'].mean() * 60), # needs to be in minutes
        'avg_wait': group['wait_time_imp'].mean()
    })
).reset_index()

cluster_metrics['est_service_rate'] = (
    (cluster_metrics['avg_wait'] * cluster_metrics['arrival_rate'] + 
     np.sqrt((cluster_metrics['avg_wait'] * cluster_metrics['arrival_rate'])**2 + 
             4 * cluster_metrics['avg_wait'] * cluster_metrics['arrival_rate'])) 
    / (2 * cluster_metrics['avg_wait'])
)

cluster_metrics['traffic_intensity'] = cluster_metrics['arrival_rate'] / cluster_metrics['est_service_rate']

qos_results = []
for _, row in cluster_metrics.iterrows():
    cluster = row['cluster']
    arrival_rate = row['arrival_rate']
    est_service_rate = row['est_service_rate']
    
    qos_estimates = {}
    for x in thresholds:
        if est_service_rate > arrival_rate:
            qos = 1 - ((arrival_rate / est_service_rate) * np.exp(-(est_service_rate - arrival_rate) * x))
            qos_estimates[f"<= {x} mins"] = f"{qos:.1%}"
        else:
            qos_estimates[f"<= {x} mins"] = "N/A"
    
    result = {
        "Cluster": cluster,
        "Est Serv Rate (per minute)": round(row['est_service_rate'], 3),
        "Est ρ": round(row['traffic_intensity'], 3),
        **qos_estimates
    }
    qos_results.append(result)

df_qos = pd.DataFrame(qos_results)
df_qos


  cluster_metrics = df.groupby('cluster').apply(
  cluster_metrics = df.groupby('cluster').apply(


Unnamed: 0,Cluster,Est Serv Rate (per minute),Est ρ,<= 5 mins,<= 10 mins,<= 15 mins,<= 20 mins,<= 25 mins,<= 30 mins
0,Weekday - 0:00 - 4:00,,,,,,,,
1,Weekday - 4:00 - 8:00,0.848,0.862,52.1%,73.4%,85.2%,91.8%,95.4%,97.5%
2,Weekday - 8:00 - 12:00,1.197,0.888,54.5%,76.7%,88.1%,93.9%,96.9%,98.4%
3,Weekday - 12:00 - 16:00,1.218,0.864,62.2%,83.5%,92.8%,96.8%,98.6%,99.4%
4,Weekday - 16:00 - 20:00,1.289,0.846,68.7%,88.4%,95.7%,98.4%,99.4%,99.8%
5,Weekday - 20:00 - 0:00,0.214,0.347,82.8%,91.4%,95.7%,97.9%,99.0%,99.5%
6,Weekend - 0:00 - 4:00,,,,,,,,
7,Weekend - 4:00 - 8:00,0.849,0.838,57.8%,78.8%,89.3%,94.6%,97.3%,98.6%
8,Weekend - 8:00 - 12:00,1.048,0.846,62.3%,83.2%,92.5%,96.7%,98.5%,99.3%
9,Weekend - 12:00 - 16:00,1.245,0.864,62.9%,84.1%,93.2%,97.1%,98.7%,99.5%


In [19]:
df_qos.to_csv("data-p3/new_mm1_qos_results.csv",index=False)

In [20]:
df_reg = average_C0
df_reg['avg_arrival_rate'] = df_ar['avg_arrival_rate']
df_reg['est_service_rate'] = df_qos['Est Serv Rate (per minute)']# made seconds so easier to compare w arrival rate
df_reg['arrival_rate_per_server'] = df_reg['avg_arrival_rate'] / df_reg['avg_C0']
df_reg['service_rate_per_server'] = df_reg['est_service_rate'] / df_reg['avg_C0']
df_reg


Unnamed: 0,cluster,avg_C0,avg_arrival_rate,est_service_rate,arrival_rate_per_server,service_rate_per_server
0,Weekday - 0:00 - 4:00,,,,,
1,Weekday - 4:00 - 8:00,1.91413,0.730833,0.848,0.38181,0.443021
2,Weekday - 8:00 - 12:00,1.354882,1.063013,1.197,0.78458,0.883472
3,Weekday - 12:00 - 16:00,1.342838,1.052115,1.218,0.783501,0.907034
4,Weekday - 16:00 - 20:00,1.499324,1.090321,1.289,0.727208,0.859721
5,Weekday - 20:00 - 0:00,1.030981,0.074487,0.214,0.072249,0.207569
6,Weekend - 0:00 - 4:00,,,,,
7,Weekend - 4:00 - 8:00,1.899761,0.711265,0.849,0.374397,0.446898
8,Weekend - 8:00 - 12:00,1.235468,0.886728,1.048,0.717727,0.848261
9,Weekend - 12:00 - 16:00,1.303743,1.07608,1.245,0.825378,0.954943


In [21]:
df_reg.to_csv('data-p3/queuing-data-for-regression.csv',index=False)

In [22]:
df.to_csv('data-p3/joined_all_clustered.csv',index=False)