In [33]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.signal import find_peaks
from scipy.stats import gaussian_kde
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# ----------------------------------------
# CONFIGURATION
# ----------------------------------------
# Google Drive must be mounted before the data files below can be read.
from google.colab import drive
drive.mount('/content/drive')
PEDESTRIAN_FILE = '/content/drive/MyDrive/FRA data/Pedestrian_Coordinates.txt'
PLATFORM_FILE = '/content/drive/MyDrive/FRA data/Platform_Design_Coordinates.txt'
FRAME_RATE = 4  # 4 frames per second

# ----------------------------------------
# LOAD AND CLEAN PEDESTRIAN DATA
# ----------------------------------------
# Load the full pedestrian data: whitespace-separated columns, lines (or line
# tails) starting with '#' are metadata and are skipped.
# `sep=r'\s+'` replaces the deprecated `delim_whitespace=True` (same parsing).
df = pd.read_csv(PEDESTRIAN_FILE, comment='#', sep=r'\s+',
                 names=['id', 'frame', 'x', 'y', 'z'])



def detect_optimal_k_silhouette(X, max_k=10):
    """Pick the number of clusters with the highest silhouette score.

    For every k from 2 up to ``max_k`` the data is clustered with KMeans
    (``random_state=0``) and scored with the silhouette score; the k with the
    highest score wins (the smallest such k on ties).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to cluster.
    max_k : int, default 10
        Largest number of clusters to try.

    Returns
    -------
    int
        The best k, or 1 when no k >= 2 can be evaluated (fewer than three
        samples, ``max_k < 2``, or KMeans never produced two distinct labels).
    """
    n_samples = len(X)
    # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1,
    # and KMeans itself needs n_clusters <= n_samples.
    largest_k = min(max_k, n_samples - 1)
    if largest_k < 2:
        return 1

    best_k, best_score = 1, -np.inf
    for k in range(2, largest_k + 1):
        labels = KMeans(n_clusters=k, random_state=0).fit(X).labels_
        # Heavily duplicated points can collapse into a single cluster, for
        # which the silhouette score would raise.
        if len(np.unique(labels)) < 2:
            continue
        score = silhouette_score(X, labels)
        if score > best_score:  # strict '>' keeps the first (smallest) best k
            best_k, best_score = k, score

    return best_k


def assign_door_ids(df, optimal_k):
    """Cluster rows by their x coordinate and label each cluster as a door.

    KMeans (``random_state=0``) is fitted on ``df[['x']]``. Clusters are then
    renumbered 1..optimal_k in order of increasing cluster-centre x and the
    result is written to ``df['door_id']`` (``df`` is modified in place).

    Returns
    -------
    (DataFrame, ndarray)
        The same ``df`` and the flattened cluster centres. The centres are in
        KMeans' original label order, i.e. NOT sorted to match ``door_id``.
    """
    model = KMeans(n_clusters=optimal_k, random_state=0)
    raw_labels = model.fit_predict(df[['x']])
    centers = model.cluster_centers_.flatten()

    # door_number[label] = 1-based rank of that cluster's centre along x
    door_number = np.empty(len(centers), dtype=np.int64)
    door_number[np.argsort(centers)] = np.arange(1, len(centers) + 1)

    df['door_id'] = door_number[raw_labels]
    return df, centers


def compute_dynamic_measurement_lines(y_values, static_y, approach_type, dynamic_buffer, platform_edge):
    """Return the sorted y positions of the measurement lines (M-lines).

    Parameters
    ----------
    y_values : array-like of float
        y coordinates used to estimate the density (only for "dynamic").
    static_y : float
        Reference y of the platform edge; static lines are offset from it.
    approach_type : str
        "dynamic" -> density valleys within ``dynamic_buffer`` of ``static_y``
        plus three static lines at offsets 1.5, 2.5 and 3.5.
        Anything else -> a single static line at offset 1.5.
    dynamic_buffer : float
        Maximum distance from ``static_y`` for a density valley to be kept.
    platform_edge : str
        "lower" -> offsets are added to ``static_y``; otherwise subtracted.

    Returns
    -------
    list of float
        Measurement-line positions in ascending order.
    """
    # Offsets point from the edge into the platform: +y for the lower edge,
    # -y for the upper edge.
    direction = 1.0 if platform_edge == "lower" else -1.0

    if approach_type != "dynamic":
        # Only one static M-line is used in the static approach.
        return [static_y + direction * 1.5]

    lines = []
    y_arr = np.asarray(y_values, dtype=float)
    # gaussian_kde needs at least two points with non-zero spread; with
    # degenerate input we simply have no valleys and keep the static lines.
    if y_arr.size >= 2 and np.ptp(y_arr) > 0:
        kde = gaussian_kde(y_arr)
        y_grid = np.linspace(y_arr.min(), y_arr.max(), 1000)
        # Local minima of the density are peaks of its negation.
        valleys, _ = find_peaks(-kde(y_grid))
        lines = [float(y) for y in y_grid[valleys] if abs(y - static_y) <= dynamic_buffer]

    lines += [static_y + direction * offset for offset in (1.5, 2.5, 3.5)]
    return sorted(lines)


def _first_crossing(ys, lines):
    """Classify one trajectory by its first measurement-line crossing.

    Uses the lower-edge convention: moving from above a line to on/below it is
    'boarding', moving from on/below a line to above it is 'alighting'.
    Returns None when no line is ever crossed.
    """
    if len(ys) < 2 or lines.size == 0:
        return None
    prev_y = ys[:-1, None]   # shape (steps, 1), broadcast against all lines
    curr_y = ys[1:, None]
    toward_edge = ((prev_y > lines) & (curr_y <= lines)).any(axis=1)
    away_from_edge = ((prev_y <= lines) & (curr_y > lines)).any(axis=1)
    crossed = toward_edge | away_from_edge
    if not crossed.any():
        return None
    # A single step moves in one direction only, so it cannot be both.
    first_step = int(np.argmax(crossed))
    return 'boarding' if toward_edge[first_step] else 'alighting'


def classify_passengers(df, measurement_lines, platform_edge):
    """Label every passenger as 'boarding', 'alighting' or 'waiting'.

    Each passenger's y positions are ordered by frame and scanned for the
    first step that crosses any measurement line; the direction of that first
    crossing decides the category. Passengers that never cross are 'waiting'.

    Parameters
    ----------
    df : DataFrame
        Must contain 'id', 'frame' and 'y'. A 'category' column is added in
        place.
    measurement_lines : sequence of float
        y positions of the measurement lines.
    platform_edge : str
        "lower": crossing toward smaller y is boarding.
        Otherwise (upper edge): crossing toward larger y is boarding.

    Returns
    -------
    DataFrame
        The same ``df`` with the 'category' column set.
    """
    # Mirroring y turns the upper-edge rules (prev < m <= curr is boarding,
    # curr < m <= prev is alighting) into exactly the lower-edge rules, so a
    # single implementation serves both edges.
    sign = 1.0 if platform_edge == "lower" else -1.0
    lines = sign * np.asarray(list(measurement_lines), dtype=float)

    category_by_id = {}
    # groupby splits the frame once instead of re-filtering it per passenger.
    for pid, track in df.groupby('id', sort=False):
        ys = sign * track.sort_values('frame')['y'].to_numpy(dtype=float)
        category_by_id[pid] = _first_crossing(ys, lines) or 'waiting'

    # Rows whose id is missing from the mapping (e.g. NaN ids) are 'waiting'.
    df['category'] = df['id'].map(category_by_id).fillna('waiting')
    return df


def summarize_per_door(df, train_id, start_frame, frame_rate=None):
    """Build one summary row per door for a single train window.

    Parameters
    ----------
    df : DataFrame
        Rows of one train window with 'id', 'frame', 'door_id' and 'category'.
    train_id : hashable
        Identifier copied into every output row.
    start_frame : number
        First frame of the window; converted to an HH:MM:SS arrival time.
    frame_rate : number, optional
        Frames per second. Defaults to the module-level ``FRAME_RATE``.

    Returns
    -------
    DataFrame
        Columns: train_id, door_id, no_of_boarding, no_of_alighting,
        dwell_time_seconds, train_arrival_time. ``dwell_time_seconds`` is the
        span between the first and last frame of any boarding or alighting
        passenger at that door, or NaN if the door has neither.
    """
    if frame_rate is None:
        frame_rate = FRAME_RATE

    arrival_time_seconds = start_frame / frame_rate
    hours, remainder = divmod(arrival_time_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    arrival_time_str = f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"

    summary = []
    for door in sorted(df['door_id'].unique()):
        df_door = df[df['door_id'] == door]

        boarding_ids = set(df_door.loc[df_door['category'] == 'boarding', 'id'])
        alighting_ids = set(df_door.loc[df_door['category'] == 'alighting', 'id'])

        # Dwell time spans all frames of boarding and alighting passengers
        # together, so the two groups can be pooled before taking min/max.
        active_frames = df_door.loc[df_door['id'].isin(boarding_ids | alighting_ids), 'frame']
        if active_frames.empty:
            dwell = np.nan
        else:
            dwell = (active_frames.max() - active_frames.min()) / frame_rate

        summary.append({
            'train_id': train_id,
            'door_id': door,
            'no_of_boarding': len(boarding_ids),
            'no_of_alighting': len(alighting_ids),
            'dwell_time_seconds': dwell,
            'train_arrival_time': arrival_time_str
        })

    return pd.DataFrame(summary)


def process_all_trains(df_all, train_df, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer):
    """Run door detection, passenger classification and summarising per train.

    Parameters
    ----------
    df_all : DataFrame
        Trajectory rows with 'id', 'frame', 'x' and 'y'.
    train_df : DataFrame
        One row per train with 'train_id', 'start_frame' and 'end_frame'
        (``end_frame`` is exclusive). If a train_id occurs more than once,
        only its first row is used.
    edge_threshold_y : float
        Rows beyond this y (below it for the "lower" edge, above it
        otherwise) are used to detect the number of doors.
    static_y, approach_type, dynamic_buffer
        Passed through to ``compute_dynamic_measurement_lines``.
    platform_edge : str
        "lower" or anything else for the upper edge.

    Returns
    -------
    DataFrame
        Concatenated per-door summaries of all trains; an empty frame with
        the summary columns when ``train_df`` contains no trains.
    """
    summary_columns = ['train_id', 'door_id', 'no_of_boarding', 'no_of_alighting',
                       'dwell_time_seconds', 'train_arrival_time']
    # generate_train_df returns a column-less frame when no window qualifies;
    # without this guard that surfaces as a KeyError / "No objects to concatenate".
    if train_df.empty or 'train_id' not in train_df.columns:
        return pd.DataFrame(columns=summary_columns)

    all_summaries = []
    trains = train_df.drop_duplicates('train_id', keep='first')
    for train in trains.itertuples(index=False):
        start_frame, end_frame = train.start_frame, train.end_frame
        df = df_all[(df_all['frame'] >= start_frame) & (df_all['frame'] < end_frame)].copy()

        # Detect doors: the number of clusters is estimated from rows near the
        # platform edge only, then every row of the window gets a door id.
        if platform_edge == "lower":
            edge_df = df[df['y'] < edge_threshold_y]
        else:
            edge_df = df[df['y'] > edge_threshold_y]
        optimal_k = detect_optimal_k_silhouette(edge_df[['x']].to_numpy())
        df, _ = assign_door_ids(df, optimal_k)

        # Classify
        measurement_lines = compute_dynamic_measurement_lines(
            df['y'].values, static_y, approach_type, dynamic_buffer, platform_edge)
        df = classify_passengers(df, measurement_lines, platform_edge)

        # Summarize
        all_summaries.append(summarize_per_door(df, train.train_id, start_frame))

    return pd.concat(all_summaries, ignore_index=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
import numpy as np

def _fit_dwell_regression(features, target):
    """Fit a linear regression on a 70/30 split and score it on the test part.

    Returns (model, mae, mape, rmse, equation_string).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42)
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Safe MAPE: ignore observations whose true value is 0.
    y_true = y_test.to_numpy()
    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
    else:
        mape = np.nan

    equation = f"Dwell Time = {model.intercept_:.2f} + " + " + ".join(
        f"{coef:.2f} * {name}" for coef, name in zip(model.coef_, features.columns))
    return model, mae, mape, rmse, equation


def evaluate_model(platform_name, summary_df, approach_type, approach_details):
    """Fit and score two linear dwell-time models on a per-door summary.

    Rows with missing or zero ``dwell_time_seconds`` are dropped. Two models
    are fitted on the same 70/30 split (``random_state=42``):
    boarding + alighting, and boarding + alighting + their product.

    Parameters
    ----------
    platform_name : str
        Stored under 'Platform edge' in each result.
    summary_df : DataFrame
        Needs 'train_id', 'no_of_boarding', 'no_of_alighting' and
        'dwell_time_seconds'. It is not modified.
    approach_type, approach_details : str
        Copied into each result row.

    Returns
    -------
    list of dict
        Two result records (2-feature model first, then 3-feature model).

    Raises
    ------
    ValueError
        If fewer than two usable rows remain, so no train/test split exists.
    """
    # .copy() so that adding the interaction column below writes to our own
    # frame instead of a slice of the caller's data.
    df_for_modelling = summary_df[
            (summary_df['dwell_time_seconds'].notna()) &
            (summary_df['dwell_time_seconds'] != 0)
        ].copy()
    if len(df_for_modelling) < 2:
        raise ValueError(
            f"evaluate_model needs at least 2 rows with a non-zero dwell time, "
            f"got {len(df_for_modelling)}")

    df_for_modelling['interaction'] = (
        df_for_modelling['no_of_boarding'] * df_for_modelling['no_of_alighting'])
    y = df_for_modelling['dwell_time_seconds']

    feature_sets = [
        ('2 Features - no_of_boarding, no_of_alighting',
         ['no_of_boarding', 'no_of_alighting']),
        ('3 Features - no_of_boarding, no_of_alighting, no_of_alighting * no_of_boarding',
         ['no_of_boarding', 'no_of_alighting', 'interaction']),
    ]

    results = []
    for label, columns in feature_sets:
        model, mae, mape, rmse, equation = _fit_dwell_regression(df_for_modelling[columns], y)
        has_interaction = 'interaction' in columns
        results.append({
            'Platform edge': platform_name,
            "Approach_details": approach_details,
            'X features': label,
            'approach_type': approach_type,
            'Model': 'Linear regression',
            'MAE': mae,
            'MAPE': mape,
            'RMSE': rmse,
            'Regression Equation': equation,
            'train_instances_total': df_for_modelling['train_id'].nunique(),
            'observations_total': df_for_modelling.shape[0],
            'Intercept': round(model.intercept_, 2),
            'Coef_no_of_boarding': round(model.coef_[0], 2),
            'Coef_no_of_alighting': round(model.coef_[1], 2),
            'Coef_interaction': round(model.coef_[2], 2) if has_interaction else "NA",
        })

    return results



def generate_train_df(df_platform, edge_threshold_y, platform_edge, passenger_threshold, window_size_frames=240):
    """Detect train instances as time windows with enough passengers at the edge.

    The frame range of ``df_platform`` is cut into consecutive windows of
    ``window_size_frames`` frames. A window counts as a train when the number
    of distinct passenger ids beyond ``edge_threshold_y`` reaches
    ``passenger_threshold``.

    Parameters
    ----------
    df_platform : DataFrame
        Rows with 'id', 'frame' and 'y'.
    edge_threshold_y : float
        y limit of the edge zone.
    platform_edge : str
        "lower": edge zone is ``y < edge_threshold_y``; otherwise
        ``y > edge_threshold_y``.
    passenger_threshold : int
        Minimum number of distinct ids in the edge zone.
    window_size_frames : int, default 240
        Window length in frames.

    Returns
    -------
    DataFrame
        Columns 'train_id' (1, 2, ...), 'start_frame', 'end_frame' (exclusive);
        a column-less empty frame when no window qualifies.
    """
    train_instances = []
    if df_platform.empty:
        # min()/max() would be NaN and range() would raise.
        return pd.DataFrame(train_instances)

    min_frame = int(df_platform['frame'].min())
    max_frame = int(df_platform['frame'].max())

    # The edge filter does not depend on the window, so apply it once.
    if platform_edge == "lower":
        edge_df = df_platform[df_platform['y'] < edge_threshold_y]
    else:
        edge_df = df_platform[df_platform['y'] > edge_threshold_y]

    # max_frame + 1: the last frame must fall inside a window even when the
    # frame span is an exact multiple of the window size.
    for start in range(min_frame, max_frame + 1, window_size_frames):
        end = start + window_size_frames
        in_window = (edge_df['frame'] >= start) & (edge_df['frame'] < end)
        num_passengers = edge_df.loc[in_window, 'id'].nunique()

        if num_passengers >= passenger_threshold:
            train_instances.append({
                'train_id': len(train_instances) + 1,
                'start_frame': start,
                'end_frame': end
            })

    return pd.DataFrame(train_instances)


# Platform parameters
platform_y_min = -3.77
y_divide = 4.475
df_time = df.copy()

# -----------------------------
# Identify passengers moving toward the platform edge
# -----------------------------
# First and last recorded y of every passenger, taken in file order (rows are
# not re-sorted by frame here).
# NOTE(review): "toward the edge" is measured against platform_y_min only, so
# passengers that start in the upper area are kept only if they end closer to
# the lower edge - confirm this is intended for the upper-edge analysis.
tracked_rows = df_time.dropna(subset=['id'])
first_y = tracked_rows.drop_duplicates('id', keep='first').set_index('id')['y'].sort_index()
last_y = tracked_rows.drop_duplicates('id', keep='last').set_index('id')['y'].sort_index()

ends_closer_to_edge = (last_y - platform_y_min).abs() < (first_y - platform_y_min).abs()
filtered_ids = ends_closer_to_edge.index[ends_closer_to_edge].tolist()

# Platform area is decided by where the movement starts
start_area = np.where(first_y.loc[filtered_ids] < y_divide, 'lower', 'upper')
platform_area_map = dict(zip(filtered_ids, start_area))

# Keep only those passengers and attach their platform area
df_clean = df_time[df_time['id'].isin(filtered_ids)].copy()
df_clean['platform_area'] = df_clean['id'].map(platform_area_map)
df_lower = df_clean[df_clean['platform_area'] == 'lower'].copy()




In [35]:
# -----------------------------
# final_results Dataframe
# -----------------------------
import time
start_time = time.time()
final_results = []
summary_lower = pd.DataFrame()

# ---- LOWER ----
df_lower = df_clean[df_clean['platform_area'] == 'lower'].copy()
# Set passenger_threshold: 5 (01:00–05:30 & 19:30–23:00), 15 (05:30–19:30) based on time of day

# | Time Interval | `passenger_threshold` |
# | ------------- | --------------------- |
# | 01:00 – 05:30 | 5                     |
# | 05:30 – 19:30 | 15                    |
# | 19:30 – 23:00 | 5                     |

# Frame thresholds (frame index = seconds since 00:00 * FRAME_RATE)
frame_530 = FRAME_RATE * (5 * 3600 + 30 * 60)      # 5:30 AM
frame_1930 = FRAME_RATE * (19 * 3600 + 30 * 60)    # 7:30 PM

# Slice the dataframe
df_early = df_lower[df_lower['frame'] < frame_530]
df_day = df_lower[(df_lower['frame'] >= frame_530) & (df_lower['frame'] < frame_1930)]
df_late = df_lower[df_lower['frame'] >= frame_1930]

# Settings shared by both approaches for the lower platform edge
edge_threshold_y = -2.27
platform_edge = "lower"
static_y = -3.77
dynamic_buffer = 4

# Generate train instances for each time period
train_early = generate_train_df(df_early, edge_threshold_y, platform_edge, passenger_threshold=5)
train_day = generate_train_df(df_day, edge_threshold_y, platform_edge, passenger_threshold=15)
train_late = generate_train_df(df_late, edge_threshold_y, platform_edge, passenger_threshold=5)

# Adjust train IDs to be unique: every period numbers its trains from 1, so
# shift each period by the largest ID used so far. A period without trains is
# skipped so it neither raises nor turns the following IDs into NaN.
train_id_offset = 0
for period_trains in (train_early, train_day, train_late):
    if not period_trains.empty:
        period_trains['train_id'] += train_id_offset
        train_id_offset = period_trains['train_id'].max()

train_df_lower = pd.concat([train_early, train_day, train_late], ignore_index=True)

# Dynamic approach
approach_type = "dynamic"
approach_details = "Entire Dataset with dynamic M line approach for lower platform edge"
summary_lower_dynamic = process_all_trains(df_lower, train_df_lower, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model(platform_edge, summary_lower_dynamic, approach_type, approach_details)

# static M line approach
approach_type = "static"
approach_details = "Entire Dataset with static M line for lower platform edge"
summary_lower_static = process_all_trains(df_lower, train_df_lower, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model(platform_edge, summary_lower_static, approach_type, approach_details)

In [36]:
# ---- UPPER ----
df_upper = df_clean[df_clean['platform_area'] == 'upper'].copy()
summary_upper = pd.DataFrame()
# Set passenger_threshold: 5 (01:00–05:30 & 19:30–23:00), 15 (05:30–19:30) based on time of day

# | Time Interval | `passenger_threshold` |
# | ------------- | --------------------- |
# | 01:00 – 05:30 | 5                     |
# | 05:30 – 19:30 | 15                    |
# | 19:30 – 23:00 | 5                     |

# Frame thresholds (frame index = seconds since 00:00 * FRAME_RATE)
frame_530 = FRAME_RATE * (5 * 3600 + 30 * 60)      # 5:30 AM
frame_1930 = FRAME_RATE * (19 * 3600 + 30 * 60)    # 7:30 PM

# Slice the dataframe
df_early = df_upper[df_upper['frame'] < frame_530]
df_day = df_upper[(df_upper['frame'] >= frame_530) & (df_upper['frame'] < frame_1930)]
df_late = df_upper[df_upper['frame'] >= frame_1930]

# Settings shared by both approaches for the upper platform edge
edge_threshold_y = 11.22
platform_edge = "upper"
static_y = 12.72
dynamic_buffer = 3

# Generate train instances for each time period
# NOTE(review): the daytime threshold used here is 20, while the table above
# (and the lower-edge cell) say 15 - confirm which value is intended.
train_early = generate_train_df(df_early, edge_threshold_y, platform_edge, passenger_threshold=5)
train_day = generate_train_df(df_day, edge_threshold_y, platform_edge, passenger_threshold=20)
train_late = generate_train_df(df_late, edge_threshold_y, platform_edge, passenger_threshold=5)

# Adjust train IDs to be unique: every period numbers its trains from 1, so
# shift each period by the largest ID used so far. A period without trains is
# skipped so it neither raises nor turns the following IDs into NaN.
train_id_offset = 0
for period_trains in (train_early, train_day, train_late):
    if not period_trains.empty:
        period_trains['train_id'] += train_id_offset
        train_id_offset = period_trains['train_id'].max()

train_df_upper = pd.concat([train_early, train_day, train_late], ignore_index=True)

# Dynamic approach
approach_type = "dynamic"
approach_details = "Entire Dataset with dynamic M line approach for Upper platform edge"
summary_upper_dynamic = process_all_trains(df_upper, train_df_upper, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model(platform_edge, summary_upper_dynamic, approach_type, approach_details)

# static approach
approach_type = "static"
approach_details = "Entire Dataset with static M line for Upper platform edge"
summary_upper_static = process_all_trains(df_upper, train_df_upper, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model(platform_edge, summary_upper_static, approach_type, approach_details)


In [37]:
# for entire dataset - combining all trains, all door position from both the platform edges
# The upper-edge train IDs are shifted past the largest lower-edge ID so they
# stay unique after concatenation. The shift is applied to a copy (`assign`),
# so re-running this cell does not keep adding the offset to summary_upper_*.

platform_edge = "both"
approach_type = "dynamic"
approach_details = "Entire Dataset with dynamic M line approach for both platform edges"
# Find the maximum train_id in the first DataFrame
max_train_id = summary_lower_dynamic['train_id'].max()
# Offset the train_id of the second DataFrame to make them unique
upper_dynamic_shifted = summary_upper_dynamic.assign(
    train_id=summary_upper_dynamic['train_id'] + max_train_id)
# Now concatenate (despite its name, df_lower_combined holds BOTH edges, dynamic approach)
df_lower_combined = pd.concat([summary_lower_dynamic, upper_dynamic_shifted], ignore_index=True)
final_results += evaluate_model(platform_edge, df_lower_combined, approach_type, approach_details)

platform_edge = "both"
approach_type = "static"
approach_details = "Entire Dataset with static M line for both platform edges"
max_train_id = summary_lower_static['train_id'].max()
upper_static_shifted = summary_upper_static.assign(
    train_id=summary_upper_static['train_id'] + max_train_id)
# (despite its name, df_upper_combined holds BOTH edges, static approach)
df_upper_combined = pd.concat([summary_lower_static, upper_static_shifted], ignore_index=True)
final_results += evaluate_model(platform_edge, df_upper_combined, approach_type, approach_details)


In [38]:
# -----------------------------
# Helper: Convert hour, minute to frame
# -----------------------------
def time_to_frame(hour, minute, frame_rate=None):
    """Convert a time of day to a frame index counted from 00:00.

    Parameters
    ----------
    hour, minute : number
        Time of day.
    frame_rate : number, optional
        Frames per second. Defaults to the module-level ``FRAME_RATE`` instead
        of a hard-coded 4, so the helper follows the notebook configuration.

    Returns
    -------
    number
        ``frame_rate * (hour * 3600 + minute * 60)``.
    """
    if frame_rate is None:
        frame_rate = FRAME_RATE
    return frame_rate * (hour * 3600 + minute * 60)

# -----------------------------
# Filter: Only moving passengers
# -----------------------------
def filter_moving_passengers(df, platform_y):
    """Keep only passengers whose y position changed by more than 0.5.

    The change is measured between each passenger's first and last row in the
    order the rows appear in ``df``. ``platform_y`` is accepted for interface
    compatibility but is not used.

    Returns a copy of the rows belonging to the retained passenger ids.
    """
    rows_with_id = df.dropna(subset=['id'])
    start_y = rows_with_id.drop_duplicates('id', keep='first').set_index('id')['y']
    end_y = rows_with_id.drop_duplicates('id', keep='last').set_index('id')['y']

    # Series subtraction aligns on the passenger id
    displacement = (start_y - end_y).abs()
    moving_ids = displacement.index[displacement > 0.5]  # minimal movement threshold

    return df[df['id'].isin(moving_ids)].copy()

# -----------------------------
# Time Frame Definitions
# -----------------------------
# Two sub-windows are analysed separately: a low-crowd window and a high-crowd window.
frame_low_start = time_to_frame(1, 0)     # 01:00
frame_low_end = time_to_frame(4, 30)      # 04:30
frame_high_start = time_to_frame(6, 30)   # 06:30
frame_high_end = time_to_frame(10, 0)     # 10:00

# -----------------------------
# Initial Platform Preprocessing
# -----------------------------
# NOTE: this cell rebinds df_time, df_clean and df_lower, which earlier cells
# also define; re-running those earlier cells afterwards will see these values.
platform_y_min = -3.77
platform_y_max = 14.75  # NOTE(review): not used in this cell - confirm it is still needed
y_divide = 4.475

# Filter base time window (1 AM to 10 AM)
df_time = df[(df['frame'] >= frame_low_start) & (df['frame'] < frame_high_end)]

# Filter for moving passengers (the second argument is ignored by filter_moving_passengers)
df_clean = filter_moving_passengers(df_time, platform_y_min)

# Add platform area column (assigned per row from that row's y, not per passenger)
df_clean['platform_area'] = np.where(df_clean['y'] > y_divide, 'upper', 'lower')

# -----------------------------
# Platform Settings (Lower edge)
# -----------------------------
df_lower = df_clean[df_clean['platform_area'] == 'lower'].copy()
edge_threshold_y = -2.27          # updated threshold
static_y = -3.77
dynamic_buffer = 4
platform_edge = "lower"

# -----------------------------
# LOW CROWD WINDOW
# -----------------------------
df_low = df_lower[(df_lower['frame'] >= frame_low_start) & (df_lower['frame'] < frame_low_end)]
train_df_low = generate_train_df(df_low, edge_threshold_y, platform_edge, passenger_threshold=5)
print("train_df_low", train_df_low.shape)

# Approach 1: Static + Dynamic
# ("dynamic" combines density valleys with the static offset lines, hence the wording below)
approach_type = "dynamic"
approach_details = "Dataset for time period - 1 AM to 4.30 AM for low crowd with static and dynamic M line for lower platform edge"
summary_low = process_all_trains(df_low, train_df_low, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model(platform_edge, summary_low, approach_type, approach_details)

# Approach 2: Static only
# summary_low is overwritten here; the dynamic summary above is kept only inside final_results.
approach_type = "static"
approach_details = "Dataset for time period - 1 AM to 4.30 AM for low crowd with static M line for lower platform edge"
summary_low = process_all_trains(df_low, train_df_low, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model(platform_edge, summary_low, approach_type, approach_details)

# -----------------------------
# HIGH CROWD WINDOW
# -----------------------------
df_high = df_lower[(df_lower['frame'] >= frame_high_start) & (df_lower['frame'] < frame_high_end)]
train_df_high = generate_train_df(df_high, edge_threshold_y, platform_edge, passenger_threshold=15)
print("train_df_high", train_df_high.shape)

# Approach 3: Static + Dynamic
approach_type = "dynamic"
approach_details = "Dataset for time period - 6.30 AM to 10 AM for high crowd with static and dynamic M line for lower platform edge"
summary_high = process_all_trains(df_high, train_df_high, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model(platform_edge, summary_high, approach_type, approach_details)

# Approach 4: Static only
# summary_high is overwritten here as well.
approach_type = "static"
approach_details = "Dataset for time period - 6.30 AM to 10 AM for high crowd with static M line for lower platform edge"
summary_high = process_all_trains(df_high, train_df_high, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model(platform_edge, summary_high, approach_type, approach_details)


train_df_low (11, 3)
train_df_high (38, 3)


In [39]:
# -----------------------------
# Helper: Convert hour, minute to frame
# -----------------------------
def time_to_frame(hour, minute, frame_rate=None):
    """Convert a time of day to a frame index counted from 00:00.

    Parameters
    ----------
    hour, minute : number
        Time of day.
    frame_rate : number, optional
        Frames per second. Defaults to the module-level ``FRAME_RATE`` instead
        of a hard-coded 4, so the helper follows the notebook configuration.

    Returns
    -------
    number
        ``frame_rate * (hour * 3600 + minute * 60)``.
    """
    if frame_rate is None:
        frame_rate = FRAME_RATE
    return frame_rate * (hour * 3600 + minute * 60)

# -----------------------------
# Filter: Only moving passengers
# -----------------------------
def filter_moving_passengers(df, platform_y):
    """Keep only passengers whose y position changed by more than 0.5.

    The change is measured between each passenger's first and last row in the
    order the rows appear in ``df``. ``platform_y`` is accepted for interface
    compatibility but is not used.

    Returns a copy of the rows belonging to the retained passenger ids.
    """
    rows_with_id = df.dropna(subset=['id'])
    start_y = rows_with_id.drop_duplicates('id', keep='first').set_index('id')['y']
    end_y = rows_with_id.drop_duplicates('id', keep='last').set_index('id')['y']

    # Series subtraction aligns on the passenger id
    displacement = (start_y - end_y).abs()
    moving_ids = displacement.index[displacement > 0.5]  # minimal movement threshold

    return df[df['id'].isin(moving_ids)].copy()

# -----------------------------
# Time Frame Definitions
# -----------------------------
# Same low-crowd / high-crowd sub-windows as in the lower-edge cell.
frame_low_start = time_to_frame(1, 0)     # 01:00
frame_low_end = time_to_frame(4, 30)      # 04:30
frame_high_start = time_to_frame(6, 30)   # 06:30
frame_high_end = time_to_frame(10, 0)     # 10:00

# -----------------------------
# Initial Platform Preprocessing
# -----------------------------
# NOTE: this cell rebinds df_time, df_clean and df_upper, which earlier cells also define.
# NOTE(review): platform_y_max is 12.72 here but 14.75 in the lower-edge cell - confirm.
platform_y_max = 12.72
y_divide = 4.475

# Filter base time window (1 AM to 10 AM)
df_time = df[(df['frame'] >= frame_low_start) & (df['frame'] < frame_high_end)]

# Filter for moving passengers (the second argument is ignored by filter_moving_passengers)
df_clean = filter_moving_passengers(df_time, platform_y_max)

# Add platform area column (assigned per row from that row's y, not per passenger)
df_clean['platform_area'] = np.where(df_clean['y'] > y_divide, 'upper', 'lower')

# -----------------------------
# Platform Settings (Upper edge)
# -----------------------------
df_upper = df_clean[df_clean['platform_area'] == 'upper'].copy()
edge_threshold_y = 11.22          # updated threshold for upper edge
static_y = 12.72                  # max platform y
# NOTE(review): the full-day upper-edge cell uses dynamic_buffer = 3 - confirm 4 is intended here.
dynamic_buffer = 4
platform_edge = "upper"

# -----------------------------
# LOW CROWD WINDOW
# -----------------------------
df_low = df_upper[(df_upper['frame'] >= frame_low_start) & (df_upper['frame'] < frame_low_end)]
train_df_low = generate_train_df(df_low, edge_threshold_y, platform_edge, passenger_threshold=5)

# Approach 1: Static + Dynamic
# ("dynamic" combines density valleys with the static offset lines, hence the wording below)
approach_type = "dynamic"
approach_details = "Dataset for time period - 1 AM to 4.30 AM for low crowd with static and dynamic M line for upper platform edge"
summary_low = process_all_trains(df_low, train_df_low, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model( platform_edge, summary_low, approach_type, approach_details)

# Approach 2: Static only
# summary_low is overwritten here; the dynamic summary above is kept only inside final_results.
approach_type = "static"
approach_details = "Dataset for time period - 1 AM to 4.30 AM for low crowd with static M line for upper platform edge"
summary_low = process_all_trains(df_low, train_df_low, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model(platform_edge, summary_low, approach_type, approach_details)

# -----------------------------
# HIGH CROWD WINDOW
# -----------------------------
df_high = df_upper[(df_upper['frame'] >= frame_high_start) & (df_upper['frame'] < frame_high_end)]
train_df_high = generate_train_df(df_high, edge_threshold_y, platform_edge, passenger_threshold=15)

# Approach 3: Static + Dynamic
approach_type = "dynamic"
approach_details = "Dataset for time period - 6.30 AM to 10 AM for high crowd with static and dynamic M line for upper platform edge"
summary_high = process_all_trains(df_high, train_df_high, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model(platform_edge, summary_high, approach_type, approach_details)

# Approach 4: Static only
# summary_high is overwritten here as well.
approach_type = "static"
approach_details = "Dataset for time period - 6.30 AM to 10 AM for high crowd with static M line for upper platform edge"
summary_high = process_all_trains(df_high, train_df_high, edge_threshold_y, static_y, platform_edge, approach_type, dynamic_buffer)
final_results += evaluate_model( platform_edge, summary_high, approach_type, approach_details)


In [40]:

# Final DataFrame: one row per evaluated model, in the order the results were appended
results_df = pd.DataFrame(final_results)
# Save to the current working directory. The default row index is written as
# an unnamed first column, so read it back with `index_col=0`.
results_df.to_csv("final_results.csv")

In [41]:
# Preview the first rows of the results table
results_df.head()


Unnamed: 0,Platform edge,Approach_details,X features,approach_type,Model,MAE,MAPE,RMSE,Regression Equation,train_instances_total,observations_total,Intercept,Coef_no_of_boarding,Coef_no_of_alighting,Coef_interaction
0,lower,Entire Dataset with dynamic M line approach fo...,"2 Features - no_of_boarding, no_of_alighting",dynamic,Linear regression,12.614724,79.51483,14.866362,Dwell Time = 25.78 + 1.74 * no_of_boarding + 5...,123,1006,25.78,1.74,5.71,
1,lower,Entire Dataset with dynamic M line approach fo...,"3 Features - no_of_boarding, no_of_alighting, ...",dynamic,Linear regression,12.222111,74.885247,14.512221,Dwell Time = 22.44 + 2.25 * no_of_boarding + 1...,123,1006,22.44,2.25,10.7,-0.72
2,lower,Entire Dataset with static M line for lower pl...,"2 Features - no_of_boarding, no_of_alighting",static,Linear regression,12.354049,98.780885,14.545063,Dwell Time = 16.16 + 3.10 * no_of_boarding + 8...,123,958,16.16,3.1,8.23,
3,lower,Entire Dataset with static M line for lower pl...,"3 Features - no_of_boarding, no_of_alighting, ...",static,Linear regression,12.349307,98.647191,14.542893,Dwell Time = 16.09 + 3.12 * no_of_boarding + 9...,123,958,16.09,3.12,9.67,-0.41
4,upper,Entire Dataset with dynamic M line approach fo...,"2 Features - no_of_boarding, no_of_alighting",dynamic,Linear regression,13.25399,82.68763,15.79563,Dwell Time = 32.51 + 2.96 * no_of_boarding + 0...,149,805,32.51,2.96,0.34,


In [42]:
## Identifying Train Instances Using Passenger Threshold
# Since the passenger count on the platform varies throughout the day, we use a `passenger_threshold` to identify valid train instances.
# Note: this notebook may take up to 15-20 minutes to run.
