In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory that holds the input CSV files. It defaults to the original,
# machine-specific location; set the PEDESTRIAN_DATA_DIR environment variable
# to run the notebook on another machine without editing this cell.
data_dir = os.environ.get(
    'PEDESTRIAN_DATA_DIR',
    '/Users/anzhunie/Desktop/Pedestrian_Training/Data',
)

# Path of the CSV holding the cluster assignment (read in the next cell).
clsuter_path = os.path.join(data_dir, 'clustered.csv')

In [3]:
# Read the cluster assignments and discard every row in which the trajectory
# key (ID, Trajectory) or the Cluster value is missing.
df_clsuter = (
    pd.read_csv(clsuter_path)
    .dropna(subset=['ID', 'Trajectory', 'Cluster'])
)

In [4]:
# Sanity check: count the distinct Cluster values seen for every
# (ID, Trajectory) pair and list the pairs that carry more than one.
grouped_cluster = (
    df_clsuter
    .groupby(['ID', 'Trajectory'])['Cluster']
    .nunique()
    .reset_index()
)
non_unique_cluster = grouped_cluster.loc[grouped_cluster['Cluster'].gt(1)]
print(non_unique_cluster)

Empty DataFrame
Columns: [ID, Trajectory, Cluster]
Index: []


In [5]:
# Collapse the table to one row per (ID, Trajectory), keeping the first
# Cluster value found for that pair.
unique_clusters = (
    df_clsuter
    .groupby(['ID', 'Trajectory'])['Cluster']
    .first()
    .reset_index()
)
unique_clusters

Unnamed: 0,ID,Trajectory,Cluster
0,40.0,1.0,1
1,40.0,2.0,1
2,40.0,3.0,2
3,40.0,4.0,2
4,40.0,5.0,2
...,...,...,...
1027,283.0,17.0,1
1028,287.0,1.0,1
1029,287.0,2.0,1
1030,287.0,3.0,1


In [6]:
# Paths of the three experiment recordings (relative to the notebook).
exp1, exp2, exp3 = (
    '../Data/Experiment 1.csv',
    '../Data/Experiment 2.csv',
    '../Data/Experiment 3.csv',
)

# Load each recording; `dfs` keeps them in experiment order (1, 2, 3).
dfs = [pd.read_csv(csv_path) for csv_path in (exp1, exp2, exp3)]
df_exp1, df_exp2, df_exp3 = dfs

In [7]:
# Lookup table with one Cluster value per (ID, Trajectory). The merge keys are
# coerced to numbers once, here, instead of on every pass of the loop below
# (the conversion does not depend on the experiment being processed).
cluster_map = (
    unique_clusters
    .dropna(subset=['ID', 'Trajectory', 'Cluster'])
    .groupby(['ID', 'Trajectory'])['Cluster']
    .first()
    .reset_index()
    .assign(
        ID=lambda frame: pd.to_numeric(frame['ID'], errors='coerce'),
        Trajectory=lambda frame: pd.to_numeric(frame['Trajectory'], errors='coerce'),
    )
)

for i, df in enumerate(dfs):
    # Experiments are numbered from 1 in the order they appear in `dfs`.
    exp_num = i + 1

    df = (
        df
        # dfs[i] is overwritten at the end of each pass, so on a re-run of
        # this cell it already contains 'Cluster'. Dropping it first keeps the
        # merge from producing Cluster_x / Cluster_y (no-op on the first run).
        .drop(columns=['Cluster'], errors='ignore')
        # .assign returns a new frame, so the frame loaded in the previous
        # cell is not modified in place. Non-numeric keys become NaN.
        .assign(
            ID=lambda frame: pd.to_numeric(frame['ID'], errors='coerce'),
            Trajectory=lambda frame: pd.to_numeric(frame['Trajectory'], errors='coerce'),
        )
        # Left merge: every sample is kept; samples whose (ID, Trajectory)
        # has no entry in cluster_map get Cluster = NaN.
        .merge(cluster_map, on=['ID', 'Trajectory'], how='left')
    )

    # Sample-to-sample differences, computed separately inside each
    # (ID, Trajectory) group; the first sample of every group is NaN.
    df['Speed Change'] = df.groupby(['ID', 'Trajectory'])['Speed'].diff()
    df['Direction Change'] = df.groupby(['ID', 'Trajectory'])['Direction'].diff()

    df['exp_num'] = exp_num

    dfs[i] = df

# Stack the three experiments into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

In [8]:
# Blank out every column except 'Time' on the rows that received no Cluster.
rows_without_cluster = combined_df['Cluster'].isna()
columns_except_time = combined_df.columns.difference(['Time'])
combined_df.loc[rows_without_cluster, columns_except_time] = np.nan

In [9]:
def label_crowd_radius(df, crowd_radius=2.58):
    """Label samples relative to the crowd radius and mask everything after first entry.

    The frame is modified in place and the same object is returned.

    Steps:

    1. Every object-dtype column is coerced with ``pd.to_numeric``; values
       that cannot be parsed become NaN.
    2. A ``'Crowd_Radius_Label'`` column is (re)created. For every
       (ID, Trajectory) group with non-NaN keys each sample is labelled
       ``'Outside'`` (Distance > crowd_radius), ``'Inside'``
       (Distance <= crowd_radius) or ``'Inside-Out'`` (an inside sample that
       occurs no later than the last outside sample of the trajectory).
       Samples with NaN Distance keep a NaN label.
    3. Masking, per trajectory, of all columns except ``'Time'``:
       * first sample not ``'Outside'``: the whole trajectory is set to NaN;
       * first sample ``'Outside'``: the first sample that is not
         ``'Outside'`` and every later sample of the trajectory are set to NaN;
       * all samples ``'Outside'``: nothing is masked.

    NOTE(review): because step 3 also masks the label column, the only labels
    that remain in the returned frame are ``'Outside'`` and NaN; the
    ``'Inside'`` / ``'Inside-Out'`` labels are intermediate.

    Sample order inside a trajectory is the row order of ``df``. Rows are
    written back through their index labels, so ``df`` is assumed to have a
    unique index (true for a frame built with ``concat(ignore_index=True)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ID', 'Trajectory' and 'Distance'.
    crowd_radius : float, default 2.58
        Distance threshold; ``Distance <= crowd_radius`` counts as inside.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    label_col = 'Crowd_Radius_Label'
    key_cols = ['ID', 'Trajectory']

    # Coerce text (object) columns to numbers; invalid entries become NaN.
    object_columns = df.select_dtypes(include=['object']).columns
    if len(object_columns) > 0:
        df[object_columns] = df[object_columns].apply(pd.to_numeric, errors='coerce')

    # Create the label column with object dtype. Starting from a float NaN
    # column and writing strings into it is an incompatible-dtype upcast,
    # which recent pandas versions warn about.
    df[label_col] = np.full(len(df), np.nan, dtype=object)

    # The column set is fixed from here on, so build the mask list once.
    cols_to_mask = [col for col in df.columns if col != 'Time']

    # Split the frame into trajectories once instead of re-scanning the whole
    # frame for every (ID, Trajectory) pair. groupby skips rows whose ID or
    # Trajectory is NaN, so those rows are never labelled or masked here.
    trajectories = [
        (section.index, section['Distance'])
        for _, section in df[key_cols + ['Distance']].groupby(key_cols, sort=False)
    ]

    for rows, distance in trajectories:
        # NaN distances compare False on both sides and stay unlabelled.
        inside = (distance <= crowd_radius).to_numpy()
        outside = (distance > crowd_radius).to_numpy()

        labels = np.full(len(rows), np.nan, dtype=object)
        labels[inside] = 'Inside'
        labels[outside] = 'Outside'

        # Inside samples up to the last outside sample are 'Inside-Out'.
        if outside.any():
            positions = np.arange(len(rows))
            last_outside = positions[outside][-1]
            labels[inside & (positions <= last_outside)] = 'Inside-Out'

        df.loc[rows, label_col] = labels

        if not outside[0]:
            # The trajectory does not start outside the radius: drop all of it.
            df.loc[rows, cols_to_mask] = np.nan
        elif not outside.all():
            # Starts outside: keep only the leading run of 'Outside' samples.
            first_non_outside = int(np.argmax(~outside))
            df.loc[rows[first_non_outside:], cols_to_mask] = np.nan

    return df  # Same object that was passed in, now labelled and masked


In [10]:
# label_crowd_radius assigns into the frame it is given and returns that same
# object, so after this call df_all and combined_df are the same (labelled and
# masked) DataFrame.
df_all = label_crowd_radius(combined_df)
df_all

Unnamed: 0,Time,ID,Positionx,Positionz,Positiony,Yaw,Up,Right,Down,Left,Trajectory,Distance,Speed,Direction,Cluster,Speed Change,Direction Change,exp_num,Crowd_Radius_Label
0,0.0,,,,,,,,,,,,,,,,,,
1,0.5,,,,,,,,,,,,,,,,,,
2,1.0,,,,,,,,,,,,,,,,,,
3,1.5,,,,,,,,,,,,,,,,,,
4,2.0,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74438,669.0,,,,,,,,,,,,,,,,,,
74439,669.5,,,,,,,,,,,,,,,,,,
74440,670.0,,,,,,,,,,,,,,,,,,
74441,670.5,,,,,,,,,,,,,,,,,,


In [11]:
# Count fully populated rows (no NaN in any column) per (ID, Trajectory) group.
# Summing a row-wise completeness flag per group yields the same counts as
# groupby(...).apply(lambda g: g.dropna().shape[0]), but it runs vectorised
# and avoids handing the grouping columns to apply, which recent pandas
# versions deprecate. Rows whose ID or Trajectory is NaN belong to no group.
row_is_complete = df_all.notna().all(axis=1)
retained_rows = row_is_complete.groupby([df_all['ID'], df_all['Trajectory']]).sum()

# Display retained row counts
retained_rows

ID     Trajectory
40.0   2.0           12
       3.0           21
       4.0           18
       5.0           11
       6.0           14
                     ..
283.0  17.0          14
287.0  1.0           15
       2.0           19
       3.0            4
       4.0            6
Length: 995, dtype: int64

In [12]:
# Find the (ID, Trajectory) pairs listed in unique_clusters for which no
# complete row was retained: either the pair is absent from retained_rows
# (count is NaN after the left merge) or its count is zero.
retained_df = (
    retained_rows
    .reset_index()
    .set_axis(['ID', 'Trajectory', 'Non_NaN_Count'], axis=1)
)

merged = unique_clusters.merge(retained_df, on=['ID', 'Trajectory'], how='left')

has_no_retained_rows = merged['Non_NaN_Count'].fillna(0).eq(0)
missing_trajectories = merged[has_no_retained_rows]

print(f"Number of (ID, Trajectory) pairs present in unique_cluster but with no retained rows in retained_rows: {len(missing_trajectories)}")
print(missing_trajectories[['ID', 'Trajectory']])


Number of (ID, Trajectory) pairs present in unique_cluster but with no retained rows in retained_rows: 39
         ID  Trajectory
0      40.0         1.0
31     41.0         1.0
57     43.0         1.0
83     45.0         1.0
102    50.0         1.0
134    52.0         1.0
156    55.0         1.0
209    58.0         1.0
223    62.0         1.0
295    71.0         1.0
320    74.0         1.0
339    75.0         1.0
361    78.0         1.0
371    79.0         1.0
405   135.0         1.0
445   139.0         1.0
470   140.0         1.0
511   146.0         1.0
542   148.0         1.0
584   156.0         1.0
608   158.0         1.0
630   163.0         1.0
681   168.0         1.0
698   170.0         1.0
723   173.0         1.0
867   230.0         1.0
877   241.0         1.0
882   242.0         1.0
889   244.0         1.0
895   245.0         1.0
941   255.0         1.0
948   256.0         1.0
957   273.0         1.0
965   274.0         1.0
976   276.0         1.0
991   280.0         1.0
1001  

In [13]:
# Number of distinct (ID, Trajectory) pairs per Cluster, counted on the rows
# that still have all three values after masking.
key_and_cluster = ['ID', 'Trajectory', 'Cluster']
df_valid = df_all.dropna(subset=key_and_cluster)

unique_traj = df_valid.loc[:, key_and_cluster].drop_duplicates()

cluster_counts = (
    unique_traj
    .groupby('Cluster')
    .size()
    .reset_index(name='Num_Trajectories')
)

print("Unique (ID, Trajectory) count per Cluster:")
print(cluster_counts)


Unique (ID, Trajectory) count per Cluster:
   Cluster  Num_Trajectories
0      1.0               682
1      2.0               313


In [14]:
# Write the labelled and masked dataset to disk, without the index column.
# The target directory defaults to the original, machine-specific location;
# set the PEDESTRIAN_OUTPUT_DIR environment variable to write elsewhere.
output_dir = os.environ.get(
    'PEDESTRIAN_OUTPUT_DIR',
    '/Users/anzhunie/Desktop/Pedestrian_Training/Prediction',
)
df_all.to_csv(os.path.join(output_dir, 'dataset_with_cluster_masked.csv'), index=False)

In [15]:
# List the column names of the final DataFrame.
df_all.columns

Index(['Time', 'ID', 'Positionx', 'Positionz', 'Positiony', 'Yaw', 'Up',
       'Right', 'Down', 'Left', 'Trajectory', 'Distance', 'Speed', 'Direction',
       'Cluster', 'Speed Change', 'Direction Change', 'exp_num',
       'Crowd_Radius_Label'],
      dtype='object')