In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Absolute location of the clustered CSV that the next cell loads with pd.read_csv.
# NOTE(review): machine-specific path; the 'clsuter' spelling is kept because later cells use this name.
clsuter_path = '/Users/anzhunie/Desktop/Pedestrian_Training/Data/clustered.csv'

In [3]:
# Load the clustered table and keep only rows in which all three key fields are present.
required_fields = ['ID', 'Trajectory', 'Cluster']
df_clsuter = pd.read_csv(clsuter_path).dropna(subset=required_fields)

In [4]:
# Count the distinct Cluster values inside every (ID, Trajectory) pair;
# a count above 1 would mean one trajectory carries conflicting cluster labels.
grouped_cluster = (
    df_clsuter
    .groupby(['ID', 'Trajectory'])['Cluster']
    .nunique()
    .reset_index()
)
has_conflict = grouped_cluster['Cluster'].gt(1)
non_unique_cluster = grouped_cluster.loc[has_conflict]
print(non_unique_cluster)

Empty DataFrame
Columns: [ID, Trajectory, Cluster]
Index: []


In [5]:
# Collapse to one row per (ID, Trajectory), carrying the first Cluster value of that group.
cluster_by_pair = df_clsuter.groupby(['ID', 'Trajectory'])['Cluster']
unique_clusters = cluster_by_pair.first().reset_index()
print(unique_clusters)

         ID  Trajectory  Cluster
0      40.0         1.0        1
1      40.0         2.0        1
2      40.0         3.0        2
3      40.0         4.0        2
4      40.0         5.0        2
...     ...         ...      ...
1027  283.0        17.0        1
1028  287.0         1.0        1
1029  287.0         2.0        1
1030  287.0         3.0        1
1031  287.0         4.0        2

[1032 rows x 3 columns]


In [6]:
entire_path = '/Users/anzhunie/Desktop/Pedestrian_Training/Data/entrie dataset.csv'

# Read the full dataset, drop rows lacking either key, then attach the Cluster of each
# (ID, Trajectory) pair. how='left' keeps every remaining row; pairs that are absent from
# unique_clusters end up with NaN in 'Cluster'.
merge_keys = ['ID', 'Trajectory']
df_entire = (
    pd.read_csv(entire_path)
    .dropna(subset=merge_keys)
    .merge(unique_clusters, on=merge_keys, how='left')
)

In [7]:
# Rows that received no Cluster from the merge are blanked out in every column except 'Time'.
unclustered_rows = df_entire['Cluster'].isna()
non_time_columns = df_entire.columns.difference(['Time'])
df_entire.loc[unclustered_rows, non_time_columns] = np.nan

In [8]:
# Number of fully populated rows (no NaN in any column) per (ID, Trajectory) group.
# Computed as a vectorised sum of a boolean flag instead of groupby(...).apply(lambda ...),
# which runs Python code per group and operates on the grouping columns.
# Rows whose ID or Trajectory is NaN belong to no group and are therefore not counted.
is_complete_row = df_entire.notna().all(axis=1)
df_entire_rows = is_complete_row.groupby([df_entire['ID'], df_entire['Trajectory']]).sum()
df_entire_rows

ID     Trajectory
40.0   1.0           15
       2.0           23
       3.0           47
       4.0           76
       5.0           66
                     ..
283.0  17.0          36
287.0  1.0           32
       2.0           40
       3.0           14
       4.0           68
Length: 1032, dtype: int64

In [9]:
def label_crowd_radius(df, crowd_radius=2.58):
    """Label samples relative to a crowd radius and mask all but the leading 'Outside' run.

    The frame is modified in place and the same object is returned.

    Processing:

    1. Every object-dtype column is passed through ``pd.to_numeric(errors='coerce')``;
       values that cannot be parsed as numbers become NaN.
    2. A ``'Crowd_Radius_Label'`` column is added. Within each (ID, Trajectory) group
       (rows with NaN in either key belong to no group and stay unlabeled):
       ``Distance > crowd_radius`` gives ``'Outside'``, ``Distance <= crowd_radius``
       gives ``'Inside'``, and an ``'Inside'`` row whose index label is not greater than
       that of the group's last ``'Outside'`` row becomes ``'Inside-Out'``.
       Rows with NaN ``Distance`` keep a NaN label.
    3. Masking, applied to every column except ``'Time'`` (the label column included):
       if the first row of a group is ``'Outside'``, rows from the first non-``'Outside'``
       row onward are set to NaN (nothing is masked when the whole group is ``'Outside'``);
       otherwise the entire group is set to NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'ID', 'Trajectory' and 'Distance' columns.
        NOTE(review): the "before the last Outside" and "from the first non-Outside row
        onward" steps compare and slice by index label, so they presumably expect an index
        that increases in row order (e.g. a default RangeIndex) - confirm for other inputs.
    crowd_radius : float, default 2.58
        Distance threshold separating 'Inside' (<=) from 'Outside' (>).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, labeled and masked.
    """
    # Object columns hold text; coerce them to numbers so comparisons below are numeric.
    text_columns = df.select_dtypes(include=['object']).columns
    if len(text_columns) > 0:
        df[text_columns] = df[text_columns].apply(pd.to_numeric, errors='coerce')

    # Create the label column with object dtype up front: writing strings into a
    # float64 NaN column is an implicit upcast that recent pandas versions deprecate.
    df['Crowd_Radius_Label'] = pd.Series(np.nan, index=df.index, dtype=object)

    # Everything except 'Time' is blanked when a row is masked.
    cols_to_mask = [col for col in df.columns if col != 'Time']

    # Resolve the row labels of every (ID, Trajectory) group once, before any masking
    # overwrites the key columns. groupby skips rows whose keys contain NaN.
    group_row_labels = [
        section.index
        for _, section in df.groupby(['ID', 'Trajectory'], sort=False)
    ]

    for row_labels in group_row_labels:
        distance = df.loc[row_labels, 'Distance']
        is_outside = (distance > crowd_radius).to_numpy()
        is_inside = (distance <= crowd_radius).to_numpy()

        # Rows with NaN distance match neither mask and keep a NaN label.
        labels = pd.Series(np.nan, index=row_labels, dtype=object)
        labels[is_inside] = 'Inside'
        labels[is_outside] = 'Outside'

        if is_outside.any():
            # Inside samples located at or before the last Outside sample are 'Inside-Out'.
            last_outside_idx = row_labels[is_outside][-1]
            labels[is_inside & (row_labels <= last_outside_idx)] = 'Inside-Out'

        df.loc[row_labels, 'Crowd_Radius_Label'] = labels

        if labels.iloc[0] == 'Outside':
            # NaN labels compare unequal to 'Outside', so they count as non-Outside here.
            not_outside = row_labels[(labels != 'Outside').to_numpy()]
            if len(not_outside) == 0:
                # Entire group is 'Outside': keep all of it.
                continue
            rows_to_mask = labels.loc[not_outside.min():].index
        else:
            # Group does not start outside the radius: discard it completely.
            rows_to_mask = row_labels

        df.loc[rows_to_mask, cols_to_mask] = np.nan

    return df  # Same object that was passed in, now labeled and masked


In [10]:
# label_crowd_radius assigns into the frame it is given and returns that same frame,
# so after this call df_all and df_entire refer to one and the same (now masked) object.
df_all = label_crowd_radius(df_entire)
df_all

Unnamed: 0,Time,ID,Positionx,Positionz,Positiony,Yaw,Up,Right,Down,Left,Trajectory,Distance,Speed,Direction,Speed Change,Direction Change,Cluster,Crowd_Radius_Label
0,0.0,,,,,,,,,,,,,,,,,
1,0.5,,,,,,,,,,,,,,,,,
2,1.0,,,,,,,,,,,,,,,,,
3,1.5,,,,,,,,,,,,,,,,,
4,2.0,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56348,668.5,,,,,,,,,,,,,,,,,
56349,669.0,,,,,,,,,,,,,,,,,
56350,669.5,,,,,,,,,,,,,,,,,
56351,670.0,,,,,,,,,,,,,,,,,


In [11]:
# Count fully populated rows (no NaN in any column) per (ID, Trajectory) group.
# A vectorised sum of a boolean flag replaces groupby(...).apply(lambda ...), which runs
# Python code per group and operates on the grouping columns.
# Rows whose ID or Trajectory is NaN (i.e. masked rows) belong to no group and are not counted.
is_complete_row = df_all.notna().all(axis=1)
retained_rows = is_complete_row.groupby([df_all['ID'], df_all['Trajectory']]).sum()

# Display retained row counts
print(retained_rows)

ID     Trajectory
40.0   2.0           13
       3.0           22
       4.0           19
       5.0           12
       6.0           15
                     ..
283.0  17.0          15
287.0  1.0           16
       2.0           20
       3.0            5
       4.0            7
Length: 995, dtype: int64


In [14]:
# Write the table to CSV without the index column.
# df_entire is the frame that was handed to label_crowd_radius in In [10]; the export relies on
# that call having modified it in place.
# NOTE(review): the execution count jumps from In [11] to In [14] - confirm that no cell run in
# between is needed to reproduce this file from a fresh kernel.
df_entire.to_csv("/Users/anzhunie/Desktop/local_Pedestrian_Training/entire_dataset_with_cluster_masked.csv", index=False)