Now that the data is pre-processed, we will be deriving new features from the existing dataset.

#### 1. Location Duration
The amount of time (in seconds) the GPS device reported the same location. In other words, the time spent stationed at a given location. 

In [2]:
import pandas as pd

def add_location_durations_all(df):
    """
    For the entire dataframe, compute halt durations per (trip_uid, latitude, longitude)
    and add as a new column 'location_duration'.
    """
    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"])

    # Compute duration per trip + location
    location_durations = (
        df.groupby(["trip_uid", "latitude", "longitude"])["timestamp"]
          .agg(lambda x: (x.max() - x.min()).total_seconds())
          .rename("location_duration")
          .reset_index()
    )

    # Merge back into original df
    df = df.merge(location_durations, on=["trip_uid", "latitude", "longitude"], how="left")

    # Replace NaN with 0 (moving points that don’t have halts)
    df["location_duration"] = df["location_duration"].fillna(0)

    return df

In [None]:
df = pd.read_csv('data/processed/tracker_data_preprocessed.csv')
df_with_location_durations = add_location_durations_all(df)
df_with_location_durations.to_csv('data/processed/tracker_data_with_location_durations.csv', index=False)

Now that we have access to this, we can collapse the long-form dataset into a single row for each trip instead of every GPS location as a separate row. This should bring the dataset down to a total of 4658 rows (trips)

In the below function each trip is collapsed into the following features:

1. Trip ID
2. Distance
3. Duration
4. Average Speed
5. Start Time
6. End Time
7. Start Location
8. End Location
9. Halt count 
10. Avg halt duration
11. Max halt duration
12. Max halt location

In [35]:
import pandas as pd
import numpy as np

def derive_trip_features(df):

    rows = []   # collect dicts instead of concat

    for trip_id, trip_data in df.groupby('trip_uid'):

        trip_data = trip_data.sort_values('timestamp')

        distance = float(trip_data['distance'].sum())

        duration = (trip_data['timestamp'].iloc[-1] -
                    trip_data['timestamp'].iloc[0]).total_seconds() / 60

        avg_speed = distance / (duration * 60) if duration > 0 else 0.0

        start_time = trip_data['timestamp'].iloc[0]
        end_time   = trip_data['timestamp'].iloc[-1]

        start_location = (
            float(trip_data['latitude'].iloc[0]),
            float(trip_data['longitude'].iloc[0])
        )

        end_location = (
            float(trip_data['latitude'].iloc[-1]),
            float(trip_data['longitude'].iloc[-1])
        )

        halt_data = trip_data[trip_data['location_duration'] > 0]

        halt_count = int(halt_data['location_duration'].nunique())

        avg_halt_duration = float(halt_data['location_duration'].mean()) if not halt_data.empty else 0.0
        max_halt_duration = float(halt_data['location_duration'].max()) if not halt_data.empty else 0.0

        if not halt_data.empty:
            idx = halt_data['location_duration'].idxmax()
            max_halt_location = (
                float(trip_data.loc[idx, 'latitude']),
                float(trip_data.loc[idx, 'longitude'])
            )
        else:
            max_halt_location = (None, None)

        rows.append({
            'trip_id': trip_id,
            'distance': distance,
            'duration': duration,
            'avg_speed': avg_speed,
            'start_time': start_time,
            'end_time': end_time,
            'start_location': start_location,
            'end_location': end_location,
            'halt_count': halt_count,
            'avg_halt_duration': avg_halt_duration,
            'max_halt_duration': max_halt_duration,
            'max_halt_location': max_halt_location
        })

    return pd.DataFrame(rows)


features_df = derive_trip_features(df_with_location_durations)

In [30]:
features_df = features_df[~features_df['max_halt_location'].apply(
    lambda x: isinstance(x, (list, tuple)) and x[0] is None and x[1] is None
)]
features_df

def clean_tuple(x):
    if isinstance(x, (list, tuple)) and len(x) == 2:
        return (float(x[0]), float(x[1]))
    return x

features_df['start_location'] = features_df['start_location'].apply(clean_tuple)
features_df['end_location']   = features_df['end_location'].apply(clean_tuple)
features_df['max_halt_location'] = features_df['max_halt_location'].apply(clean_tuple)

In [36]:
features_df

Unnamed: 0,trip_id,distance,duration,avg_speed,start_time,end_time,start_location,end_location,halt_count,avg_halt_duration,max_halt_duration,max_halt_location
0,C001_06-Veloci2_1.0,5080.199201,19.950000,4.244110,2024-01-31 17:49:44,2024-01-31 18:09:41,"(52.66710166666667, -8.576183333333333)","(52.67005833333333, -8.633385)",6,33.791667,96.0,"(52.66534, -8.607245)"
1,C001_06-Veloci2_110.0,6298.928335,19.500000,5.383699,2024-03-04 08:42:54,2024-03-04 09:02:24,"(52.67011, -8.633483333333333)","(52.667611666666666, -8.574975)",3,7.636364,20.0,"(52.66794166666666, -8.576695)"
2,C001_06-Veloci2_111.0,1765.016288,9.516667,3.091097,2024-03-04 10:18:56,2024-03-04 10:28:27,"(52.666605, -8.57684)","(52.672138333333336, -8.556015)",3,86.214286,309.0,"(52.672138333333336, -8.556015)"
3,C001_06-Veloci2_112.0,3399.361455,7.916667,7.156550,2024-03-04 12:21:25,2024-03-04 12:29:20,"(52.67209666666667, -8.555928333333334)","(52.671821666666666, -8.55602)",3,188.461538,451.0,"(52.671821666666666, -8.55602)"
4,C001_06-Veloci2_113.0,8880.359707,33.950000,4.359529,2024-03-04 18:16:24,2024-03-04 18:50:21,"(52.671821666666666, -8.55602)","(52.66999833333333, -8.63372)",8,41.325000,223.0,"(52.670008333333335, -8.633391666666666)"
...,...,...,...,...,...,...,...,...,...,...,...,...
4653,P666_ZZ-JG_86.0,4132.682274,21.433333,3.213594,2023-09-20 15:43:41,2023-09-20 16:05:07,"(52.67801833333333, -8.569161666666666)","(52.666845, -8.543201666666667)",4,160.655172,316.0,"(52.666845, -8.543201666666667)"
4654,P666_ZZ-JG_87.0,3149.144739,21.216667,2.473798,2023-09-20 16:22:05,2023-09-20 16:43:18,"(52.666205, -8.54393)","(52.66678666666667, -8.543111666666666)",5,119.760000,281.0,"(52.65831333333333, -8.54979)"
4655,P666_ZZ-JG_88.0,3692.867836,24.033333,2.560935,2023-09-20 17:28:30,2023-09-20 17:52:32,"(52.66665833333333, -8.543538333333334)","(52.66679666666667, -8.543175)",5,211.750000,581.0,"(52.66080333333333, -8.54976)"
4656,P666_ZZ-JG_89.0,3149.814093,16.933333,3.100211,2023-09-20 18:53:16,2023-09-20 19:10:12,"(52.66588166666666, -8.545276666666666)","(52.666855, -8.543191666666667)",6,72.350000,365.0,"(52.65833666666666, -8.549711666666667)"


In [37]:
features_df.describe().round(3)

Unnamed: 0,distance,duration,avg_speed,start_time,end_time,halt_count,avg_halt_duration,max_halt_duration
count,4658.0,4658.0,4658.0,4658,4658,4658.0,4658.0,4658.0
mean,4282.054,35.295,3.95,2023-12-31 03:46:14.680764160,2023-12-31 04:21:32.393087232,4.129,84.435,184.349
min,0.0,0.0,0.0,2023-03-22 16:24:08,2023-03-22 16:35:38,0.0,0.0,0.0
25%,1992.656,9.5,2.663,2023-08-04 06:17:51.750000128,2023-08-04 06:39:20.249999872,2.0,23.333,40.0
50%,3433.056,16.325,3.97,2023-10-25 06:43:16.500000,2023-10-25 07:01:58.500000,4.0,54.667,120.0
75%,5813.027,25.267,5.025,2024-05-14 10:23:40.249999872,2024-05-14 10:30:11.500000,5.0,111.526,281.0
max,41674.595,9759.267,210.461,2025-05-09 12:07:42,2025-05-09 12:54:28,34.0,1259.719,4282.0
std,3453.467,333.917,3.747,,,2.596,96.424,212.59


In [38]:
#Saving the dervied features dataframe to a csv file
features_df.to_csv('data/processed/derived_route_features_v2.csv')