#### Import Libraries and Load Data

In [1]:
import pandas as pd
import json
import numpy as np
import csv
from datetime import datetime
import os
import glob

In [7]:
# Directory and file name pattern
directory = 'Cleaning/trial_week/clean_data'
file_base = 'clean_2024-01-'

# Generate the list of file names
files_names = [f'{file_base}{i}.json' for i in range(28, 29)]  # day 28 only

# Generate the list of file directories
files = [os.path.join(directory, file_name) for file_name in files_names]

data = []

# Load data from each JSON file
for file_name in files:
    with open(file_name, 'r') as file:
        for line in file:
            data.append(json.loads(line))

# Create a DataFrame
df = pd.DataFrame(data)

# Drop the 'time_diff' attribute if it exists
if 'time_diff' in df.columns:
    df.drop('time_diff', axis=1, inplace=True)


# Calculate trip time in seconds
df['trip_time'] = df['time']

# Now, drop the 'time' column
df.drop(columns=['time'], inplace=True)

# Drop rows with missing values (if any)
df.dropna(inplace=True)



# Load pooling data
pooling_file_path = 'Cleaning/anon_pooling_jan_24_amman.csv' 
df_pooling = pd.read_csv(pooling_file_path)

#### Extract Stationary Intervals

In [3]:
def detect_stationary_intervals(df, output_path='stationary_intervals_all_trips.csv'):
    results = []
    for index, row in df.iterrows():
        trip_id = row['trip_id']
        lats = np.array(row['lats'])
        lngs = np.array(row['lngs'])
        time_gap = np.array(row['time_gap'])
        
        # Compute differences between consecutive points directly from lat/lng arrays
        lat_diff = np.abs(np.diff(lats))
        lng_diff = np.abs(np.diff(lngs))

        # Identify stationary intervals (where there is no change in both lat and lng)
        stationary_indices = np.where((lat_diff == 0) & (lng_diff == 0))[0]
        
        # Group consecutive stationary indices into intervals
        if len(stationary_indices) > 0:
            start_idx = stationary_indices[0]
            for i in range(1, len(stationary_indices)):
                # If the current index is not consecutive, close the interval
                if stationary_indices[i] != stationary_indices[i - 1] + 1:
                    end_idx = stationary_indices[i - 1]
                    # Save the interval only if it has more than one index
                    if end_idx > start_idx:
                        interval = {
                            "trip_id": trip_id,
                            "start_idx": int(start_idx),
                            "end_idx": int(end_idx),
                            "start_lat": float(lats[start_idx]),
                            "start_lng": float(lngs[start_idx]),
                            "end_lat": float(lats[end_idx]),
                            "end_lng": float(lngs[end_idx]),
                            "time_elapsed": float(time_gap[end_idx] - time_gap[start_idx])
                        }
                        results.append(interval)
                    # Start a new interval
                    start_idx = stationary_indices[i]
            # Add the last interval if it has more than one index
            end_idx = stationary_indices[-1]
            if end_idx > start_idx:
                interval = {
                    "trip_id": trip_id,
                    "start_idx": int(start_idx),
                    "end_idx": int(end_idx),
                    "start_lat": float(lats[start_idx]),
                    "start_lng": float(lngs[start_idx]),
                    "end_lat": float(lats[end_idx]),
                    "end_lng": float(lngs[end_idx]),
                    "time_elapsed": float(time_gap[end_idx] - time_gap[start_idx])
                }
                results.append(interval)

    # Convert results to a DataFrame and save to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path, index=False)

    print(f"Stationary intervals saved to {output_path}")
    return results_df

stationary_df = detect_stationary_intervals(df)

Stationary intervals saved to stationary_intervals_all_trips.csv


In [4]:
def extract_longest_stationary_interval(stationary_df):
    # Find the longest stationary interval for each trip_id based on time_elapsed
    longest_intervals = stationary_df.loc[stationary_df.groupby('trip_id')['time_elapsed'].idxmax()]

    longest_intervals.reset_index(drop=True, inplace=True)
    
    return longest_intervals

longest_intervals_df = extract_longest_stationary_interval(stationary_df)

####  Identify Non-matching Coordinates

In [8]:
def create_combined_csvs(longest_intervals_df, pooling_df, output_path1='combined_trip_data.csv'):
    
    # Rename 'booking_id' to 'trip_id' in pooling_df for consistency
    pooling_df = pooling_df.rename(columns={'booking_id': 'trip_id'})
    
    # Create csv file to combine pooling_df and longest_intervals_df
    combined_df = (
        longest_intervals_df[['trip_id', 'start_lat', 'start_lng']]
        .merge(pooling_df[['trip_id', 'pickup_latitude', 'pickup_longitude']], on='trip_id', how='inner')
    )

    # Rename columns for clarity in the combined data
    combined_df = combined_df.rename(columns={
        'start_lat': 'stationary_df_lat',
        'start_lng': 'stationary_df_lng',
        'pickup_latitude': 'pooling_df_lat',
        'pickup_longitude': 'pooling_df_lng'
    })

    # Save the combined data to CSV
    combined_df.to_csv(output_path1, index=False)
    print(f"Combined data saved to {output_path1}")
    
    return combined_df

combined_df = create_combined_csvs(longest_intervals_df, df_pooling)

Combined data saved to combined_trip_data.csv


In [10]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0

    # Convert latitude and longitude from degrees to radians
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)

    # Compute differences
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # Apply Haversine formula
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2.0)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c

    return distance

def extract_non_matching_coordinates(combined_df):
    # Identify rows where the stationary coordinates do not match pooling coordinates
    non_matching_df = combined_df[
        (combined_df['stationary_df_lat'] != combined_df['pooling_df_lat']) |
        (combined_df['stationary_df_lng'] != combined_df['pooling_df_lng'])
    ]

    # calculate the distance between the coordinates
    non_matching_df['distance_km'] = haversine(
        non_matching_df['stationary_df_lat'],
        non_matching_df['stationary_df_lng'],
        non_matching_df['pooling_df_lat'],
        non_matching_df['pooling_df_lng']
    )

    return non_matching_df


# Save non-matching entries to CSV
non_matching_entries = extract_non_matching_coordinates(combined_df)
non_matching_entries.to_csv('non_matching_coordinates.csv', index=False)
print("Non-matching coordinate entries saved to non_matching_coordinates.csv")

Non-matching coordinate entries saved to non_matching_coordinates.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_matching_df['distance_km'] = haversine(


#### Splitting Trips into 3 Segments [Driver-2-Merchant (1), Wait-Time-At-Merchant (2), Merchant-2-Customer (3)]

In [11]:
def segment_trips(df, longest_intervals_df):
    road_list = []
    second_segment_list = []

    # Iterate over each entry in the df dataframe
    for index, row in df.iterrows():
        trip_id = row['trip_id']
        interval = longest_intervals_df[longest_intervals_df['trip_id'] == trip_id]
        lats = row['lats']
        lngs = row['lngs']
        time_gap = row['time_gap']
        dist_gap = row['dist_gap']

        if not interval.empty:
            start_idx = interval['start_idx'].values[0]
            end_idx = interval['end_idx'].values[0]

            # Segment the trip into three parts
            first_segment = {
                'trip_id': trip_id,
                'time_gap': time_gap[:start_idx],
                'dist': dist_gap[start_idx - 1],
                'trip_time': time_gap[start_idx - 1],
                'driverID': row['driverID'],
                'weekID': row['weekID'],
                'timeID': row['timeID'],
                'dateID': row['dateID'],
                'dist_gap': dist_gap[:start_idx],
                'lats': lats[:start_idx],
                'lngs': lngs[:start_idx],
                'segmentID': 1
            }
            
            stationary_time_gap = time_gap[start_idx:end_idx+1]
            # subtract each element in the new_time_gap list by the first element in the list
            stationary_time_gap = [time - stationary_time_gap[0] for time in stationary_time_gap]

            stationary_dist_gap = dist_gap[start_idx:end_idx+1]
            # subtract each element in the new_dist_gap list by the first element in the list
            stationary_dist_gap = [dist - stationary_dist_gap[0] for dist in stationary_dist_gap]
        
            second_segment = {
                'trip_id': trip_id,
                'time_gap': stationary_time_gap,
                'dist': stationary_dist_gap[-1],
                'trip_time': stationary_time_gap[-1],
                'driverID': row['driverID'],
                'weekID': row['weekID'],
                'timeID': row['timeID'],
                'dateID': row['dateID'],
                'dist_gap': stationary_dist_gap,
                'lats': lats[start_idx:end_idx+1],
                'lngs': lngs[start_idx:end_idx+1],
            }
            
            new_time_gap = time_gap[end_idx+1:]
            # subtract each element in the new_time_gap list by the first element in the list
            new_time_gap = [time - new_time_gap[0] for time in new_time_gap]

            new_dist_gap = dist_gap[end_idx+1:]
            # subtract each element in the new_dist_gap list by the first element in the list
            new_dist_gap = [dist - new_dist_gap[0] for dist in new_dist_gap]
            

            third_segment = {
                'trip_id': trip_id,
                'time_gap': new_time_gap,
                'dist': new_dist_gap[-1],
                'trip_time': new_time_gap[-1],
                'driverID': row['driverID'],
                'weekID': row['weekID'],
                'timeID': row['timeID'],
                'dateID': row['dateID'],
                'dist_gap': new_dist_gap,
                'lats': lats[end_idx+1:],
                'lngs': lngs[end_idx+1:],
                'segmentID': 3
            }
        else:
            print(f"No interval found for trip_id: {trip_id}")

        # add the first and third segments to the road_list
        # check if the time_gap of first_segment is empty or not
        if len(first_segment['time_gap']) > 1:
            road_list.append(first_segment)

        if len(third_segment['time_gap']) > 1:
            road_list.append(third_segment)

        # add the second segment to the second_segment_list
        if len(second_segment['time_gap']) > 1:
            second_segment_list.append(second_segment)

    # Convert the lists to DataFrames
    road_df = pd.DataFrame(road_list)
    second_segment_df = pd.DataFrame(second_segment_list)

    return road_df, second_segment_df

road_df, second_segment_df = segment_trips(df, longest_intervals_df)



road_json = road_df.to_dict(orient='records')
second_seg_json = second_segment_df.to_dict(orient='records')

with open('Segmented_Trips_Filtered_01_28.json', 'w') as file:
    for json_obj in road_json:
        json.dump(json_obj, file)
        file.write('\n')

with open('Merchants_Segments_Filtered_01_28.json', 'w') as file:
    for json_obj in second_seg_json:
        json.dump(json_obj, file)
        file.write('\n')
        
print("Segments 1 & 3 saved to 'Segmented_Trips_Filtered_01_28.json'")
print("Segment 2 with wait times saved to 'Merchants_Segments_Filtered_01_28.json'")


# print(road_df.columns)
# print(second_segment_df.columns)

Segments 1 & 3 saved to 'Segmented_Trips_Filtered_01_28.json'
Segment 2 with wait times saved to 'Merchants_Segments_Filtered_01_28.json'
