## Pings to trajectories JSON

In [None]:
import os
# Create the 'Data' folder; exist_ok=True keeps the cell re-runnable
# (os.mkdir raises FileExistsError once the folder is there).
os.makedirs('Data', exist_ok=True)

In [10]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime
import re
import json

# Folder that is scanned for ping CSVs, and folder the trajectory JSON goes to.
input_dir = 'careems_data/'
output_dir = 'json_traj/'

# Keep only the CSV files in input_dir whose name begins with the prefix below.
# NOTE(review): the prefix starts with 'ooling', not 'pooling' -- confirm that
# this matches the real file names.
input_files = [
    name
    for name in os.listdir(input_dir)
    if name.endswith('.csv')
    and name.startswith('ooling_pings_jan_24_amman_2024-01-28')
]

#extract the date from filename
def extract_date_from_filename(filename):
    """Return the first ``YYYY-MM-DD`` substring found in *filename*.

    Raises:
        ValueError: if *filename* contains no such substring.
    """
    found = re.search(r"\d{4}-\d{2}-\d{2}", filename)

    # Guard clause: bail out early when the pattern is absent.
    if found is None:
        raise ValueError("Date not found in filename. Expected format: trajectories-YYYY-MM-DD.csv")

    return found.group(0)

#get day of the week from date string
def day_of_week(date_str):
    """Break a ``YYYY-MM-DD`` date string into weekday and day-of-month parts.

    Returns:
        A tuple ``(day_index, day_name, day_of_month)``: ``day_index`` is 0 for
        Monday through 6 for Sunday, ``day_name`` is the English weekday name,
        and ``day_of_month`` is zero-based (the 1st of the month is 0).
    """
    weekday_names = (
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    )

    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    weekday_idx = parsed.weekday()  # Monday is 0, Sunday is 6

    # parsed.day runs 1..31; shift it so the result runs 0..30
    return weekday_idx, weekday_names[weekday_idx], parsed.day - 1

#get time ID (minute of the day from 0 to 1439)
def time_id_from_timestamp(timestamp_str):
    """Return the minute of the day (0 to 1439) encoded in a timestamp string.

    Args:
        timestamp_str: Timestamp formatted as ``YYYY-MM-DD HH:MM:SS``. ISO-8601
            variants of that layout (fractional seconds, a trailing UTC offset)
            are accepted as well.

    Returns:
        ``hour * 60 + minute`` of the parsed timestamp; seconds are ignored.

    Raises:
        ValueError: if the string cannot be parsed as either form.
    """
    try:
        parsed = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # Fall back for strings that carry sub-second precision or an offset,
        # which the fixed format above rejects.
        parsed = datetime.fromisoformat(timestamp_str)

    return parsed.hour * 60 + parsed.minute

def process_file(input_file, output_file):
    """Turn one CSV of GPS pings into a JSON-lines file, one trajectory per booking.

    The CSV is read with pandas and must provide the columns
    ``hash_booking_id``, ``hash_driver_id``, ``location_read_at``,
    ``latitude`` and ``longitude``. Pings whose booking id equals the
    hard-coded "null" hash are dropped. For every remaining booking the pings
    are ordered by ``location_read_at`` and trimmed so the trajectory starts at
    the earliest ping of the driver who sent the booking's last ping.

    Each output line is a compact JSON object with these keys:

    * ``trip_id`` / ``driverID`` - booking id and the driver id of the last ping
    * ``lats`` / ``lngs`` - coordinates of the kept pings
    * ``dist_gap`` - cumulative geodesic distance in km at each kept ping
    * ``dist`` - total geodesic distance in km
    * ``time_gap`` - seconds elapsed since the first kept ping, per ping
    * ``time`` - seconds between the first and the last kept ping
    * ``timeID`` - minute of day of the first kept ping
    * ``weekID`` / ``dateID`` - weekday index and zero-based day of month of
      the date found in ``input_file``

    Args:
        input_file: Path of the ping CSV. The path must contain a
            ``YYYY-MM-DD`` date.
        output_file: Path of the JSON-lines file to write (overwritten).

    Raises:
        ValueError: if no date can be extracted from ``input_file``.
    """
    df = pd.read_csv(input_file)

    #convert to datetime
    df['location_read_at'] = pd.to_datetime(df['location_read_at'])

    # The date-derived IDs are the same for every trip of the file, so they
    # are computed once instead of once per trip.
    week_id, _day_name, date_id = day_of_week(extract_date_from_filename(input_file))

    null_booking_id = '9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0'
    #filter out the null booking id
    bookings = df[df['hash_booking_id'] != null_booking_id].groupby('hash_booking_id')

    # All lines are built before the output file is opened, so a failure while
    # processing never leaves a truncated file behind.
    json_lines = []

    for booking_id, group in bookings:
        #sort pings by timestamp
        group = group.sort_values(by='location_read_at')

        # The driver of the chronologically last ping is the trip's driver.
        driver_id = group['hash_driver_id'].iloc[-1]

        # Earliest ping of that driver: pings before it (i.e. before a driver
        # switch) are discarded.
        same_driver = group['hash_driver_id'] == driver_id
        start_time = group.loc[same_driver, 'location_read_at'].iloc[0]
        valid_group = group[group['location_read_at'] >= start_time]

        lngs = valid_group['longitude'].tolist()
        lats = valid_group['latitude'].tolist()
        timestamps = valid_group['location_read_at']

        # Cumulative geodesic distance (km) along the trajectory; the first
        # ping is at distance 0.
        dist_gaps = [0]
        cum_dist = 0
        points = list(zip(lats, lngs))
        for prev_point, point in zip(points, points[1:]):
            cum_dist += geodesic(prev_point, point).kilometers
            dist_gaps.append(cum_dist)

        # Seconds elapsed since the first kept ping, for every kept ping.
        time_gaps = [(t - start_time).total_seconds() for t in timestamps]

        # Seconds between the first and the last kept ping.
        total_time = (timestamps.iloc[-1] - timestamps.iloc[0]).total_seconds()

        trip = {
            'trip_id': booking_id,
            'time_gap': time_gaps,
            'dist': float(cum_dist),
            'lats': lats,
            'driverID': driver_id,
            'weekID': week_id,
            # Time ID is minute of day
            'timeID': time_id_from_timestamp(str(start_time)),
            'dateID': date_id,
            'time': float(total_time),
            'lngs': lngs,
            'dist_gap': dist_gaps
        }
        json_lines.append(json.dumps(trip, separators=(',', ':')) + '\n')

    # Write JSON data to file, one object per line. The handle gets its own
    # name so the ``output_file`` path is not shadowed.
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(json_lines)

# Process each input file
# open(..., 'w') does not create missing folders, so make sure the destination
# exists before the first file is written.
os.makedirs(output_dir, exist_ok=True)

for input_file in input_files:
    # The date is the last '_'-separated token of the name, minus the extension
    date_str = input_file.split('_')[-1].split('.')[0]
    output_file = os.path.join(output_dir, f'traj_fix_dist_{date_str}.json')
    process_file(os.path.join(input_dir, input_file), output_file)
    print(f"Data has been written to {output_file}")

Data has been written to json_traj/traj_fix_dist_2024-01-28.json


## Clean trips

In [6]:
# Keep only the trajectories that can be matched to a row of the pooling table:
# same booking id, same driver (captain) id, and a trajectory duration within
# 5 seconds of the pooling row's engagement time.
input_dir = 'json_traj/jan/'
output_dir = 'clean_data/'
clean_file_template = "clean_{date}.json"
# NOTE(review): the ping cell reads from 'careems_data/' (underscore) while
# this path uses 'careems data/' (space) -- confirm which folder is right.
pooling_data = pd.read_csv("careems data/anon_pooling_jan_24_amman.csv")

# open(..., 'w') does not create missing folders.
os.makedirs(output_dir, exist_ok=True)

for j in os.listdir(input_dir):
    # The date is the LAST '_'-separated token of the name, minus the
    # extension. Taking the last token works for 'traj_2024-01-22.json' as well
    # as 'traj_fix_dist_2024-01-28.json', and does not fail on names without '_'.
    date_str = j.split('_')[-1].split('.')[0]
    filtered_pooling_data = pooling_data[pooling_data['day'] == date_str]

    if filtered_pooling_data.empty:
        print(f"No pooling data for date: {date_str}")
        continue

    # Index the day's pooling rows by (booking id, captain id) so each
    # trajectory is matched with one dict lookup instead of a scan over every
    # row. Row order is kept inside each list, so the first acceptable row
    # still wins. Values are captain_engagement_time * 60 (presumably minutes
    # converted to seconds -- confirm against the data dictionary).
    engagement_times = {}
    for pool_trip_id, pool_driver_id, engagement in zip(
        filtered_pooling_data["booking_id"],
        filtered_pooling_data["captain_id"],
        filtered_pooling_data["captain_engagement_time"],
    ):
        engagement_times.setdefault((pool_trip_id, pool_driver_id), []).append(
            float(engagement * 60)
        )

    input_file_path = os.path.join(input_dir, j)
    clean_file_path = os.path.join(output_dir, clean_file_template.format(date=date_str))

    with open(input_file_path, "r", encoding='utf-8') as json_traj_file:
        # Read each non-blank line as a JSON object
        new_data = [json.loads(line) for line in json_traj_file if line.strip()]

    with open(clean_file_path, 'w', encoding='utf-8') as clean_file:
        for entry in new_data:
            entry_time = entry["time"]
            candidates = engagement_times.get((entry["trip_id"], entry["driverID"]), ())

            for pool_time in candidates:
                time_diff = pool_time - entry_time

                # Writing good trips to clean file (same trip and driver id and <= 5 sec time diff)
                if abs(time_diff) <= 5:
                    # Copy only on a match, and record the difference with it
                    entry_with_time_diff = entry.copy()
                    entry_with_time_diff["time_diff"] = time_diff
                    json.dump(entry_with_time_diff, clean_file)
                    clean_file.write("\n")
                    break

    print(f"Processed and cleaned data for date: {date_str}")

print(f"Matching process completed. Check the '{output_dir}' directory for results.")


Processed and cleaned data for date: 2024-01-22
Processed and cleaned data for date: 2024-01-11
Processed and cleaned data for date: 2024-01-23
Processed and cleaned data for date: 2024-01-01
Processed and cleaned data for date: 2024-01-21
Processed and cleaned data for date: 2024-01-06
Processed and cleaned data for date: 2024-01-12
Processed and cleaned data for date: 2024-01-05
Processed and cleaned data for date: 2024-01-17
Processed and cleaned data for date: 2024-01-24
Processed and cleaned data for date: 2024-01-09
Processed and cleaned data for date: 2024-01-15
Processed and cleaned data for date: 2024-01-13
Processed and cleaned data for date: 2024-01-04
Processed and cleaned data for date: 2024-01-14
Processed and cleaned data for date: 2024-01-16
Processed and cleaned data for date: 2024-01-03
Processed and cleaned data for date: 2024-01-18
Processed and cleaned data for date: 2024-01-08
Processed and cleaned data for date: 2024-01-19
Processed and cleaned data for date: 202

### Map driver ids across all files

In [2]:
# Imported here so the cell also runs on its own / top to bottom.
import glob
import json
import os

folder_path = 'clean_data/'  # Replace with the path to your folder


def build_driver_id_map(driver_ids):
    """Map every distinct driver ID to an integer, starting at 1.

    IDs are numbered in sorted order so the mapping is identical on every run;
    enumerating a set directly gives an order that can change between
    interpreter sessions.

    Args:
        driver_ids: Iterable of hashable driver IDs; duplicates are ignored.

    Returns:
        Dict from driver ID to its 1-based integer index.
    """
    # key=str keeps the sort well-defined even if IDs of different types mix.
    ordered_ids = sorted(set(driver_ids), key=str)
    return {driver_id: idx for idx, driver_id in enumerate(ordered_ids, start=1)}


# Get a list of all JSON files in the folder
input_files = glob.glob(os.path.join(folder_path, '*.json'))

# Step 1: Collect unique driver IDs from all files
driver_ids = set()  # Use a set to ensure uniqueness
for file_path in input_files:
    with open(file_path, 'r', encoding='utf-8') as input_file:
        driver_ids.update(
            json.loads(line)['driverID'] for line in input_file if line.strip()
        )

# Step 2: Create a mapping from unique driver IDs to integers
driver_id_map = build_driver_id_map(driver_ids)

# Step 3: Apply the mapping and add a new entry to the data
for file_path in input_files:
    mapped_data = []
    with open(file_path, 'r', encoding='utf-8') as input_file:
        for line in input_file:
            if not line.strip():
                continue  # tolerate blank lines
            entry = json.loads(line)
            # The key is spelled 'mapped_driveID'; the rename cell reads the
            # same spelling, so it must not be changed here alone.
            entry['mapped_driveID'] = driver_id_map[entry['driverID']]
            mapped_data.append(entry)

    # Step 4: Write the modified data back to the original file. The file is
    # only reopened for writing after it has been read completely.
    with open(file_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(
            json.dumps(entry, separators=(',', ':')) + '\n'  # single-line JSON
            for entry in mapped_data
        )


### Rename the `driverID` and mapped driver ID fields

In [5]:
import json
import os
import glob

# Specify the folder containing the JSON files
folder_path = 'clean_data/'  # Replace with the path to your folder


def rename_driver_keys(entry):
    """Swap the driver-ID fields of one trip record in place and return it.

    The value of ``driverID`` is moved to ``unmapped_driverID`` and the value
    of ``mapped_driveID`` is moved to ``driverID``. A record that has no
    ``mapped_driveID`` but already has ``unmapped_driverID`` is taken to be
    renamed already and is returned unchanged, so the cell can be re-run
    without failing.

    Args:
        entry: Dict holding one trip record.

    Returns:
        The same dict object.

    Raises:
        KeyError: if ``mapped_driveID`` (on a record without
            ``unmapped_driverID``) or ``driverID`` is missing.
    """
    if 'mapped_driveID' not in entry and 'unmapped_driverID' in entry:
        return entry

    # Pop the mapped ID first: if it is missing, the KeyError is raised before
    # the record has been touched.
    mapped_id = entry.pop('mapped_driveID')
    # Rename driverID to unmapped_driverID
    entry['unmapped_driverID'] = entry.pop('driverID')
    # Rename mapped_driveID to driverID
    entry['driverID'] = mapped_id
    return entry


# Get a list of all JSON files in the folder
input_files = glob.glob(os.path.join(folder_path, '*.json'))

# Process each file in the folder
for file_path in input_files:
    with open(file_path, 'r', encoding='utf-8') as input_file:
        updated_data = [
            rename_driver_keys(json.loads(line))
            for line in input_file
            if line.strip()  # tolerate blank lines
        ]

    # Write the modified data back to the original file. The file is only
    # reopened for writing after it has been read completely.
    with open(file_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(
            json.dumps(entry, separators=(',', ':')) + '\n'  # single-line JSON
            for entry in updated_data
        )

  