### Raw Pings to Trajectories (CSV format)

In [2]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime

In [4]:

# Raw ping dump to read, and the trajectory CSV produced at the end of this cell
input_file = 'careems data/pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'traj_fixed_time_2024-01-31.csv'

# Load the pings and turn the read-time column into real datetimes in one pass
df = pd.read_csv(input_file).assign(
    location_read_at=lambda pings: pd.to_datetime(pings['location_read_at'])
)

#calculate distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance, in kilometers, between two (lat, lon) points."""
    origin = (lat1, lon1)
    destination = (lat2, lon2)
    return geodesic(origin, destination).kilometers

#calculate time difference in seconds
def calculate_time_difference(time1, time2):
    """Return the seconds elapsed from ``time1`` to ``time2``.

    The result is negative when ``time2`` is earlier than ``time1``.
    """
    elapsed = time2 - time1
    return elapsed.total_seconds()

trip_data = []

# Placeholder hash that stands for the null booking id; its pings are excluded
null_booking_id = '9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0'
filtered_df = df[df['hash_booking_id'] != null_booking_id]
grouped = filtered_df.groupby('hash_booking_id')

for booking_id, group in grouped:
    # Put this booking's pings in chronological order
    group = group.sort_values(by='location_read_at')

    # The driver seen on the last ping is the one recorded for the trip
    driver_id = group['hash_driver_id'].iloc[-1]

    # The trajectory starts at that driver's earliest ping; anything before it is dropped
    first_instance = group[group['hash_driver_id'] == driver_id].iloc[0]
    time_id = first_instance['location_read_at']
    valid_group = group[group['location_read_at'] >= time_id]

    lngs = valid_group['longitude'].tolist()
    lats = valid_group['latitude'].tolist()

    # Cumulative distance (km) travelled up to each ping, starting from 0
    dist_gaps = [0]
    cum_dist = 0
    points = list(zip(lats, lngs))
    for (prev_lat, prev_lng), (lat, lng) in zip(points, points[1:]):
        cum_dist += calculate_distance(prev_lat, prev_lng, lat, lng)
        dist_gaps.append(cum_dist)
    total_dist = cum_dist

    # Seconds elapsed since the trajectory start, one entry per ping
    ping_times = valid_group['location_read_at']
    time_gaps = [calculate_time_difference(time_id, ping_time) for ping_time in ping_times]

    # Trip duration: first retained ping to last retained ping
    total_time = calculate_time_difference(ping_times.iloc[0], ping_times.iloc[-1])

    trip_data.append([
        booking_id, driver_id, time_id, lngs, lats,
        total_dist, total_time, time_gaps, dist_gaps,
    ])

# One row per booking, persisted for the CSV -> JSON step
output_columns = ['booking_id', 'driver_id', 'time_id', 'lngs', 'lats', 'dist', 'time', 'time_gap', 'dist_gap']
output_df = pd.DataFrame(trip_data, columns=output_columns)
output_df.to_csv(output_file, index=False)


### CSV to JSON Objects

In [5]:
import csv
import re
from datetime import datetime
import json

#extract the date from filename
def extract_date_from_filename(filename):
    """Return the ``YYYY-MM-DD`` date embedded in a file name.

    The final path component is searched first, so a date in a parent
    directory (e.g. ``runs/2024-01-01/traj_2024-01-31.csv``) does not shadow
    the date in the file name itself. If the file name carries no date, the
    whole path is searched, which preserves the earlier behaviour.

    Args:
        filename: File name or path; forward and back slashes are both
            treated as path separators.

    Returns:
        The first matching ``YYYY-MM-DD`` substring. Only the digit pattern
        is checked; the value is not validated as a calendar date here.

    Raises:
        ValueError: If no ``YYYY-MM-DD`` pattern occurs anywhere in ``filename``.
    """
    date_pattern = r"\d{4}-\d{2}-\d{2}"

    # Look at the file name proper before considering directory names
    basename = re.split(r"[\\/]", filename)[-1]
    match = re.search(date_pattern, basename) or re.search(date_pattern, filename)

    if match:
        return match.group(0)
    raise ValueError("Date not found in filename. Expected format: trajectories-YYYY-MM-DD.csv")

#get day of the week from date string
def day_of_week(date_str):
    """Locate a ``YYYY-MM-DD`` date within its week and month.

    Returns:
        A ``(day_index, day_name, day_of_month)`` tuple: ``day_index`` runs
        from 0 (Monday) to 6 (Sunday), ``day_name`` is the English weekday
        name, and ``day_of_month`` is zero-based (0 to 30).
    """
    weekday_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    weekday_index = parsed.weekday()

    # Shift the calendar day so the first of the month maps to 0
    return weekday_index, weekday_names[weekday_index], parsed.day - 1

#get time ID (minute of the day from 0 to 1439)
def time_id_from_timestamp(timestamp_str):
    """Return the minute of the day (0 to 1439) for a timestamp string.

    The plain ``YYYY-MM-DD HH:MM:SS`` form is accepted as before. Timestamps
    that also carry fractional seconds (``...:30.123456``) or a UTC offset
    (``...:30+03:00``) are accepted too, instead of raising. The hour and
    minute are taken exactly as written in the string; no time-zone
    conversion is applied.

    Args:
        timestamp_str: Timestamp text to parse.

    Returns:
        ``hour * 60 + minute`` of the parsed timestamp.

    Raises:
        ValueError: If the text cannot be parsed as a timestamp.
    """
    # Fractional seconds never change the minute, so drop them before parsing
    cleaned = re.sub(r"\.\d+", "", timestamp_str)

    try:
        parsed = datetime.strptime(cleaned, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # ISO-8601 fallback covers a trailing UTC offset or a "T" separator
        parsed = datetime.fromisoformat(cleaned)

    return parsed.hour * 60 + parsed.minute



def _parse_float_list(text):
    """Parse a stringified list such as ``"[0, 1.5, 2.25]"`` into a list of floats."""
    tokens = text.strip().strip('[]').split(',')
    # Skipping blank tokens makes "[]" parse to an empty list instead of raising
    return [float(token) for token in tokens if token.strip()]


def convert_csv_to_dicts(csv_file_path):
    """Convert a trajectory CSV into compact JSON strings, one per row.

    Each row must provide the columns ``booking_id``, ``driver_id``,
    ``time_id``, ``lngs``, ``lats``, ``dist``, ``time``, ``time_gap`` and
    ``dist_gap``. The four list columns are expected to hold stringified
    lists of numbers (e.g. ``"[0, 1.5]"``).

    Args:
        csv_file_path: Path of the CSV to read. The ``YYYY-MM-DD`` date in
            the path supplies ``weekID`` and ``dateID`` for every row.

    Returns:
        A list of JSON strings (no spaces after separators) with the keys
        ``trip_id``, ``time_gap``, ``dist``, ``lats``, ``driverID``,
        ``weekID`` (0 = Monday), ``timeID`` (minute of the day taken from the
        row's ``time_id``), ``dateID`` (zero-based day of month), ``time``,
        ``lngs`` and ``dist_gap``.

    Raises:
        ValueError: If the path contains no date (only checked once a data
            row is read), or a numeric/timestamp field cannot be parsed.
    """
    result = []
    day_fields = None  # (weekID, weekday name, dateID); identical for every row

    # newline='' is what the csv module expects; utf-8 matches how the file is written
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
        for row in csv.DictReader(csv_file):
            if day_fields is None:
                # Resolve the filename date once instead of re-parsing it per row
                day_fields = day_of_week(extract_date_from_filename(csv_file_path))
            week_id, _day_name, date_id = day_fields

            # Key order is kept stable because it determines the JSON field order
            new_dict = {
                'trip_id': row['booking_id'],
                'time_gap': _parse_float_list(row['time_gap']),
                'dist': float(row['dist']),
                'lats': _parse_float_list(row['lats']),
                'driverID': row['driver_id'],
                'weekID': week_id,
                'timeID': time_id_from_timestamp(row['time_id']),
                'dateID': date_id,
                'time': float(row['time']),
                'lngs': _parse_float_list(row['lngs']),
                'dist_gap': _parse_float_list(row['dist_gap']),
            }
            result.append(json.dumps(new_dict, separators=(',', ':')))
    return result


def write_dicts_to_text(data, output_file_path):
    """Write already-serialised JSON strings to a text file, one per line.

    Args:
        data: Strings to write; each gets a trailing newline.
        output_file_path: Destination path, opened in write mode as UTF-8
            (an existing file is overwritten).

    When ``data`` is empty, a notice is printed and the destination file is
    neither created nor modified.
    """
    if not data:
        print("No data to write.")
        return

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        # Leaving the ``with`` block flushes and closes the file, so a flush
        # after every line is unnecessary and only defeats buffering.
        output_file.writelines(entry + '\n' for entry in data)
            

# Trajectory CSV to convert and the line-delimited JSON file to produce
csv_file_path = 'traj_fixed_time_2024-01-31.csv'
output_file_path = 'fixed_traj_new.json'

result = convert_csv_to_dicts(csv_file_path)

write_dicts_to_text(result, output_file_path)

# write_dicts_to_text skips the write when there are no rows, so only report
# success when something was actually written.
if result:
    print(f"Data has been written to {output_file_path}.")
# print(result[0])

Data has been written to fixed_traj_new.json.


### Mapping DriverID to Integers (Enumeration)

In [7]:
import json

# Step 1: Read data from the existing line-delimited JSON file
json_file_path = 'fixed_traj_new.json'

json_file_path2 = 'fixed_traj_new_mapped.json'
# Each non-blank line holds one JSON object. The handle is deliberately not
# named `input_file`, so the path string of that name set earlier in the
# notebook is not rebound to a closed file object.
with open(json_file_path, 'r', encoding='utf-8') as json_in:
    data = [json.loads(line) for line in json_in if line.strip()]

# Step 2: Extract unique driver IDs
driver_ids = {entry['driverID'] for entry in data}

# Step 3: Create a mapping from unique driver IDs to integers (starting at 1).
# The IDs are sorted first: iteration order of a set of strings can change
# between interpreter runs because string hashing is randomised, which would
# otherwise give the same driver a different integer on every run.
driver_id_map = {driver_id: idx for idx, driver_id in enumerate(sorted(driver_ids), start=1)}

# Step 4: Apply the mapping, building new dicts so `data` itself is left untouched
mapped_data = [
    {**entry, 'driverID': driver_id_map[entry['driverID']]}
    for entry in data
]

# Step 5: Write the mapped entries to the second file, one compact JSON object per line
with open(json_file_path2, 'w', encoding='utf-8') as json_out:
    for entry in mapped_data:
        json_out.write(json.dumps(entry, separators=(',', ':')) + '\n')
