## Main Script: Converting File Shapes

### Pings to trajectories csv

In [2]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime

In [4]:

# Raw pings for one day, and the per-trip trajectories CSV this cell produces.
input_file = 'careems data/pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'traj_fixed_time_2024-01-31.csv'

df = pd.read_csv(input_file)

# Parse the ping timestamps so they can be sorted and subtracted below.
parsed_read_at = pd.to_datetime(df['location_read_at'])
df['location_read_at'] = parsed_read_at

#calculate distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance, in kilometers, between two (lat, lon) points."""
    start_point = (lat1, lon1)
    end_point = (lat2, lon2)
    separation = geodesic(start_point, end_point)
    return separation.kilometers

#calculate time difference in seconds
def calculate_time_difference(time1, time2):
    """Return the signed number of seconds elapsed from ``time1`` to ``time2``."""
    elapsed = time2 - time1
    return elapsed.total_seconds()

# One output row (a list) per booking.
trip_data = []

# Pings carrying this hash are the "null" booking id; drop them before grouping.
null_booking_id = '9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0'
has_booking = df['hash_booking_id'] != null_booking_id
filtered_df = df[has_booking]
grouped = filtered_df.groupby('hash_booking_id')

for booking_id, group in grouped:
    # Chronological order of the booking's pings.
    group = group.sort_values(by='location_read_at')

    # The driver on the last ping is taken as the trip's driver.
    driver_id = group['hash_driver_id'].iloc[-1]

    # That driver's earliest ping marks the start of the trajectory
    # (this is how a driver switch during the booking is handled).
    first_instance = group[group['hash_driver_id'] == driver_id].iloc[0]
    time_id = first_instance['location_read_at']

    # Keep only pings read at or after that start time.
    valid_group = group[group['location_read_at'] >= time_id]
    read_times = valid_group['location_read_at']

    lngs = valid_group['longitude'].tolist()
    lats = valid_group['latitude'].tolist()

    # Cumulative distance (km) reached at each ping; the first ping is at 0.
    dist_gaps = [0]
    cum_dist = 0
    for idx in range(1, len(lats)):
        cum_dist += calculate_distance(lats[idx - 1], lngs[idx - 1], lats[idx], lngs[idx])
        dist_gaps.append(cum_dist)
    total_dist = cum_dist

    # Seconds elapsed since the trajectory start, one value per ping.
    time_gaps = [calculate_time_difference(time_id, read_at) for read_at in read_times]

    # Trip duration: first kept ping to last kept ping.
    total_time = calculate_time_difference(read_times.iloc[0], read_times.iloc[-1])

    trip_data.append([booking_id, driver_id, time_id, lngs, lats, total_dist, total_time, time_gaps, dist_gaps])


# Assemble the per-trip table and write it out.
column_names = ['booking_id', 'driver_id', 'time_id', 'lngs', 'lats', 'dist', 'time', 'time_gap', 'dist_gap']
output_df = pd.DataFrame(trip_data, columns=column_names)
output_df.to_csv(output_file, index=False)


### csv to json

In [5]:
import csv
import re
from datetime import datetime
import json

#extract the date from filename
def extract_date_from_filename(filename):
    """Return the first ``YYYY-MM-DD`` substring found in ``filename``.

    Raises:
        ValueError: if the name contains no such date pattern.
    """
    found = re.search(r"\d{4}-\d{2}-\d{2}", filename)
    if found is None:
        raise ValueError("Date not found in filename. Expected format: trajectories-YYYY-MM-DD.csv")
    return found.group(0)

#get day of the week from date string
def day_of_week(date_str):
    """Describe a ``YYYY-MM-DD`` date.

    Returns:
        tuple: ``(weekday_index, weekday_name, day_of_month)`` where the
        weekday index runs Monday=0 .. Sunday=6 and the day of the month is
        zero-based (0 .. 30).
    """
    weekday_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    weekday_index = parsed.weekday()

    return weekday_index, weekday_names[weekday_index], parsed.day - 1

#get time ID (minute of the day from 0 to 1439)
def time_id_from_timestamp(timestamp_str):
    """Return the minute of the day (0 .. 1439) for a timestamp string.

    Accepts ``YYYY-MM-DD HH:MM:SS`` with or without a fractional-seconds
    suffix (``.f`` up to six digits), so the function works on both the
    rectified pings and on data that still carries sub-second precision.

    Raises:
        ValueError: if the string matches neither layout.
    """
    accepted_formats = ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f")

    for layout in accepted_formats:
        try:
            parsed = datetime.strptime(timestamp_str, layout)
        except ValueError:
            continue
        # Seconds (and any fraction) are deliberately dropped: the ID is per minute.
        return parsed.hour * 60 + parsed.minute

    raise ValueError(
        f"Unrecognised timestamp {timestamp_str!r}; expected 'YYYY-MM-DD HH:MM:SS[.ffffff]'"
    )



def _parse_float_list(cell):
    """Parse a stringified list cell such as ``"[0.0, 1.5]"`` into a list of floats."""
    inner = cell.strip('[]').strip()
    if not inner:
        # "[]" (or an empty cell) is an empty list, not a parse error.
        return []
    return [float(item) for item in inner.split(',')]


def convert_csv_to_dicts(csv_file_path):
    """Convert a trajectories CSV into a list of compact JSON strings.

    Each CSV row becomes one JSON object with the keys ``trip_id``,
    ``time_gap``, ``dist``, ``lats``, ``driverID``, ``weekID``, ``timeID``,
    ``dateID``, ``time``, ``lngs`` and ``dist_gap``. ``weekID`` and ``dateID``
    are derived from the ``YYYY-MM-DD`` date embedded in ``csv_file_path``;
    ``timeID`` is the minute of the day of the row's ``time_id`` column.

    Args:
        csv_file_path: path of the CSV to read; its name must contain a date.

    Returns:
        list[str]: one JSON string per row (NOT dicts) - decode with
        ``json.loads`` before indexing by key.

    Raises:
        ValueError: if a row exists and the file name has no date, or a cell
            cannot be parsed.
    """
    result = []

    # The date comes from the file name, so it is the same for every row:
    # resolve it once, on the first row, rather than once per row.
    week_id = date_id = None

    # newline='' is what the csv module expects; utf-8 matches how the file is written.
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
        for row in csv.DictReader(csv_file):
            if week_id is None:
                date_str = extract_date_from_filename(csv_file_path)
                week_id, _day_name, date_id = day_of_week(date_str)

            new_dict = {
                'trip_id': row['booking_id'],
                'time_gap': _parse_float_list(row['time_gap']),
                'dist': float(row['dist']),
                'lats': _parse_float_list(row['lats']),
                'driverID': row['driver_id'],
                'weekID': week_id,
                'timeID': time_id_from_timestamp(row['time_id']),
                'dateID': date_id,
                'time': float(row['time']),
                'lngs': _parse_float_list(row['lngs']),
                'dist_gap': _parse_float_list(row['dist_gap']),
            }
            # Compact separators keep each record on a single short line.
            result.append(json.dumps(new_dict, separators=(',', ':')))
    return result


def write_dicts_to_text(data, output_file_path):
    """Write each entry of ``data`` to ``output_file_path``, one per line.

    Args:
        data: sequence of strings (here: JSON-encoded records).
        output_file_path: destination file; overwritten if it exists.

    If ``data`` is empty, prints a notice and leaves the file system untouched.
    """
    if not data:
        print("No data to write.")
        return

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        # No per-line flush: leaving the with-block flushes and closes the file.
        output_file.writelines(entry + '\n' for entry in data)
            

# Input: the trajectories CSV (its name must contain the YYYY-MM-DD date).
# Output: a text file with one JSON record per line.
csv_file_path = 'traj_fixed_time_2024-01-31.csv'
output_file_path = 'fixed_traj_new.json' 

# `result` is a list of JSON strings, one per CSV row.
result = convert_csv_to_dicts(csv_file_path)

write_dicts_to_text(result, output_file_path)


# NOTE(review): this message is printed unconditionally, including when
# `result` is empty and write_dicts_to_text wrote nothing.
print(f"Data has been written to {output_file_path}.")
# print(result[0])

Data has been written to fixed_traj_new.json.


### Mapping DriverID to ints

In [7]:
import json

# Step 1: Read the JSON-lines input (one JSON object per line)
# NOTE(review): the "csv to json" cell writes 'fixed_traj_new.json'; confirm
# that 'new.json' is the intended input here.
json_file_path = 'new.json'
json_file_path2 = 'new_mapped.json'

with open(json_file_path, 'r', encoding='utf-8') as input_file:
    # Skip blank lines (e.g. a trailing empty line) instead of failing to parse them
    data = [json.loads(line) for line in input_file if line.strip()]

# Step 2: Collect the unique driver IDs in first-seen order.
# dict.fromkeys de-duplicates while preserving order, so the integer mapping
# is reproducible from run to run (iterating a set of strings is not).
driver_ids = list(dict.fromkeys(entry['driverID'] for entry in data))

# Step 3: Map each unique driver ID to an integer, starting at 1
driver_id_map = {driver_id: idx for idx, driver_id in enumerate(driver_ids, start=1)}

# Step 4: Build copies of the records with the driver ID replaced by its integer
# (the originals in `data` are left untouched; key order is preserved)
mapped_data = [{**entry, 'driverID': driver_id_map[entry['driverID']]} for entry in data]

# Step 5: Write the mapped records to the separate output file, one compact JSON object per line
with open(json_file_path2, 'w', encoding='utf-8') as output_file:
    output_file.writelines(
        json.dumps(entry, separators=(',', ':')) + '\n' for entry in mapped_data
    )


### DeepTTE Calculations 

In [None]:
import json
import statistics

# Flattened per-ping values across all trips, plus per-trip totals.
all_time_gaps = []
all_lats = []
all_lngs = []
all_dist_gaps = []
all_dists = []
all_times = []

for row in result:
    # convert_csv_to_dicts returns JSON *strings*; decode them before indexing
    # by key. Already-decoded dicts are accepted as-is.
    record = json.loads(row) if isinstance(row, str) else row

    all_time_gaps.extend(record['time_gap'])
    all_lats.extend(record['lats'])
    all_lngs.extend(record['lngs'])
    all_dist_gaps.extend(record['dist_gap'])
    all_dists.append(record['dist'])
    all_times.append(record['time'])

# Calculate the standard deviations and means for the collected data
# (statistics.stdev is the sample standard deviation and needs >= 2 values)
time_gap_std = statistics.stdev(all_time_gaps)
time_gap_mean = statistics.mean(all_time_gaps)

lats_std = statistics.stdev(all_lats)
lats_mean = statistics.mean(all_lats)

lngs_std = statistics.stdev(all_lngs)
lngs_mean = statistics.mean(all_lngs)

dist_gap_std = statistics.stdev(all_dist_gaps)
dist_gap_mean = statistics.mean(all_dist_gaps)

dist_std = statistics.stdev(all_dists)
dist_mean = statistics.mean(all_dists)

time_std = statistics.stdev(all_times)
time_mean = statistics.mean(all_times)

# Output the results
print(f"Time gap standard deviation: {time_gap_std}")
print(f"Time gap mean: {time_gap_mean}")
print(f"Latitude standard deviation: {lats_std}")
print(f"Latitude mean: {lats_mean}")
print(f"Longitude standard deviation: {lngs_std}")
print(f"Longitude mean: {lngs_mean}")
print(f"Distance gap standard deviation: {dist_gap_std}")
print(f"Distance gap mean: {dist_gap_mean}")
print(f"Distance standard deviation: {dist_std}")
print(f"Distance mean: {dist_mean}")
print(f"Time standard deviation: {time_std}")
print(f"Time mean: {time_mean}")

### Convert String to Hex

In [11]:
# # hex_driver_id = row['driver_id'].encode().hex()
# id = "0012cf835ee80e59fefbe618282b2edc082940ddba6a4658e2626801026e2399"
# print(id.encode.hex())

# id = "0012cf835ee80e59fefbe618282b2edc082940ddba6a4658e2626801026e2399"
# hex_value = id.encode("utf-8").hex()  # Encoding and converting to hex
# print(hex_value)

# A driver hash, written as a string of hexadecimal digits
hex_str = "0012cf835ee80e59fefbe618282b2edc082940ddba6a4658e2626801026e2399"

# Decode the hex digits into the raw bytes they spell out
hex_bytes = bytes.fromhex(hex_str)

# Encode the bytes as hex text again (demonstration only; `hex_bytes` is usable as is)
hex_str_from_bytes = bytes.hex(hex_bytes)

print(hex_str_from_bytes)  # Round trip: prints the same text as `hex_str`



0012cf835ee80e59fefbe618282b2edc082940ddba6a4658e2626801026e2399


### GPU File Processing

In this modified script:

1. We import cudf and cupy instead of pandas and numpy, respectively.
2. We use cudf.read_csv to read the CSV file into a cuDF DataFrame.
3. We perform computations on GPU where applicable, such as distance calculations, using cupy arrays.
4. We convert cuDF Series to arrays with the `to_array()` method when necessary.
5. We utilize GPU-accelerated operations provided by cuDF and cupy for efficient data processing.

> Please make sure to review the cuDF documentation for additional details and functionalities: https://docs.rapids.ai/api/cudf/stable/

In [None]:
pip install cudf

In [None]:
import cudf
import cupy as cp
from datetime import datetime
from geopy.distance import geodesic

# Input pings file and the trajectories CSV this cell produces
input_file = 'Anon_Pings/anon_pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'trajectories-01-31.csv'

# Read CSV file into a cuDF DataFrame
df = cudf.read_csv(input_file)

# Convert timestamp column to datetime: the raw values are divided by 1000 and
# interpreted as seconds (presumably they are epoch milliseconds - TODO confirm)
df['location_read_at'] = cudf.to_datetime(df['location_read_at'] / 1000, unit='s')

# Function to calculate distance between two points (geopy runs on the CPU)
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance, in kilometers, between two (lat, lon) points.

    Coordinates are given in decimal degrees. They are passed to geopy as
    plain Python floats and are NOT converted to radians: ``geodesic``
    expects degrees, so a radians conversion would give wrong distances.
    """
    origin = (float(lat1), float(lon1))
    destination = (float(lat2), float(lon2))
    return geodesic(origin, destination).kilometers

# Function to calculate time difference in seconds
def calculate_time_difference(time1, time2):
    """Return how many seconds ``time2`` lies after ``time1`` (negative if before)."""
    delta = time2 - time1
    seconds = delta.total_seconds()
    return seconds

# Builds one trajectory row per booking from the cuDF pings frame `df` and
# writes the result to `output_file`.
# NOTE(review): this cell has no recorded execution; the cuDF/cuPy calls below
# should be checked against the installed RAPIDS version before relying on it.

# Initialize list to store trip data (one list per booking)
trip_data = []

# Group data by booking_id to process trips separately
grouped = df.groupby('booking_id')

# Iterate over each trip
for booking_id, group in grouped:
    # Sort pings by timestamp
    group = group.sort_values(by='location_read_at')
    
    # Extract trip information: driver and start time come from the first ping
    driver_id = group['driver_id'].iloc[0]
    time_id = group['location_read_at'].iloc[0]
    # NOTE(review): confirm Series.to_array() is available in the cuDF version in use
    lngs = group['longitude'].to_array()
    lats = group['latitude'].to_array()
    
    # dist_gaps[i] will hold the cumulative distance reached at ping i; index 0 stays 0
    dist_gaps = cp.zeros_like(lats)
    prev_lat = lats[0]
    prev_lng = lngs[0]
    cum_dist = 0
    
    # Accumulate the distance between consecutive pings, one pair at a time,
    # in a plain Python loop
    for i in range(1, len(lats)):
        lat = lats[i]
        lng = lngs[i]
        dist = calculate_distance(prev_lat, prev_lng, lat, lng)
        cum_dist += dist
        dist_gaps[i] = cum_dist
        prev_lat = lat
        prev_lng = lng
    
    total_dist = cum_dist
    
    # Total time: seconds from the first ping to the last ping
    total_time = calculate_time_difference(group['location_read_at'].iloc[0], group['location_read_at'].iloc[-1])
    
    # Time gaps: seconds elapsed since the first ping, one value per ping
    # NOTE(review): confirm .dt.total_seconds() is supported on cuDF timedelta Series
    time_gaps = (group['location_read_at'] - time_id).dt.total_seconds().to_array()
    
    # Append trip data to list (order must match the column list used below)
    trip_data.append([booking_id, driver_id, time_id] + [lngs, lats, total_dist, total_time, time_gaps, dist_gaps])

# Create cuDF DataFrame from trip data
# NOTE(review): several columns hold whole arrays per row - verify cuDF accepts this
output_df = cudf.DataFrame(trip_data, columns=['booking_id', 'driver_id', 'time_id', 'lngs', 'lats', 'dist', 'time', 'time_gap', 'dist_gap'])

# Write output cuDF DataFrame to CSV
output_df.to_csv(output_file, index=False)


### Check the distance travelled from another file

In [5]:
import csv

def filter_csv_by_booking_id(csv_file, booking_id):
    """Print the ``distance_travelled_km`` of the first row matching ``booking_id``.

    Args:
        csv_file: path to a CSV with ``booking_id`` and
            ``distance_travelled_km`` columns.
        booking_id: booking identifier to look up.

    Prints the distance when a matching row with a value is found, otherwise
    a "not found" message. Returns nothing.
    """
    with open(csv_file, 'r', newline='') as file:
        # Lazily scan the rows; next() stops reading at the first match.
        matching_distances = (
            record['distance_travelled_km']
            for record in csv.DictReader(file)
            if record['booking_id'] == booking_id
        )
        distance_travelled_km = next(matching_distances, None)

    if distance_travelled_km is None:
        print(f"No distance travelled found for booking_id {booking_id}")
    else:
        print(f"Distance travelled for booking_id {booking_id}: {distance_travelled_km} km")

# Example usage: look up the recorded distance for one booking in the pooling file
input_csv_file = 'Pooling/anon_pooling_jan_24_amman.csv'
desired_booking_id = '29c5e8211f059fed952cc810e964c523e727221d0bd669001bb75c6ebd85f913'  # Replace with the desired booking_id
filter_csv_by_booking_id(input_csv_file, desired_booking_id)


Distance travelled for booking_id 29c5e8211f059fed952cc810e964c523e727221d0bd669001bb75c6ebd85f913: 11.79921054840088 km


### Creating a Test CSV file with n trips

In [7]:
import csv
import os.path

def filter_entries(input_file, output_file, booking_ids):
    """Append the rows of ``input_file`` whose ``booking_id`` is wanted to ``output_file``.

    Args:
        input_file: CSV to read; must have a ``booking_id`` column.
        output_file: CSV to append to. A header row (the input's column
            names) is written first when the file is missing or empty.
        booking_ids: collection of booking IDs to keep.

    The writer always uses the input file's column names, so appending to an
    existing output file works as well as creating a new one.
    """
    # A set gives constant-time membership tests on large ping files. A bare
    # string is left alone so `in` keeps its original meaning for it.
    if not isinstance(booking_ids, (str, bytes)):
        booking_ids = set(booking_ids)

    # Decide about the header before opening in append mode creates the file.
    needs_header = not os.path.isfile(output_file) or os.path.getsize(output_file) == 0

    with open(input_file, 'r', newline='') as source, \
            open(output_file, 'a', newline='') as target:
        reader = csv.DictReader(source)
        fieldnames = reader.fieldnames or []  # None when the input is empty
        writer = csv.DictWriter(target, fieldnames=fieldnames)

        if needs_header and fieldnames:
            writer.writeheader()

        for row in reader:
            if row['booking_id'] in booking_ids:
                writer.writerow(row)

# Example usage: copy the pings of three bookings into a small test CSV.
# Note: this rebinds the notebook-level names `input_file` and `output_file`.
input_file = 'Anon_Pings/anon_pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'test.csv'
booking_ids = ['88165aea83997095058b3f6676c1e3bdeedb4802c52afc9c412b1c610713a1ca',
               '3846a90814f7e29b6b0c11717b40afd9fcd86ac7aae41fa9ffe19fbfcc4bfe26',
               '29c5e8211f059fed952cc810e964c523e727221d0bd669001bb75c6ebd85f913']

# The output file is opened in append mode, so re-running this cell adds the rows again.
filter_entries(input_file, output_file, booking_ids)


### Test the distance functions:

In [21]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime


# Function to calculate distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """Geodesic distance in kilometers from (lat1, lon1) to (lat2, lon2)."""
    first_point, second_point = (lat1, lon1), (lat2, lon2)
    measured = geodesic(first_point, second_point)
    return measured.kilometers

# Sample trajectory used to sanity-check calculate_distance (22 points).
lats = [30.64392,30.642129,30.64393,30.640667,30.637807,30.634062,30.630342,30.62768,30.624637,30.622056,30.620839,30.62065,30.620698,30.620622,30.620588,30.621499,30.625048,30.625105,30.625109,30.624056,30.623248,30.626844]
lngs = [104.115353,104.113091,104.110404,104.108335,104.106304,104.104013,104.101653,104.100465,104.097907,104.095813,104.091939,104.087057,104.083797,104.080276,104.076107,104.071857,104.072423,104.072982,104.073218,104.076707,104.076795,104.076552]

# Per-segment distances between consecutive points, with a leading 0 for the
# first point. Note these are NOT cumulative sums.
dist_gaps = [0] + [calculate_distance(lats[i], lngs[i], lats[i + 1], lngs[i + 1]) for i in range(len(lats)-1)]

# print(dist_gaps)

# result1: first point -> second point; result2: second -> third;
# result3: first point -> third point measured directly.
result1 = calculate_distance(30.64392, 104.115353, 30.642129, 104.113091)
result2 = calculate_distance(30.642129, 104.113091, 30.64393, 104.110404)
result3 = calculate_distance(30.64392, 104.115353, 30.64393, 104.110404)
print(result1)
print(result2)
# Path length through the second point, to compare with the direct distance below.
print(result1+result2)
print(result3)


0.29400740273246023
0.325896974688779
0.6199043774212392
0.47439900315363936
