### Main Script: Converting File Shapes

#### Trial 4

In [8]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime

# Read the input CSV file
input_file = 'Anon_Pings/anon_pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'trajectories-01-31.csv'

df = pd.read_csv(input_file)

# `location_read_at` is an epoch timestamp in milliseconds. Convert it with
# unit='ms' directly: dividing by 1000 first routes the value through floating
# point and leaves rounding noise in the sub-second part of every timestamp
# (e.g. ...02.746999979 instead of ...02.747).
df['location_read_at'] = pd.to_datetime(df['location_read_at'], unit='ms')

def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in kilometres between two (lat, lon) points.

    The coordinates are handed to geopy's ``geodesic`` unchanged, as
    ``(latitude, longitude)`` pairs.
    """
    origin = (lat1, lon1)
    destination = (lat2, lon2)
    return geodesic(origin, destination).kilometers

def calculate_time_difference(time1, time2):
    """Return the number of seconds from ``time1`` to ``time2``.

    The result is negative when ``time2`` is earlier than ``time1``.
    """
    elapsed = time2 - time1
    return elapsed.total_seconds()

# One output row per trip is collected here
trip_data = []

# Every booking_id is treated as one trip
grouped = df.groupby('booking_id')

for booking_id, pings in grouped:
    # Pings must be in chronological order before distances/gaps are derived
    pings = pings.sort_values(by='location_read_at')
    stamps = pings['location_read_at']

    # Trip-level attributes come from the first ping
    driver_id = pings['driver_id'].iloc[0]
    time_id = stamps.iloc[0]
    lngs = pings['longitude'].tolist()
    lats = pings['latitude'].tolist()

    # dist_gaps[i] is the cumulative distance (km) travelled up to ping i
    dist_gaps = [0]
    points = list(zip(lats, lngs))
    for (lat_a, lng_a), (lat_b, lng_b) in zip(points, points[1:]):
        leg = calculate_distance(lat_a, lng_a, lat_b, lng_b)
        dist_gaps.append(dist_gaps[-1] + leg)

    # The last cumulative value is the total trip distance
    total_dist = dist_gaps[-1]

    # Seconds between the first and the last ping
    total_time = (stamps.iloc[-1] - stamps.iloc[0]).total_seconds()

    # Seconds elapsed since the first ping, for every ping
    time_gaps = []
    for stamp in stamps:
        time_gaps.append((stamp - time_id).total_seconds())

    trip_data.append([
        booking_id, driver_id, time_id,
        lngs, lats,
        total_dist, total_time,
        time_gaps, dist_gaps,
    ])

# Assemble the per-trip table and persist it
output_columns = ['booking_id', 'driver_id', 'time_id', 'lngs', 'lats', 'dist', 'time', 'time_gap', 'dist_gap']
output_df = pd.DataFrame(trip_data, columns=output_columns)
output_df.to_csv(output_file, index=False)


#### fix lost pings trial 1

In [None]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime

# Read the input CSV file
input_file = 'Anon_Pings/anon_pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'traj_by_driver.csv'

df = pd.read_csv(input_file)

# `location_read_at` is an epoch timestamp in milliseconds. Convert it with
# unit='ms' directly: dividing by 1000 first routes the value through floating
# point and leaves rounding noise in the sub-second part of every timestamp.
df['location_read_at'] = pd.to_datetime(df['location_read_at'], unit='ms')

def calculate_distance(lat1, lon1, lat2, lon2):
    """Geodesic distance, in kilometres, from (lat1, lon1) to (lat2, lon2)."""
    start_point, end_point = (lat1, lon1), (lat2, lon2)
    separation = geodesic(start_point, end_point)
    return separation.kilometers

def calculate_time_difference(time1, time2):
    """Seconds elapsed between ``time1`` and ``time2`` (``time2 - time1``)."""
    delta = time2 - time1
    seconds = delta.total_seconds()
    return seconds

# Initialize list to store trip data
trip_data = []

# Group by driver ID
grouped = df.groupby('hash_driver_id')

# Placeholder booking ID carried by pings whose real booking ID was lost
null_booking_id = "9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0"

for driver_id, group in grouped:
    # "Previous ping" is only meaningful in chronological order; a stable sort
    # keeps the file order for pings that share a timestamp.
    group = group.sort_values(by='location_read_at', kind='stable')

    # A lost ping is a single row that is both "In ride" and carries the
    # placeholder ID (checked per row, not across the whole group).
    is_lost = (group['status'] == "In ride") & (group['hash_booking_id'] == null_booking_id)
    if not is_lost.any():
        continue

    # Effective booking ID of the preceding ping, i.e. after any repair, so a
    # run of consecutive lost pings all inherit the same real booking ID.
    previous_booking_id = None
    for index, booking_id, lost in zip(group.index, group['hash_booking_id'], is_lost):
        can_inherit = (
            previous_booking_id is not None
            and pd.notna(previous_booking_id)
            and previous_booking_id != null_booking_id
        )
        if lost and can_inherit:
            # Write to the original frame; `group` is only a sorted copy
            df.at[index, 'hash_booking_id'] = previous_booking_id
        else:
            previous_booking_id = booking_id

print(df)

### Extracting date and time from the csv file name

In [None]:
import re
from datetime import datetime

def extract_date_from_filename(filename):
    """Return the first ``YYYY-MM-DD`` substring found in ``filename``.

    Raises:
        ValueError: if the filename contains no such date.
    """
    match = re.search(r"\d{4}-\d{2}-\d{2}", filename)
    if match is None:
        raise ValueError("Date not found in filename. Expected format: trajectories-YYYY-MM-DD.csv")
    return match.group(0)

def day_of_week(date_str):
    """Describe the calendar day of a ``YYYY-MM-DD`` date string.

    Returns:
        A tuple ``(day_index, day_name, day_of_month)`` where ``day_index`` is
        0 for Monday through 6 for Sunday, ``day_name`` is the English weekday
        name, and ``day_of_month`` is zero-based (0 for the 1st of the month).

    Raises:
        ValueError: if ``date_str`` is not a valid ``YYYY-MM-DD`` date.
    """
    weekday_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

    parsed = datetime.strptime(date_str, "%Y-%m-%d")
    weekday = parsed.weekday()

    # Day of the month is shifted to start at 0
    return weekday, weekday_names[weekday], parsed.day - 1

# Filename to analyse; it is expected to embed a YYYY-MM-DD date
filename = "trajectories-2024-01-31.csv"

try:
    # Filename -> date string -> weekday index, weekday name, zero-based day of month
    date_str = extract_date_from_filename(filename)
    index, name, day = day_of_week(date_str)

    # Report what was derived
    print(f"The day of the week for {date_str} is {name}, with an index of {index}.")
    print(f"The day of the month is {day}.")
except ValueError as error:
    # Raised for a filename without a date or for an invalid date
    print(error)


In [None]:
import re
from datetime import datetime, timedelta

def extract_date_from_filename(filename):
    """Pull the first date written as ``YYYY-MM-DD`` out of ``filename``.

    Raises:
        ValueError: when no date of that form appears in the filename.
    """
    dates_found = re.findall(r"\d{4}-\d{2}-\d{2}", filename)
    if not dates_found:
        raise ValueError("Date not found in filename. Expected format: trajectories-YYYY-MM-DD.csv")
    # Only the left-most occurrence is used
    return dates_found[0]

def day_info(date_str):
    """Return weekday index, weekday name and zero-based day of month.

    Args:
        date_str: a date formatted as ``YYYY-MM-DD``.

    Returns:
        ``(day_index, day_name, day_of_month)``: Monday is index 0 and Sunday
        is index 6; ``day_of_month`` runs from 0 (the 1st) to 30 (the 31st).

    Raises:
        ValueError: if ``date_str`` cannot be parsed as ``YYYY-MM-DD``.
    """
    date = datetime.strptime(date_str, "%Y-%m-%d")

    days_of_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    day_index = date.weekday()
    day_name = days_of_week[day_index]

    zero_based_day = date.day - 1
    return day_index, day_name, zero_based_day

def time_id_from_timestamp(timestamp_str):
    """Return the minute of the day (0 to 1439) for a timestamp string.

    Fractional seconds are stripped before parsing, so both
    ``YYYY-MM-DD HH:MM:SS`` and ``YYYY-MM-DD HH:MM:SS.ffffff...`` are accepted.

    Raises:
        ValueError: if the cleaned string is not ``YYYY-MM-DD HH:MM:SS``.
    """
    # strptime has no directive for nanosecond fractions, so drop them first
    without_fraction = re.sub(r"\.\d+", "", timestamp_str)
    moment = datetime.strptime(without_fraction, "%Y-%m-%d %H:%M:%S")
    return moment.hour * 60 + moment.minute

# Example input filename; the date is taken from its name
filename = "trajectories-2024-01-31.csv"

try:
    # Filename -> date string -> weekday index, weekday name, zero-based day of month
    date_str = extract_date_from_filename(filename)
    index, name, day_of_month = day_info(date_str)

    # Report the day information
    print(f"The day of the week for {date_str} is {name}, with an index of {index}.")
    print(f"The day of the month is {day_of_month}.")

    # Sample timestamp with nanosecond precision, used to exercise the time ID
    timestamp_str = "2024-01-31 05:48:02.746999979"
    time_id = time_id_from_timestamp(timestamp_str)

    # Report the minute-of-day ID
    print(f"The time ID for the timestamp {timestamp_str} is {time_id}.")
except ValueError as error:
    # Raised for a missing or invalid date, or an unparseable timestamp
    print(error)


### Current: Convert CSV to Dictionaries

#### Trial 2

In [None]:
import csv
import re
from datetime import datetime
import json

def hex_conversion_test(hex_str):
    """Round-trip a hexadecimal string through ``bytes`` and back.

    The string is decoded with ``bytes.fromhex`` and re-encoded with
    ``bytes.hex``. Only the re-encoded string is returned; ``bytes.hex``
    always produces lowercase digits.

    Raises:
        ValueError: if ``hex_str`` is not valid hexadecimal.
    """
    return bytes.fromhex(hex_str).hex()

def extract_date_from_filename(filename):
    """Extract the first ``YYYY-MM-DD`` date embedded in ``filename``.

    Raises:
        ValueError: if the filename does not contain a date in that format.
    """
    date_pattern = re.compile(r"\d{4}-\d{2}-\d{2}")
    hit = date_pattern.search(filename)
    if hit is not None:
        return hit.group()
    raise ValueError("Date not found in filename. Expected format: trajectories-YYYY-MM-DD.csv")

def day_of_week(date_str):
    """Return ``(day_index, day_name, day_of_month)`` for a ``YYYY-MM-DD`` date.

    ``day_index`` counts from Monday (0) to Sunday (6), ``day_name`` is the
    English weekday name and ``day_of_month`` is zero-based (0 to 30).

    Raises:
        ValueError: if ``date_str`` is not a valid ``YYYY-MM-DD`` date.
    """
    when = datetime.strptime(date_str, "%Y-%m-%d")

    # isoweekday() counts Monday as 1, so shift it to the 0-based index
    day_index = when.isoweekday() - 1
    day_name = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")[day_index]

    return day_index, day_name, when.day - 1

def time_id_from_timestamp(timestamp_str):
    """Map a timestamp string to its minute of the day (0 to 1439).

    Any fractional-seconds part (``.digits``) is removed before the string is
    parsed as ``YYYY-MM-DD HH:MM:SS``.

    Raises:
        ValueError: if the cleaned string does not match that format.
    """
    whole_seconds_text = re.sub(r"\.\d+", "", timestamp_str)
    parsed = datetime.strptime(whole_seconds_text, "%Y-%m-%d %H:%M:%S")

    minutes_from_hours = parsed.hour * 60
    return minutes_from_hours + parsed.minute

def convert_csv_to_dicts(csv_file_path):
    """Convert a trajectories CSV into one compact JSON string per trip.

    Each row must provide ``booking_id``, ``driver_id``, ``time_id`` (a
    ``YYYY-MM-DD HH:MM:SS[.fraction]`` timestamp), ``dist``, ``time`` and the
    list-valued columns ``time_gap``, ``lats``, ``lngs`` and ``dist_gap``
    written as ``[a, b, ...]``. The week and date IDs are derived from the
    ``YYYY-MM-DD`` date embedded in ``csv_file_path``.

    Args:
        csv_file_path: path of the CSV file; its name must contain the date.

    Returns:
        A list of JSON strings (not dicts, despite the function name), one
        per CSV row, serialised without whitespace.

    Raises:
        ValueError: if the filename holds no date (only raised once a row is
            read), or a numeric/list/timestamp field cannot be parsed.
    """
    def parse_float_list(cell):
        # "[1.0, 2.5]" -> [1.0, 2.5]; "[]" -> [] instead of failing on float('')
        inner = cell.strip().strip('[]').strip()
        if not inner:
            return []
        return [float(item) for item in inner.split(',')]

    result = []
    # newline='' lets the csv module handle line endings itself
    with open(csv_file_path, 'r', newline='') as csv_file:
        csv_reader = csv.DictReader(csv_file)

        # The date depends only on the filename, so it is resolved once, on
        # the first row, rather than re-parsed for every row.
        day_fields = None

        for row in csv_reader:
            time_gap = parse_float_list(row['time_gap'])
            lats = parse_float_list(row['lats'])
            lngs = parse_float_list(row['lngs'])
            dist_gap = parse_float_list(row['dist_gap'])

            if day_fields is None:
                date_str = extract_date_from_filename(csv_file_path)
                week_id, _day_name, date_id = day_of_week(date_str)
                day_fields = (week_id, date_id)
            week_id, date_id = day_fields

            # Minute of the day at which the trip started
            time_id = time_id_from_timestamp(row['time_id'])

            new_dict = {
                'trip_id': row['booking_id'],
                'time_gap': time_gap,
                'dist': float(row['dist']),
                'lats': lats,
                'driverID': row['driver_id'],
                'weekID': week_id,
                'timeID': time_id,
                'dateID': date_id,
                'time': float(row['time']),
                'lngs': lngs,
                'dist_gap': dist_gap
            }
            result.append(json.dumps(new_dict, separators=(',', ':')))
    return result

def write_dicts_to_text(data, output_file_path):
    """Write pre-serialised JSON strings to a text file, one per line.

    Args:
        data: sequence of strings (each already a JSON document).
        output_file_path: destination path; overwritten if it exists.

    When ``data`` is empty, a notice is printed and no file is created.
    """
    if not data:
        print("No data to write.")
        return

    # UTF-8 is set explicitly so the output does not depend on the platform default
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for entry in data:
            # Entries are not echoed to stdout: printing every trip floods the
            # notebook output without adding information.
            output_file.write(entry + "\n")


# Input trajectories CSV and the output file (one JSON document per line)
csv_file_path = 'trajectories-2024-01-31.csv'
output_file_path = 'new.json'

# CSV rows -> JSON strings -> text file
result = convert_csv_to_dicts(csv_file_path)
write_dicts_to_text(result, output_file_path)

# Confirm where the output went
print(f"Data has been written to {output_file_path}.")

In [7]:
import json

# Step 1: Read data from the existing JSON-lines file
json_file_path = 'new.json'
json_file_path2 = 'new_mapped.json'

# Each non-blank line holds one JSON object. The handle is not called
# `input_file`, so the path variable of that name used by other cells survives.
with open(json_file_path, 'r', encoding='utf-8') as source:
    data = [json.loads(line) for line in source if line.strip()]

# Step 2: Extract unique driver IDs
driver_ids = {entry['driverID'] for entry in data}

# Step 3: Map every driver ID to an integer starting at 1. The IDs are sorted
# first because set iteration order for strings changes from run to run, which
# would otherwise make the mapping irreproducible.
driver_id_map = {driver_id: idx for idx, driver_id in enumerate(sorted(driver_ids), start=1)}

# Step 4: Apply the mapping to copies of the entries (originals are left untouched)
mapped_data = [
    {**entry, 'driverID': driver_id_map[entry['driverID']]}
    for entry in data
]

# Step 5: Write the mapped entries, one compact JSON object per line
with open(json_file_path2, 'w', encoding='utf-8') as sink:
    for entry in mapped_data:
        sink.write(json.dumps(entry, separators=(',', ':')) + '\n')


Calculations

In [None]:
import statistics

import json  # needed here to decode rows that arrive as JSON strings

# `result` comes from a convert_csv_to_dicts cell. Depending on which trial was
# run last it holds either dicts or JSON strings, so strings are decoded first
# (indexing a JSON string by key would raise TypeError).
rows = [json.loads(row) if isinstance(row, str) else row for row in result]

all_time_gaps = []
all_lats = []
all_lngs = []
all_dist_gaps = []
all_dists = []
all_times = []

# Per-ping values are pooled across trips; per-trip values are collected one per trip
for row in rows:
    all_time_gaps.extend(row['time_gap'])
    all_lats.extend(row['lats'])
    all_lngs.extend(row['lngs'])
    all_dist_gaps.extend(row['dist_gap'])
    all_dists.append(row['dist'])
    all_times.append(row['time'])

# Sample standard deviation and mean of every collected series
# (statistics.stdev needs at least two data points per series)
time_gap_std = statistics.stdev(all_time_gaps)
time_gap_mean = statistics.mean(all_time_gaps)

lats_std = statistics.stdev(all_lats)
lats_mean = statistics.mean(all_lats)

lngs_std = statistics.stdev(all_lngs)
lngs_mean = statistics.mean(all_lngs)

dist_gap_std = statistics.stdev(all_dist_gaps)
dist_gap_mean = statistics.mean(all_dist_gaps)

dist_std = statistics.stdev(all_dists)
dist_mean = statistics.mean(all_dists)

time_std = statistics.stdev(all_times)
time_mean = statistics.mean(all_times)

# Output the results
print(f"Time gap standard deviation: {time_gap_std}")
print(f"Time gap mean: {time_gap_mean}")
print(f"Latitude standard deviation: {lats_std}")
print(f"Latitude mean: {lats_mean}")
print(f"Longitude standard deviation: {lngs_std}")
print(f"Longitude mean: {lngs_mean}")
print(f"Distance gap standard deviation: {dist_gap_std}")
print(f"Distance gap mean: {dist_gap_mean}")
print(f"Distance standard deviation: {dist_std}")
print(f"Distance mean: {dist_mean}")
print(f"Time standard deviation: {time_std}")
print(f"Time mean: {time_mean}")

#### Trial 1

In [None]:
import csv

def _floats_from_list_cell(cell):
    """Parse a ``[a, b, ...]`` CSV cell into a list of floats (``[]`` -> ``[]``)."""
    inner = cell.strip().strip('[]').strip()
    if not inner:
        return []
    return [float(item) for item in inner.split(',')]


def _driver_number(raw_driver_id):
    """Return the integer suffix of ``<prefix>_<digits>``, or None if malformed."""
    parts = raw_driver_id.split('_')
    if len(parts) == 2 and parts[1].isdigit():
        return int(parts[1])
    return None


def _time_fields(raw_time_id):
    """Split a ``<date>-W<week>-<time>`` ID into ``(week_id, time_id, date_id)``.

    The layout is inferred from how the original code indexed the pieces
    (TODO confirm against the producer of this column). Any value that does
    not match yields ``(None, None, None)``.
    """
    halves = raw_time_id.split('-W')
    if len(halves) != 2:
        return None, None, None
    date_part = halves[0]
    tail = halves[1].split('-')
    # Require exactly "<week>-<time>" after the "-W" marker, all numeric
    if len(tail) != 2 or not (date_part.isdigit() and tail[0].isdigit() and tail[1].isdigit()):
        return None, None, None
    return int(tail[0]), int(tail[1]), int(date_part)


def convert_csv_to_dicts(csv_file_path):
    """Convert a trajectories CSV into a list of per-trip dictionaries.

    Each row must provide ``driver_id``, ``time_id``, ``dist``, ``time`` and
    the list-valued columns ``time_gap``, ``lats``, ``lngs`` and ``dist_gap``
    written as ``[a, b, ...]``.

    ``driverID`` is the integer suffix of a ``<prefix>_<digits>`` driver ID.
    ``weekID``, ``timeID`` and ``dateID`` are parsed from a
    ``<date>-W<week>-<time>`` time ID. Fields whose source value does not
    match the expected layout are set to None.

    Returns:
        A list with one dict per CSV row.

    Raises:
        ValueError: if a numeric or list field cannot be parsed as float.
    """
    result = []
    # newline='' lets the csv module handle line endings itself
    with open(csv_file_path, 'r', newline='') as csv_file:
        for row in csv.DictReader(csv_file):
            time_gap = _floats_from_list_cell(row['time_gap'])
            lats = _floats_from_list_cell(row['lats'])
            lngs = _floats_from_list_cell(row['lngs'])
            dist_gap = _floats_from_list_cell(row['dist_gap'])

            driver_id = _driver_number(row['driver_id'])
            week_id, time_id, date_id = _time_fields(row['time_id'])

            result.append({
                'time_gap': time_gap,
                'dist': float(row['dist']),
                'lats': lats,
                'driverID': driver_id,
                'weekID': week_id,
                'timeID': time_id,
                'dateID': date_id,
                'time': float(row['time']),
                'lngs': lngs,
                'dist_gap': dist_gap
            })
    return result

# Trajectories CSV to convert
csv_file_path = 'trajectories-01-31.csv'

# One dictionary per trip
result = convert_csv_to_dicts(csv_file_path)

# Show the first converted trip as a format check
first_entry = result[0]
print(first_entry)

### Convert Str to Hexa

In [11]:
# # hex_driver_id = row['driver_id'].encode().hex()
# id = "0012cf835ee80e59fefbe618282b2edc082940ddba6a4658e2626801026e2399"
# print(id.encode.hex())

# id = "0012cf835ee80e59fefbe618282b2edc082940ddba6a4658e2626801026e2399"
# hex_value = id.encode("utf-8").hex()  # Encoding and converting to hex
# print(hex_value)

# Sample ID, written as a hexadecimal string
hex_str = "0012cf835ee80e59fefbe618282b2edc082940ddba6a4658e2626801026e2399"

# Decode the hex digits into raw bytes
hex_bytes = bytes.fromhex(hex_str)

# Re-encode the bytes as hex; this only demonstrates the round trip,
# `hex_bytes` can be used as is
hex_str_from_bytes = bytes.hex(hex_bytes)

# Expected to match `hex_str`
print(hex_str_from_bytes)



0012cf835ee80e59fefbe618282b2edc082940ddba6a4658e2626801026e2399


### GPU File Processing

In this modified script:

1. We import cudf and cupy instead of pandas and numpy, respectively.
2. We use cudf.read_csv to read the CSV file into a cuDF DataFrame.
3. We perform computations on GPU where applicable, such as distance calculations, using cupy arrays.
4. We convert cuDF Series to cupy arrays using to_array() method when necessary.
5. We utilize GPU-accelerated operations provided by cuDF and cupy for efficient data processing.

> Please make sure to review the cuDF documentation for additional details and functionalities: https://docs.rapids.ai/api/cudf/stable/

In [None]:
pip install cudf

In [None]:
import cudf
import cupy as cp
from datetime import datetime
from geopy.distance import geodesic

# Read the input CSV file
input_file = 'Anon_Pings/anon_pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'trajectories-01-31.csv'

# Read CSV file into a cuDF DataFrame
df = cudf.read_csv(input_file)

# `location_read_at` is an epoch timestamp in milliseconds. Convert it with
# unit='ms' directly: dividing by 1000 first routes the value through floating
# point and leaves rounding noise in the sub-second part of every timestamp.
df['location_read_at'] = cudf.to_datetime(df['location_read_at'], unit='ms')

def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in kilometres between two (lat, lon) points.

    geopy's ``geodesic`` runs on the CPU and expects coordinates in degrees,
    so the inputs are passed through as plain floats. Converting them to
    radians first (as this function used to) makes geopy read the radian
    values as degrees and return wrong distances.

    Args:
        lat1, lon1: latitude/longitude of the first point, in degrees.
        lat2, lon2: latitude/longitude of the second point, in degrees.
    """
    # float() also unwraps NumPy/CuPy scalar values before they reach geopy
    start = (float(lat1), float(lon1))
    end = (float(lat2), float(lon2))
    return geodesic(start, end).kilometers

def calculate_time_difference(time1, time2):
    """Return ``time2 - time1`` expressed in seconds."""
    span = time2 - time1
    return span.total_seconds()

import pandas as pd  # host-side frames for the per-trip loop and the final CSV

# Initialize list to store trip data
trip_data = []

# Sort all pings by trip and timestamp in one pass on the GPU, then copy the
# result to host memory. The per-trip work below calls geopy's geodesic, which
# is CPU-only Python, so GPU arrays would only add transfer overhead there.
# This also avoids Series.to_array(), which current cuDF releases no longer provide.
pings = df.sort_values(by=['booking_id', 'location_read_at']).to_pandas()

# Group data by booking_id to process trips separately
grouped = pings.groupby('booking_id')

# Iterate over each trip (pings inside a group are already in time order)
for booking_id, group in grouped:
    stamps = group['location_read_at']

    # Trip-level attributes come from the first ping
    driver_id = group['driver_id'].iloc[0]
    time_id = stamps.iloc[0]
    lngs = group['longitude'].tolist()
    lats = group['latitude'].tolist()

    # dist_gaps[i] is the cumulative distance (km) travelled up to ping i
    dist_gaps = [0.0]
    cum_dist = 0.0
    for i in range(1, len(lats)):
        cum_dist += calculate_distance(lats[i - 1], lngs[i - 1], lats[i], lngs[i])
        dist_gaps.append(cum_dist)

    total_dist = cum_dist

    # Seconds between the first and the last ping
    total_time = calculate_time_difference(time_id, stamps.iloc[-1])

    # Seconds elapsed since the first ping, for every ping
    time_gaps = (stamps - time_id).dt.total_seconds().tolist()

    # Plain Python lists are written to the CSV as "[a, b, ...]"
    trip_data.append([booking_id, driver_id, time_id, lngs, lats, total_dist, total_time, time_gaps, dist_gaps])

# Create DataFrame from trip data
output_df = pd.DataFrame(trip_data, columns=['booking_id', 'driver_id', 'time_id', 'lngs', 'lats', 'dist', 'time', 'time_gap', 'dist_gap'])

# Write output DataFrame to CSV
output_df.to_csv(output_file, index=False)


### Check the distance travelled from another file

In [5]:
import csv

def filter_csv_by_booking_id(csv_file, booking_id):
    """Print the ``distance_travelled_km`` of the first row matching ``booking_id``.

    Only the first matching row is used. If no row matches, or the matching
    row has no distance value, a "not found" message is printed instead.
    Nothing is returned.
    """
    with open(csv_file, 'r', newline='') as file:
        # Lazily yields the distance of every matching row; only the first is taken
        matching_distances = (
            record['distance_travelled_km']
            for record in csv.DictReader(file)
            if record['booking_id'] == booking_id
        )
        distance_travelled_km = next(matching_distances, None)

        if distance_travelled_km is None:
            print(f"No distance travelled found for booking_id {booking_id}")
        else:
            print(f"Distance travelled for booking_id {booking_id}: {distance_travelled_km} km")

# Example usage: look up the recorded distance of one booking
input_csv_file = 'Pooling/anon_pooling_jan_24_amman.csv'
# Booking whose recorded distance should be printed
desired_booking_id = '29c5e8211f059fed952cc810e964c523e727221d0bd669001bb75c6ebd85f913'
filter_csv_by_booking_id(csv_file=input_csv_file, booking_id=desired_booking_id)


Distance travelled for booking_id 29c5e8211f059fed952cc810e964c523e727221d0bd669001bb75c6ebd85f913: 11.79921054840088 km


### Unix to DateTime

In [None]:
# Import the class, not the module: a bare `import datetime` would rebind the
# notebook-wide name `datetime` to the module and break every other cell that
# calls `datetime.strptime(...)`.
from datetime import datetime, timezone

# Get the Unix timestamp
unix_timestamp = 1609459200  # Replace with your actual Unix timestamp

# Convert to datetime. The timezone is set to UTC explicitly so the result
# does not depend on the local timezone of the machine running the notebook.
dt = datetime.fromtimestamp(unix_timestamp, tz=timezone.utc)

# Print the datetime in a desired format
print(dt.strftime("%Y-%m-%d %H:%M:%S"))

### Creating a Test CSV file with n trips

In [7]:
import csv
import os.path

def filter_entries(input_file, output_file, booking_ids):
    """Append the rows of ``input_file`` whose ``booking_id`` is wanted to ``output_file``.

    The output file is opened in append mode. A header row is written only
    when the output file does not exist yet or is empty, so repeated calls
    keep extending the same file.

    Args:
        input_file: path of the source CSV; must have a ``booking_id`` column.
        output_file: path of the CSV to append to.
        booking_ids: iterable of booking IDs to keep.
    """
    wanted = set(booking_ids)  # constant-time membership test per row
    needs_header = not os.path.isfile(output_file) or os.path.getsize(output_file) == 0

    with open(input_file, 'r', newline='') as source:
        reader = csv.DictReader(source)
        fieldnames = reader.fieldnames
        if not fieldnames:
            # Empty input: there is no header and nothing to copy
            return

        with open(output_file, 'a', newline='') as sink:
            # The writer always gets the real column names. With an empty
            # fieldnames list, every writerow() on an existing output file
            # raised ValueError.
            writer = csv.DictWriter(sink, fieldnames=fieldnames)
            if needs_header:
                writer.writeheader()
            for row in reader:
                if row['booking_id'] in wanted:
                    writer.writerow(row)

# Example usage: copy the pings of three bookings into a small test file
input_file = 'Anon_Pings/anon_pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'test.csv'

# Bookings to keep
booking_ids = [
    '88165aea83997095058b3f6676c1e3bdeedb4802c52afc9c412b1c610713a1ca',
    '3846a90814f7e29b6b0c11717b40afd9fcd86ac7aae41fa9ffe19fbfcc4bfe26',
    '29c5e8211f059fed952cc810e964c523e727221d0bd669001bb75c6ebd85f913',
]

filter_entries(input_file, output_file, booking_ids)


### Test the distance functions:

In [21]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime


def calculate_distance(lat1, lon1, lat2, lon2):
    """Geodesic distance in kilometres between (lat1, lon1) and (lat2, lon2)."""
    first_point = (lat1, lon1)
    second_point = (lat2, lon2)
    distance = geodesic(first_point, second_point)
    return distance.kilometers

# Sample trajectory used to sanity-check the distance function
lats = [30.64392,30.642129,30.64393,30.640667,30.637807,30.634062,30.630342,30.62768,30.624637,30.622056,30.620839,30.62065,30.620698,30.620622,30.620588,30.621499,30.625048,30.625105,30.625109,30.624056,30.623248,30.626844]
lngs = [104.115353,104.113091,104.110404,104.108335,104.106304,104.104013,104.101653,104.100465,104.097907,104.095813,104.091939,104.087057,104.083797,104.080276,104.076107,104.071857,104.072423,104.072982,104.073218,104.076707,104.076795,104.076552]

points = list(zip(lats, lngs))

# Distance of each leg (not cumulative); the first ping has no previous point
dist_gaps = [0]
for (lat_a, lng_a), (lat_b, lng_b) in zip(points, points[1:]):
    dist_gaps.append(calculate_distance(lat_a, lng_a, lat_b, lng_b))

# Two consecutive legs versus the direct distance between their end points
(lat_0, lng_0), (lat_1, lng_1), (lat_2, lng_2) = points[0], points[1], points[2]
result1 = calculate_distance(lat_0, lng_0, lat_1, lng_1)
result2 = calculate_distance(lat_1, lng_1, lat_2, lng_2)
result3 = calculate_distance(lat_0, lng_0, lat_2, lng_2)

for value in (result1, result2, result1 + result2, result3):
    print(value)


0.29400740273246023
0.325896974688779
0.6199043774212392
0.47439900315363936
