### Converting File Shapes

#### Trial 1

In [9]:
import pandas as pd
from geopy.distance import geodesic

# Input: raw GPS pings, one row per reading. Output: one row per trip.
input_file = 'test.csv'
output_file = 'deeptte.csv'

df = pd.read_csv(input_file)

# The later trials in this notebook treat 'location_read_at' as Unix epoch
# milliseconds (they divide it by 1000 to get seconds). Without an explicit
# unit pandas interprets integers as nanoseconds, which would shrink every
# computed time gap by a factor of 10**6, so the unit is stated here.
df['location_read_at'] = pd.to_datetime(df['location_read_at'], unit='ms')

# One group per trip, keyed by booking_id
grouped = df.groupby('booking_id')

# Collects one output row per trip
trip_data = []

# Function to calculate distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in kilometres between two points.

    Each point is given as a latitude/longitude pair and handed to geopy's
    ``geodesic`` as a ``(lat, lon)`` tuple.
    """
    start = (lat1, lon1)
    end = (lat2, lon2)
    return geodesic(start, end).kilometers

# Function to calculate time difference in seconds
def calculate_time_difference(time1, time2):
    """Return the seconds elapsed from ``time1`` to ``time2``.

    The arguments must support subtraction that yields an object with a
    ``total_seconds()`` method. The result is negative when ``time2`` is
    earlier than ``time1``.
    """
    elapsed = time2 - time1
    return elapsed.total_seconds()

# Build one output row per trip (booking_id)
for booking_id, group in grouped:
    # Pings must be chronological before consecutive points are paired up
    group = group.sort_values(by='location_read_at')

    timestamps = group['location_read_at']
    driver_id = group['driver_id'].iloc[0]
    time_id = timestamps.iloc[0]  # timestamp of the first ping
    lngs = group['longitude'].tolist()
    lats = group['latitude'].tolist()

    # Distance of every leg between consecutive pings (km, as returned by
    # calculate_distance). Computed once and reused for both the trip total
    # and the per-ping gaps; previously each geodesic was evaluated twice.
    leg_dists = [
        calculate_distance(lat_a, lng_a, lat_b, lng_b)
        for lat_a, lng_a, lat_b, lng_b in zip(lats, lngs, lats[1:], lngs[1:])
    ]
    total_dist = sum(leg_dists)

    # Trip duration in seconds, first ping to last ping
    total_time = calculate_time_difference(time_id, timestamps.iloc[-1])

    # time_gaps: seconds elapsed since the first ping, one entry per ping
    # dist_gaps: distance from the previous ping (0 for the first ping)
    time_gaps = [(t - time_id).total_seconds() for t in timestamps]
    dist_gaps = [0] + leg_dists

    trip_data.append([driver_id, time_id, lngs, lats, total_dist, total_time, time_gaps, dist_gaps])

# Create DataFrame from trip data (one row per trip)
output_df = pd.DataFrame(trip_data, columns=['driver_id', 'time_id', 'lngs', 'lats', 'dist', 'time', 'time_gap', 'dist_gap'])

# Write output DataFrame to CSV
output_df.to_csv(output_file, index=False)


#### Trial 2

In [11]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime

# Input: raw GPS pings, one row per reading. Output: one row per trip.
input_file = 'test.csv'
output_file = 'deeptte.csv'

df = pd.read_csv(input_file)

# 'location_read_at' is Unix epoch milliseconds. Parsing it directly with
# unit='ms' avoids the float division by 1000, which cannot represent the
# values exactly and leaves sub-microsecond noise in the timestamps.
df['location_read_at'] = pd.to_datetime(df['location_read_at'], unit='ms')

# One group per trip, keyed by booking_id
grouped = df.groupby('booking_id')

# Collects one output row per trip
trip_data = []

# Function to calculate distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """Geodesic distance, in kilometres, from (lat1, lon1) to (lat2, lon2)."""
    origin, destination = (lat1, lon1), (lat2, lon2)
    separation = geodesic(origin, destination)
    return separation.kilometers

# Function to calculate time difference in seconds
def calculate_time_difference(time1, time2):
    """Return ``time2 - time1`` expressed in seconds.

    Relies on the subtraction result exposing ``total_seconds()``; the value
    is negative if ``time2`` comes before ``time1``.
    """
    delta = time2 - time1
    seconds = delta.total_seconds()
    return seconds

# Build one output row per trip (booking_id)
for booking_id, group in grouped:
    # Pings must be chronological before consecutive points are paired up
    group = group.sort_values(by='location_read_at')

    timestamps = group['location_read_at']
    first_ping = timestamps.iloc[0]

    driver_id = group['driver_id'].iloc[0]
    # Trip start rendered as a plain 'YYYY-MM-DD HH:MM:SS' string
    time_id = first_ping.strftime('%Y-%m-%d %H:%M:%S')
    lngs = group['longitude'].tolist()
    lats = group['latitude'].tolist()

    # Distance of every leg between consecutive pings (km, as returned by
    # calculate_distance). Computed once and reused for both the trip total
    # and the per-ping gaps; previously each geodesic was evaluated twice.
    leg_dists = [
        calculate_distance(lat_a, lng_a, lat_b, lng_b)
        for lat_a, lng_a, lat_b, lng_b in zip(lats, lngs, lats[1:], lngs[1:])
    ]
    total_dist = sum(leg_dists)

    # Trip duration in seconds, first ping to last ping
    total_time = calculate_time_difference(first_ping, timestamps.iloc[-1])

    # time_gaps: seconds elapsed since the first ping, one entry per ping
    # dist_gaps: distance from the previous ping (0 for the first ping)
    time_gaps = [(t - first_ping).total_seconds() for t in timestamps]
    dist_gaps = [0] + leg_dists

    trip_data.append([driver_id, time_id, lngs, lats, total_dist, total_time, time_gaps, dist_gaps])

# Create DataFrame from trip data (one row per trip)
output_df = pd.DataFrame(trip_data, columns=['driver_id', 'time_id', 'lngs', 'lats', 'dist', 'time', 'time_gap', 'dist_gap'])

# Write output DataFrame to CSV
output_df.to_csv(output_file, index=False)


#### Trial 3

In [15]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime

# Input: raw GPS pings, one row per reading. Output: one row per trip.
input_file = 'test.csv'
output_file = 'deeptte3.csv'

df = pd.read_csv(input_file)

# 'location_read_at' is Unix epoch milliseconds. Parsing it directly with
# unit='ms' avoids the float division by 1000, which cannot represent the
# values exactly and leaves sub-microsecond noise in the timestamps.
df['location_read_at'] = pd.to_datetime(df['location_read_at'], unit='ms')

# Function to calculate distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in metres between two (lat, lon) points.

    Unlike the other trials in this notebook, this variant reports metres
    rather than kilometres.
    """
    point_a = (lat1, lon1)
    point_b = (lat2, lon2)
    return geodesic(point_a, point_b).meters

# Function to calculate time difference in seconds
def calculate_time_difference(time1, time2):
    """Seconds from ``time1`` to ``time2`` (negative if ``time2`` is earlier).

    The difference ``time2 - time1`` must provide ``total_seconds()``.
    """
    span = time2 - time1
    return span.total_seconds()

# Initialize list to store trip data (one row per trip)
trip_data = []

# Group data by booking_id to process trips separately
grouped = df.groupby('booking_id')

# Build one output row per trip
for booking_id, group in grouped:
    # Pings must be chronological before consecutive points are paired up
    group = group.sort_values(by='location_read_at')

    timestamps = group['location_read_at']
    driver_id = group['driver_id'].iloc[0]
    time_id = timestamps.iloc[0]  # timestamp of the first ping
    lngs = group['longitude'].tolist()
    lats = group['latitude'].tolist()

    # Path length: sum of the distances between consecutive pings, in the
    # unit calculate_distance returns (metres in this trial).
    total_dist = sum(
        calculate_distance(lat_a, lng_a, lat_b, lng_b)
        for lat_a, lng_a, lat_b, lng_b in zip(lats, lngs, lats[1:], lngs[1:])
    )

    # Trip duration in seconds, first ping to last ping
    total_time = calculate_time_difference(time_id, timestamps.iloc[-1])

    # Seconds elapsed since the first ping, one entry per ping
    time_gaps = [(t - time_id).total_seconds() for t in timestamps]

    # Straight-line distance from the first ping to every ping, including the
    # last one. The previous range(len(lats) - 1) skipped the final ping, so
    # dist_gap was one element shorter than lngs/lats/time_gap.
    # NOTE(review): this is displacement from the start, not distance along
    # the path, so its last value generally differs from 'dist'; Trial 4
    # accumulates along the path instead.
    dist_gaps = [calculate_distance(lats[0], lngs[0], lat, lng) for lat, lng in zip(lats, lngs)]

    trip_data.append([booking_id, driver_id, time_id, lngs, lats, total_dist, total_time, time_gaps, dist_gaps])

# Create DataFrame from trip data
output_df = pd.DataFrame(trip_data, columns=['booking_id', 'driver_id', 'time_id', 'lngs', 'lats', 'dist', 'time', 'time_gap', 'dist_gap'])

# Write output DataFrame to CSV
output_df.to_csv(output_file, index=False)


#### Trial 4

In [7]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime

# Input: one day of raw GPS pings. Output: one trajectory row per trip.
input_file = 'Anon_Pings/anon_pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'trajectories-01-31.csv'

df = pd.read_csv(input_file)

# 'location_read_at' is Unix epoch milliseconds. Parsing it directly with
# unit='ms' avoids the float division by 1000, which cannot represent the
# values exactly and leaves sub-microsecond noise in the timestamps.
df['location_read_at'] = pd.to_datetime(df['location_read_at'], unit='ms')

# Function to calculate distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """Kilometres along the geodesic joining (lat1, lon1) and (lat2, lon2)."""
    endpoints = ((lat1, lon1), (lat2, lon2))
    return geodesic(*endpoints).kilometers

# Function to calculate time difference in seconds
def calculate_time_difference(time1, time2):
    """Return how many seconds separate ``time1`` from the later ``time2``.

    Computed as ``(time2 - time1).total_seconds()``, so the sign is negative
    when the arguments are passed in reverse chronological order.
    """
    gap = time2 - time1
    return gap.total_seconds()

# One output row per trip is collected here
trip_data = []

# Each booking_id identifies one trip
grouped = df.groupby('booking_id')

for booking_id, group in grouped:
    # Chronological order is required before pairing consecutive pings
    group = group.sort_values(by='location_read_at')
    stamps = group['location_read_at']

    driver_id = group['driver_id'].iloc[0]
    time_id = stamps.iloc[0]  # timestamp of the first ping
    lngs = group['longitude'].tolist()
    lats = group['latitude'].tolist()

    # dist_gaps[i] is the distance travelled along the path from the first
    # ping up to ping i, so it starts at 0 and grows by one leg per ping.
    dist_gaps = [0]
    points = list(zip(lats, lngs))
    for (lat_a, lng_a), (lat_b, lng_b) in zip(points, points[1:]):
        leg = calculate_distance(lat_a, lng_a, lat_b, lng_b)
        dist_gaps.append(dist_gaps[-1] + leg)

    # The last cumulative value is the length of the whole trip
    total_dist = dist_gaps[-1]

    # Trip duration in seconds, first ping to last ping
    total_time = calculate_time_difference(time_id, stamps.iloc[-1])

    # Seconds elapsed since the first ping, one entry per ping
    time_gaps = [(stamp - time_id).total_seconds() for stamp in stamps]

    trip_data.append([booking_id, driver_id, time_id, lngs, lats, total_dist, total_time, time_gaps, dist_gaps])

# Assemble the per-trip table and save it
output_df = pd.DataFrame(
    trip_data,
    columns=['booking_id', 'driver_id', 'time_id', 'lngs', 'lats', 'dist', 'time', 'time_gap', 'dist_gap'],
)
output_df.to_csv(output_file, index=False)


KeyboardInterrupt: 

### GPU File Processing

In this modified script:

1. We import cudf and cupy instead of pandas and numpy, respectively.
2. We use cudf.read_csv to read the CSV file into a cuDF DataFrame.
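3. We keep intermediate results (such as the cumulative distances) in cupy arrays where applicable; note that the geodesic distance itself is still computed by geopy on the CPU.
4. We convert cuDF Series to arrays using the to_array() method when necessary (recent cuDF releases replace it with to_numpy() for host arrays or .values for cupy arrays).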
5. We utilize GPU-accelerated operations provided by cuDF and cupy for efficient data processing.

> Please make sure to review the cuDF documentation for additional details and functionalities: https://docs.rapids.ai/api/cudf/stable/

In [None]:
# NOTE(review): the RAPIDS install guide distributes cuDF as CUDA-specific
# wheels (e.g. `cudf-cu12` from pypi.nvidia.com) or via conda; confirm that a
# plain `cudf` install resolves in your environment.
pip install cudf

In [None]:
import cudf
import cupy as cp
from datetime import datetime
from geopy.distance import geodesic

# Input: one day of raw GPS pings. Output: one trajectory row per trip.
input_file = 'Anon_Pings/anon_pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'trajectories-01-31.csv'

# Read CSV file into a cuDF DataFrame
df = cudf.read_csv(input_file)

# 'location_read_at' is Unix epoch milliseconds. Parsing it directly with
# unit='ms' avoids the float division by 1000, which cannot represent the
# values exactly and leaves sub-microsecond noise in the timestamps.
df['location_read_at'] = cudf.to_datetime(df['location_read_at'], unit='ms')

# Function to calculate distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in kilometres between two points.

    Coordinates are latitude/longitude in decimal degrees, which is what
    geopy's ``geodesic`` expects. They are therefore passed through without
    conversion: converting them to radians first (as this function used to)
    makes geopy read the radian values as degrees and produces distances
    that are far too small.

    Each coordinate is coerced with ``float()`` so that NumPy/CuPy scalar
    elements are handed to geopy as plain Python floats. The distance itself
    is computed by geopy, not on the GPU.
    """
    start = (float(lat1), float(lon1))
    end = (float(lat2), float(lon2))
    return geodesic(start, end).kilometers

# Function to calculate time difference in seconds
def calculate_time_difference(time1, time2):
    """Return ``time2 - time1`` in seconds.

    The subtraction result must offer ``total_seconds()``; the value is
    negative when ``time2`` precedes ``time1``.
    """
    difference = time2 - time1
    return difference.total_seconds()

# Initialize list to store trip data (one row per trip)
trip_data = []

# Group data by booking_id to process trips separately
grouped = df.groupby('booking_id')

# Iterate over each trip
for booking_id, group in grouped:
    # Pings must be chronological before consecutive points are paired up
    group = group.sort_values(by='location_read_at')

    # Extract trip information
    driver_id = group['driver_id'].iloc[0]
    time_id = group['location_read_at'].iloc[0]
    # Series.to_array() has been removed from cuDF; to_numpy() is the
    # documented replacement and likewise yields a host (NumPy) array.
    lngs = group['longitude'].to_numpy()
    lats = group['latitude'].to_numpy()

    # dist_gaps[i] is the distance travelled along the path from the first
    # ping up to ping i; entry 0 stays 0.
    dist_gaps = cp.zeros_like(lats)
    prev_lat = lats[0]
    prev_lng = lngs[0]
    cum_dist = 0

    # Walk the pings in order, adding one leg at a time
    for i in range(1, len(lats)):
        lat = lats[i]
        lng = lngs[i]
        dist = calculate_distance(prev_lat, prev_lng, lat, lng)
        cum_dist += dist
        dist_gaps[i] = cum_dist
        prev_lat = lat
        prev_lng = lng

    # The final cumulative value is the length of the whole trip
    total_dist = cum_dist

    # Trip duration in seconds, first ping to last ping
    # NOTE(review): relies on cuDF returning a scalar whose difference has
    # total_seconds(); confirm against the installed cuDF version.
    total_time = calculate_time_difference(group['location_read_at'].iloc[0], group['location_read_at'].iloc[-1])

    # Seconds elapsed since the first ping, one entry per ping
    # NOTE(review): confirm the installed cuDF exposes .dt.total_seconds()
    # for timedelta columns.
    time_gaps = (group['location_read_at'] - time_id).dt.total_seconds().to_numpy()

    # Append trip data to list
    trip_data.append([booking_id, driver_id, time_id, lngs, lats, total_dist, total_time, time_gaps, dist_gaps])

# Create cuDF DataFrame from trip data
# NOTE(review): each row carries whole arrays in several cells; verify that
# cudf.DataFrame accepts this layout (otherwise convert the arrays to lists
# or assemble the output with pandas).
output_df = cudf.DataFrame(trip_data, columns=['booking_id', 'driver_id', 'time_id', 'lngs', 'lats', 'dist', 'time', 'time_gap', 'dist_gap'])

# Write output cuDF DataFrame to CSV
output_df.to_csv(output_file, index=False)


### Check the distance travelled from another file

In [5]:
import csv

def filter_csv_by_booking_id(csv_file, booking_id):
    """Print the ``distance_travelled_km`` recorded for ``booking_id``.

    Scans ``csv_file`` (which must have ``booking_id`` and
    ``distance_travelled_km`` columns) and stops at the first row whose
    booking_id equals the requested one. Prints that row's distance, or a
    "not found" message when no row matches. Nothing is returned.
    """
    with open(csv_file, 'r', newline='') as file:
        # Lazily yields the distance of each matching row; only the first
        # match is ever consumed.
        matches = (
            row['distance_travelled_km']
            for row in csv.DictReader(file)
            if row['booking_id'] == booking_id
        )
        distance_travelled_km = next(matches, None)

    if distance_travelled_km is None:
        print(f"No distance travelled found for booking_id {booking_id}")
        return

    print(f"Distance travelled for booking_id {booking_id}: {distance_travelled_km} km")

# Example usage: look up the recorded distance of one trip in the pooling
# summary file.
input_csv_file = 'Pooling/anon_pooling_jan_24_amman.csv'
# booking_id of the trip to look up; replace with the one you want to check
desired_booking_id = '29c5e8211f059fed952cc810e964c523e727221d0bd669001bb75c6ebd85f913'
filter_csv_by_booking_id(csv_file=input_csv_file, booking_id=desired_booking_id)


Distance travelled for booking_id 29c5e8211f059fed952cc810e964c523e727221d0bd669001bb75c6ebd85f913: 11.79921054840088 km


### Unix to DateTime

In [None]:
import datetime

# Unix timestamp in seconds since the epoch
unix_timestamp = 1609459200  # Replace with your actual Unix timestamp

# fromtimestamp() without a tz argument gives naive local time of this machine
dt = datetime.datetime.fromtimestamp(unix_timestamp)

# Show it as 'YYYY-MM-DD HH:MM:SS'
print(f"{dt:%Y-%m-%d %H:%M:%S}")

### Creating a Test CSV file with n trips

In [7]:
import csv
import os.path

def filter_entries(input_file, output_file, booking_ids):
    """Append the rows of ``input_file`` for the given trips to ``output_file``.

    Args:
        input_file: Path of the source CSV; it must have a ``booking_id``
            column.
        output_file: Path of the CSV to append to. It is created, with the
            input's header row, if it does not exist yet.
        booking_ids: The booking_id values to keep. A single string is
            treated as one id.

    Rows are written in input order with the input's column layout. Calling
    the function again with the same ``output_file`` appends further rows
    without repeating the header. (Previously the second call failed with
    ``ValueError`` because the writer was left with an empty field list.)
    """
    # Set membership keeps the per-row check O(1) however many ids are wanted
    wanted = {booking_ids} if isinstance(booking_ids, str) else set(booking_ids)

    # Decide before opening in append mode, since opening creates the file
    needs_header = not os.path.isfile(output_file)

    # The input is opened first so that a missing input does not leave an
    # empty, header-less output file behind.
    with open(input_file, 'r', newline='') as src, \
            open(output_file, 'a', newline='') as dst:
        reader = csv.DictReader(src)
        # Always take the columns from the input, whether or not a header
        # needs to be written this time.
        writer = csv.DictWriter(dst, fieldnames=reader.fieldnames or [])
        if needs_header and reader.fieldnames:
            writer.writeheader()

        for row in reader:
            if row['booking_id'] in wanted:
                writer.writerow(row)

# Example usage: copy the pings of three trips from the full-day file into a
# small test file.
input_file = 'Anon_Pings/anon_pooling_pings_jan_24_amman_2024-01-31.csv'
output_file = 'test.csv'
booking_ids = [
    '88165aea83997095058b3f6676c1e3bdeedb4802c52afc9c412b1c610713a1ca',
    '3846a90814f7e29b6b0c11717b40afd9fcd86ac7aae41fa9ffe19fbfcc4bfe26',
    '29c5e8211f059fed952cc810e964c523e727221d0bd669001bb75c6ebd85f913',
]

filter_entries(input_file=input_file, output_file=output_file, booking_ids=booking_ids)


### Test the distance functions:

In [21]:
import pandas as pd
from geopy.distance import geodesic
from datetime import datetime


# Function to calculate distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in kilometres between two (lat, lon) points."""
    first_point = (lat1, lon1)
    second_point = (lat2, lon2)
    return geodesic(first_point, second_point).kilometers

# Sample trajectory: latitudes and longitudes of consecutive pings
lats = [30.64392,30.642129,30.64393,30.640667,30.637807,30.634062,30.630342,30.62768,30.624637,30.622056,30.620839,30.62065,30.620698,30.620622,30.620588,30.621499,30.625048,30.625105,30.625109,30.624056,30.623248,30.626844]
lngs = [104.115353,104.113091,104.110404,104.108335,104.106304,104.104013,104.101653,104.100465,104.097907,104.095813,104.091939,104.087057,104.083797,104.080276,104.076107,104.071857,104.072423,104.072982,104.073218,104.076707,104.076795,104.076552]

# Distance from each ping to the next, with a leading 0 for the first ping
points = list(zip(lats, lngs))
dist_gaps = [0]
for (lat_a, lng_a), (lat_b, lng_b) in zip(points, points[1:]):
    dist_gaps.append(calculate_distance(lat_a, lng_a, lat_b, lng_b))

# print(dist_gaps)

# Compare going through the second ping (two legs) with the direct distance
# between the first and third pings.
p0, p1, p2 = points[:3]
result1 = calculate_distance(*p0, *p1)
result2 = calculate_distance(*p1, *p2)
result3 = calculate_distance(*p0, *p2)
for value in (result1, result2, result1+result2, result3):
    print(value)


0.29400740273246023
0.325896974688779
0.6199043774212392
0.47439900315363936
