## Main: Data Cleaner


### Choose clean non-pooled data only

In [3]:
import pandas as pd
import json

# Select "clean" trips: trajectory entries whose trip id and driver id both
# appear in the pooling file for the chosen day, and whose recorded time is
# within MAX_TIME_DIFF of the pooling engagement time. Each clean entry is
# written to `clean_file_path` (one JSON object per line) with an added
# "time_diff" field.
clean_file_path = "clean_trips.json"
pooling_data = pd.read_csv("careems data/anon_pooling_jan_24_amman.csv")

# Only the pooling rows for the day of interest are used for matching.
filtered_pooling_data = pooling_data[pooling_data['day'] == '2024-01-31']

# Largest allowed |engagement time - trajectory time| for a trip to be "clean".
# NOTE(review): presumably seconds (engagement time is multiplied by 60 below) — confirm units.
MAX_TIME_DIFF = 5

with open("fixed_traj_new.json", "r") as json_traj_file:
    # One JSON object per line; blank lines are ignored instead of crashing the parser.
    new_data = [json.loads(line) for line in json_traj_file if line.strip()]

# Index engagement times by (booking_id, captain_id) so each trajectory entry
# only looks at its own candidate rows instead of scanning the whole frame.
# Lists keep the original row order, so the first acceptable row is the same
# one a full top-to-bottom scan would have found.
pool_times_by_trip_and_driver = {}
for booking_id, captain_id, engagement_time in zip(
    filtered_pooling_data["booking_id"],
    filtered_pooling_data["captain_id"],
    filtered_pooling_data["captain_engagement_time"],
):
    pool_times_by_trip_and_driver.setdefault((booking_id, captain_id), []).append(
        float(engagement_time * 60)
    )

with open(clean_file_path, 'w', encoding='utf-8') as clean_file:
    for entry in new_data:
        entry_time = entry["time"]
        candidate_times = pool_times_by_trip_and_driver.get(
            (entry["trip_id"], entry["driverID"]), ()
        )

        for pool_time in candidate_times:
            time_diff = pool_time - entry_time

            # Good trip: same trip and driver id, and at most MAX_TIME_DIFF apart.
            if abs(time_diff) <= MAX_TIME_DIFF:
                # Copy so the in-memory trajectory entry is left untouched.
                entry_with_time_diff = entry.copy()
                entry_with_time_diff["time_diff"] = time_diff
                json.dump(entry_with_time_diff, clean_file)
                clean_file.write("\n")
                break  # at most one output line per trajectory entry


print("Matching process completed. Check 'clean_trips.json' for results.")

Matching process completed. Check 'clean_trips.json' for results.


### Get unclean trips

In [4]:
import json

def find_unmatched_trips(new_file_path, clean_file_path, output_file_path, pooling_df=None):
    """Write every trajectory trip that is not in the clean file to `output_file_path`.

    Both input files are read as JSON Lines (one JSON object per line; blank
    lines are skipped). A trip is "unmatched" when its ``trip_id`` does not
    occur in the clean file. If the trip's id is present in the pooling data,
    a ``time_diff`` field (engagement time * 60 minus the trip's ``time``) is
    added before writing; otherwise the trip is written unchanged.

    Args:
        new_file_path: Path to the JSON Lines file with all trajectory trips.
        clean_file_path: Path to the JSON Lines file with the clean trips.
        output_file_path: Destination JSON Lines file for unmatched trips.
        pooling_df: DataFrame with ``booking_id`` and
            ``captain_engagement_time`` columns. When omitted, the
            notebook-level ``filtered_pooling_data`` is used, which is what
            the function previously read implicitly.

    Returns:
        None. The result is written to `output_file_path`.
    """
    if pooling_df is None:
        # Backward compatible fallback to the notebook global; passing the
        # frame explicitly avoids depending on cell execution order.
        pooling_df = filtered_pooling_data

    def _read_json_lines(path):
        # Parse one JSON object per non-blank line.
        with open(path, 'r') as handle:
            return [json.loads(line) for line in handle if line.strip()]

    new_trips = _read_json_lines(new_file_path)
    clean_trip_ids = {trip['trip_id'] for trip in _read_json_lines(clean_file_path)}

    # booking_id -> engagement time scaled by 60; a later duplicate booking_id
    # overrides an earlier one, as with a row-by-row dict build.
    captain_engagement_times = {
        booking_id: float(engagement_time * 60)
        for booking_id, engagement_time in zip(
            pooling_df['booking_id'], pooling_df['captain_engagement_time']
        )
    }

    unmatched_trips = []
    for trip in new_trips:
        trip_id = trip['trip_id']
        if trip_id in clean_trip_ids:
            continue
        if trip_id in captain_engagement_times:
            trip['time_diff'] = captain_engagement_times[trip_id] - trip['time']
        # Trips without a pooling row are still reported, just without time_diff.
        unmatched_trips.append(trip)

    with open(output_file_path, 'w') as output_file:
        for trip in unmatched_trips:
            json.dump(trip, output_file)
            output_file.write('\n')


# Paths: all trajectory trips, the clean subset, and the output for the rest.
traj_file_path, clean_file_path, unclean_file_path = (
    'fixed_traj_new.json',
    'clean_trips.json',
    'unclean_trips.json',
)

# Write every trajectory trip that is absent from the clean file.
find_unmatched_trips(
    new_file_path=traj_file_path,
    clean_file_path=clean_file_path,
    output_file_path=unclean_file_path,
)

### Filter Pooling File by Day

In [5]:
import pandas as pd

# Read the pooling records from the source CSV
pooling_data = pd.read_csv("careems data/anon_pooling_jan_24_amman.csv")

# Boolean mask selecting the rows whose 'day' column is '2024-01-31'
is_target_day = pooling_data['day'] == '2024-01-31'
filtered_pooling_data = pooling_data[is_target_day]

# Where the single-day extract is stored
output_filename = "pooling_day.csv"

# Save the extract without the DataFrame index column
filtered_pooling_data.to_csv(output_filename, index=False)

print(f"Filtered data written to {output_filename}")


Filtered data written to pooling_day.csv


### Current workspace: Fill in missing trailing pings when the stationary time matches `wait_time_at_customer`

In [43]:
import json
import pandas as pd
import math


# Pad the end of "unclean" trips with repeated stationary pings.
#
# For each trip that carries a time_diff, count how many trailing pings share
# the final coordinate. If the stationary time plus time_diff agrees with the
# pooling file's wait_time_at_customer (scaled by 60) to within the tolerance,
# the trip is extended with floor(time_diff / PING_INTERVAL_SECONDS) copies of
# the last ping and written to filled_end_pings2.json. Trips that do not meet
# the condition are not written.

# Spacing of the appended pings and the tolerance of the wait-time check.
PING_INTERVAL_SECONDS = 5
WAIT_TIME_TOLERANCE_SECONDS = 5

# Each line of the input file contains one JSON object
with open("unclean_trips.json", 'r', encoding='utf-8') as input_file:
    unclean_trips = [json.loads(line) for line in input_file]

# Load the single-day pooling extract
pooling_day = pd.read_csv("pooling_day.csv")

trips_not_in_pooling = 0
no_time_diff = 0

# booking_id -> wait_time_at_customer, keeping the first row per booking_id
# (the row a filtered frame's .iloc[0] would return). Built once so the frame
# is not rescanned for every trip.
wait_time_by_booking = {}
for booking_id, wait_minutes in zip(
    pooling_day["booking_id"], pooling_day["wait_time_at_customer"]
):
    wait_time_by_booking.setdefault(booking_id, wait_minutes)

with open("filled_end_pings2.json", 'w', encoding='utf-8') as output_file:
    for trip in unclean_trips:
        trip_id = trip.get("trip_id")

        # Trips without a time_diff cannot be checked; count and skip them.
        time_diff = trip.get("time_diff", None)
        if time_diff is None:
            no_time_diff += 1
            continue

        # Trips missing from the pooling file are counted and skipped rather
        # than raising on an empty lookup.
        if trip_id not in wait_time_by_booking:
            trips_not_in_pooling += 1
            continue
        wait_time_to_customer = wait_time_by_booking[trip_id]

        time_gap = trip.get("time_gap")
        dist_gap = trip.get("dist_gap")
        lats = trip.get("lats")
        lngs = trip.get("lngs")
        total_time = trip.get("time")

        # Without coordinates there is no trailing stationary run to inspect.
        if not (lats and lngs):
            continue

        # Number of pings to append
        num_elements = math.floor(time_diff / PING_INTERVAL_SECONDS)

        # Count the trailing pings that sit on the final coordinate (always >= 1).
        last_coord = (lats[-1], lngs[-1])
        count = 0
        for coord in zip(reversed(lats), reversed(lngs)):
            if coord != last_coord:
                break
            count += 1

        # Stationary time at the end of the trip, plus the gap to the
        # engagement time, compared with the recorded wait at the customer.
        x = total_time - time_gap[-count] + time_diff - (wait_time_to_customer * 60)

        is_fillable = (
            count > 1
            and abs(x) <= WAIT_TIME_TOLERANCE_SECONDS
            and time_diff >= 0
        )
        if not is_fillable:
            continue

        for _ in range(num_elements):
            # time_gap is non-empty here: it was indexed successfully above.
            time_gap.append(time_gap[-1] + PING_INTERVAL_SECONDS)

            # Keep the coordinate lists the same length as time_gap even when
            # dist_gap is empty; the vehicle is stationary, so repeat the last ping.
            lats.append(lats[-1])
            lngs.append(lngs[-1])
            if dist_gap:
                dist_gap.append(dist_gap[-1])

        # NOTE(review): trip["time"] is left unchanged after padding — confirm
        # whether it should be updated to the new final time_gap value.
        trip["time_gap"] = time_gap
        trip["dist_gap"] = dist_gap
        trip["lats"] = lats
        trip["lngs"] = lngs

        # Write each padded trip on its own line
        json.dump(trip, output_file)
        output_file.write("\n")



### to delete

In [None]:
import json
import pandas as pd
import math

# NOTE(review): this cell is under the "to delete" heading and has no execution
# count. It treats the whole input as a single trip object and appears to be an
# earlier draft of the trailing-ping filler; several issues are flagged below.

# Load data from unclean_trips.json
# NOTE(review): json.load parses the whole file as one JSON document, whereas
# the other cells in this notebook read/write this file one JSON object per line.
with open('unclean_trips.json', 'r') as f:
    unclean_trips = json.load(f)

# Load data from pooling.csv
pooling_data = pd.read_csv('pooling_day.csv')

# Assuming the pooling.csv has a column named 'wait_time_to_customer'
# NOTE(review): the trailing-ping cell reads 'wait_time_at_customer' from the
# same CSV — verify the column name. Only the first row's value is used here.
wait_time_to_customer = pooling_data['wait_time_to_customer'][0]  # Change index as appropriate

# Extract time_diff from unclean_trips
# (.get requires unclean_trips to be a dict, i.e. a single trip)
time_diff = unclean_trips.get('time_diff', 0)

# Calculate the number of extra elements to be added
extra_elements_count = math.floor(time_diff / 5)

# Get existing lists from unclean_trips
time_gap = unclean_trips.get('time_gap', [])
dist_gap = unclean_trips.get('dist_gap', [])
lats = unclean_trips.get('lats', [])
lngs = unclean_trips.get('lngs', [])

# Find the last changing coordinate in lats and lngs
last_unique_lat, last_unique_lng = None, None

# Iterate from the end to find the last unique coordinate
# NOTE(review): both trackers start as None, so the condition is already true
# for the final element (unless it is None) and the loop stops there; the
# result is simply the last coordinate in the lists.
for i in range(len(lats) - 1, -1, -1):
    if lats[i] != last_unique_lat or lngs[i] != last_unique_lng:
        last_unique_lat, last_unique_lng = lats[i], lngs[i]
        break

# Count the number of elements from this last unique coordinate to the end
# NOTE(review): list.index returns the FIRST position holding this latitude,
# which need not be the start of the trailing run; it raises ValueError when
# lats is empty (last_unique_lat is still None).
last_change_index = lats.index(last_unique_lat)
elements_since_last_change = len(lats) - last_change_index

# Condition check: if this count multiplied by 5 is less than wait_time_to_customer
if (elements_since_last_change * 5) < wait_time_to_customer:
    # Update the time_gap
    last_time = time_gap[-1] if time_gap else 0
    for i in range(extra_elements_count):
        # Offsets are taken from the original last time, so entries step by 5
        new_time = last_time + (5 * (i + 1))
        time_gap.append(new_time)
        dist_gap.append(dist_gap[-1] if dist_gap else 0)  # Append the last element value
        lats.append(lats[-1] if lats else last_unique_lat)  # Append the last lat value
        lngs.append(lngs[-1] if lngs else last_unique_lng)  # Append the last lng value

    # Set the time attribute
    # (overwrites 'time' with the padded duration only, not the previous value plus padding)
    unclean_trips['time'] = 5 * extra_elements_count

    # Save the updated JSON back to file
    # (written to a separate file, updated_unclean_trips.json; the input file is not modified)
    with open('updated_unclean_trips.json', 'w') as f:
        json.dump(unclean_trips, f, indent=4)

    print("Updated unclean_trips.json based on the specified condition.")

else:
    print("Condition not met, no updates made to unclean_trips.json.")


In [2]:
# Scratch cell exploring negative list indexing.
test = [1,2,3,4,5,6,7,8,9,10]
# Index -4 counts from the end of the ten-element list, so this prints 7.
print(test[-4])

# NOTE(review): the two lines below are bare comparisons whose results are
# discarded; `c` is not defined anywhere in the visible notebook and `entry`
# is only defined by another cell. They read like notes on how a positive
# index maps to a negative one (for a 31-element sequence) — presumably not
# meant to be executed; confirm and remove.
entry[30] == entry[-1]
entry[30 - c +1] == entry[-c]


7
