In [1]:
import pandas as pd
import json
import numpy as np
from geopy.distance import geodesic

# Load data from the JSON Lines file (one JSON document per line).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding, and blank lines (e.g. a trailing newline at the end
# of the file) are skipped because json.loads raises on an empty document.
with open('clean_trips.json', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Convert the list of parsed records to a DataFrame
df = pd.DataFrame(data)

# Function to calculate cumulative distance
def calculate_cumulative_distance(lats, lngs):
    """Return the running geodesic distance, in kilometers, along a path.

    Args:
        lats: Sequence of latitudes, one per point, indexable by position.
        lngs: Sequence of longitudes, positionally aligned with ``lats``.

    Returns:
        A list with one entry per point in ``lats``. Entry ``i`` is the sum of
        the geodesic distances between consecutive points from the first
        point up to point ``i``, so the first entry is always ``0.0``.
        An empty path yields an empty list.
    """
    # Without this guard an empty path would return [0.0], i.e. one distance
    # for zero points, which breaks the one-entry-per-point alignment.
    if len(lats) == 0:
        return []

    cumulative_distance = [0.0]
    for i in range(1, len(lats)):
        start = (lats[i-1], lngs[i-1])
        end = (lats[i], lngs[i])
        # Length of the segment between the previous point and this one.
        dist = geodesic(start, end).kilometers
        cumulative_distance.append(cumulative_distance[-1] + dist)
    return cumulative_distance

# Function to check for zero time differences
def has_zero_time_diff(time_gap):
    """Tell whether any two neighbouring entries of ``time_gap`` are equal.

    Returns a NumPy boolean that is true when at least one first-order
    difference between consecutive values is exactly zero.
    """
    consecutive_equal = np.diff(time_gap) == 0
    return consecutive_equal.any()

# Flag every trip whose time_gap sequence has two equal consecutive values
df['has_zero_time_diff'] = df['time_gap'].apply(has_zero_time_diff)

# Keep only the flagged trips
trips_with_zero_time_diff = df.loc[df['has_zero_time_diff'].eq(True)]

# Persist the flagged trips as JSON Lines for further investigation
trips_with_zero_time_diff.to_json(
    'trips_with_zero_time_diff.json',
    orient='records',
    lines=True,
)

# Report how many trips were flagged
print(f'Number of trips with zero time differences: {len(trips_with_zero_time_diff)}')

# Optional: show the id and time_gap columns of the flagged trips
print(trips_with_zero_time_diff[['trip_id', 'time_gap']])


Number of trips with zero time differences: 5440
                                                trip_id  \
0     0012cf835ee80e59fefbe618282b2edc082940ddba6a46...   
1     0016d7b5510107039e65db431a77047ccd381c975ac910...   
2     00173c2c51ac526bcabae2e29f42cfbdcd5f4868f4bb0a...   
3     001f4a8d66681721bead8b8d279e99020314facd7d79de...   
4     002c53591ec14d4303a02f331e9422ae2c26090f1f0146...   
...                                                 ...   
6390  ffccf54189787f2ec5b5b1be358f491bca1f34c85c7a47...   
6391  ffcd2786b8d9eacd47e32fa44ddc66828a1e0a0ce05660...   
6392  ffd5db961599ffc7a421ffe838c8b92803419ebad3e7b6...   
6394  fff3fc11ad8c2bdc86d5ceafbc8d5952b2e325d1561cf0...   
6395  fff663197bf574326959a718cc055b7261f630aefd69b8...   

                                               time_gap  
0     [0.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0, 40.0...  
1     [0.0, 5.0, 10.0, 242.0, 282.0, 292.0, 292.0, 2...  
2     [0.0, 5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0,...  
3     [0.0

In [2]:
import pandas as pd
import json
import numpy as np
from geopy.distance import geodesic

# Load data from the JSON Lines file (one JSON document per line).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding, and blank lines (e.g. a trailing newline at the end
# of the file) are skipped because json.loads raises on an empty document.
with open('clean_trips.json', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Convert the list of parsed records to a DataFrame
df = pd.DataFrame(data)

# Function to calculate cumulative distance
def calculate_cumulative_distance(lats, lngs):
    """Return the running geodesic distance, in kilometers, along a path.

    Args:
        lats: Sequence of latitudes, one per point, indexable by position.
        lngs: Sequence of longitudes, positionally aligned with ``lats``.

    Returns:
        A list with one entry per point in ``lats``. Entry ``i`` is the sum of
        the geodesic distances between consecutive points from the first
        point up to point ``i``, so the first entry is always ``0.0``.
        An empty path yields an empty list.
    """
    # Without this guard an empty path would return [0.0], i.e. one distance
    # for zero points, which breaks the one-entry-per-point alignment.
    if len(lats) == 0:
        return []

    cumulative_distance = [0.0]
    for i in range(1, len(lats)):
        start = (lats[i-1], lngs[i-1])
        end = (lats[i], lngs[i])
        # Length of the segment between the previous point and this one.
        dist = geodesic(start, end).kilometers
        cumulative_distance.append(cumulative_distance[-1] + dist)
    return cumulative_distance

# Function to find indices with zero time differences
def find_zero_time_diff_indices(time_gap):
    """Locate positions where consecutive ``time_gap`` values repeat.

    For a one-dimensional sequence, returns a plain list of integer indices
    ``i`` such that entry ``i + 1`` minus entry ``i`` is exactly zero.
    """
    repeated = np.diff(time_gap) == 0
    # nonzero() yields one index array per axis; the first one is returned.
    return repeated.nonzero()[0].tolist()

# Record, for each trip, where consecutive time_gap values are equal
df['zero_time_diff_indices'] = df['time_gap'].apply(find_zero_time_diff_indices)

# Keep only the trips that have at least one such position
trips_with_zero_time_diff = df.loc[df['zero_time_diff_indices'].apply(len).gt(0)]

# Persist the selected trips as JSON Lines for further investigation
trips_with_zero_time_diff.to_json(
    'trips_with_zero_time_diff_indices.json',
    orient='records',
    lines=True,
)

# Report how many trips were selected
print(f'Number of trips with zero time differences: {len(trips_with_zero_time_diff)}')

# Optional: show the id and index-list columns of the selected trips
print(trips_with_zero_time_diff[['trip_id', 'zero_time_diff_indices']])


Number of trips with zero time differences: 5440
                                                trip_id  \
0     0012cf835ee80e59fefbe618282b2edc082940ddba6a46...   
1     0016d7b5510107039e65db431a77047ccd381c975ac910...   
2     00173c2c51ac526bcabae2e29f42cfbdcd5f4868f4bb0a...   
3     001f4a8d66681721bead8b8d279e99020314facd7d79de...   
4     002c53591ec14d4303a02f331e9422ae2c26090f1f0146...   
...                                                 ...   
6390  ffccf54189787f2ec5b5b1be358f491bca1f34c85c7a47...   
6391  ffcd2786b8d9eacd47e32fa44ddc66828a1e0a0ce05660...   
6392  ffd5db961599ffc7a421ffe838c8b92803419ebad3e7b6...   
6394  fff3fc11ad8c2bdc86d5ceafbc8d5952b2e325d1561cf0...   
6395  fff663197bf574326959a718cc055b7261f630aefd69b8...   

                                 zero_time_diff_indices  
0                                    [58, 64, 104, 199]  
1                                                   [5]  
2                                            [109, 249]  
3     [88,

In [3]:
import pandas as pd
import json
import numpy as np
from geopy.distance import geodesic

# Load data from the JSON Lines file (one JSON document per line).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding, and blank lines (e.g. a trailing newline at the end
# of the file) are skipped because json.loads raises on an empty document.
with open('clean_trips.json', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Convert the list of parsed records to a DataFrame
df = pd.DataFrame(data)

# Function to calculate cumulative distance
def calculate_cumulative_distance(lats, lngs):
    """Return the running geodesic distance, in kilometers, along a path.

    Args:
        lats: Sequence of latitudes, one per point, indexable by position.
        lngs: Sequence of longitudes, positionally aligned with ``lats``.

    Returns:
        A list with one entry per point in ``lats``. Entry ``i`` is the sum of
        the geodesic distances between consecutive points from the first
        point up to point ``i``, so the first entry is always ``0.0``.
        An empty path yields an empty list.
    """
    # Without this guard an empty path would return [0.0], i.e. one distance
    # for zero points, which breaks the one-entry-per-point alignment.
    if len(lats) == 0:
        return []

    cumulative_distance = [0.0]
    for i in range(1, len(lats)):
        start = (lats[i-1], lngs[i-1])
        end = (lats[i], lngs[i])
        # Length of the segment between the previous point and this one.
        dist = geodesic(start, end).kilometers
        cumulative_distance.append(cumulative_distance[-1] + dist)
    return cumulative_distance

# Function to find indices with zero time differences
def find_zero_time_diff_indices(time_gap):
    """Locate positions where consecutive ``time_gap`` values repeat.

    For a one-dimensional sequence, returns a plain list of integer indices
    ``i`` such that entry ``i + 1`` minus entry ``i`` is exactly zero.
    """
    repeated = np.diff(time_gap) == 0
    # nonzero() yields one index array per axis; the first one is returned.
    return repeated.nonzero()[0].tolist()

# Record, for each trip, where consecutive time_gap values are equal
df['zero_time_diff_indices'] = df['time_gap'].apply(find_zero_time_diff_indices)

# Keep only the trips that have at least one such position
trips_with_zero_time_diff = df.loc[df['zero_time_diff_indices'].apply(len).gt(0)]

# Put trip_id and zero_time_diff_indices first, then every other column of df
# in its existing order
columns = ['trip_id', 'zero_time_diff_indices']
columns.extend(
    name for name in df.columns
    if name not in ('trip_id', 'zero_time_diff_indices')
)
trips_with_zero_time_diff = trips_with_zero_time_diff[columns]

# Persist the selected trips as JSON Lines for further investigation
trips_with_zero_time_diff.to_json(
    'trips_with_zero_time_diff.json',
    orient='records',
    lines=True,
)

# Report how many trips were selected
print(f'Number of trips with zero time differences: {len(trips_with_zero_time_diff)}')

# Optional: show the id and index-list columns of the selected trips
print(trips_with_zero_time_diff[['trip_id', 'zero_time_diff_indices']])


Number of trips with zero time differences: 5440
                                                trip_id  \
0     0012cf835ee80e59fefbe618282b2edc082940ddba6a46...   
1     0016d7b5510107039e65db431a77047ccd381c975ac910...   
2     00173c2c51ac526bcabae2e29f42cfbdcd5f4868f4bb0a...   
3     001f4a8d66681721bead8b8d279e99020314facd7d79de...   
4     002c53591ec14d4303a02f331e9422ae2c26090f1f0146...   
...                                                 ...   
6390  ffccf54189787f2ec5b5b1be358f491bca1f34c85c7a47...   
6391  ffcd2786b8d9eacd47e32fa44ddc66828a1e0a0ce05660...   
6392  ffd5db961599ffc7a421ffe838c8b92803419ebad3e7b6...   
6394  fff3fc11ad8c2bdc86d5ceafbc8d5952b2e325d1561cf0...   
6395  fff663197bf574326959a718cc055b7261f630aefd69b8...   

                                 zero_time_diff_indices  
0                                    [58, 64, 104, 199]  
1                                                   [5]  
2                                            [109, 249]  
3     [88,