In [1]:
import json

def extract_column_headers(file_path):
    """Collect every top-level key used in a JSON Lines file.

    The file is read line by line; each non-blank line is parsed as one JSON
    object and its keys are added to the result.

    Args:
        file_path: Path to a file holding one JSON object per line.

    Returns:
        An alphabetically sorted list of the distinct keys seen across all
        records. An empty (or all-blank) file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        AttributeError: If a line parses to something other than an object
            (e.g. a list or number), since it has no ``keys()``.
    """
    headers = set()

    # JSON is UTF-8 by convention; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            record = line.strip()
            if not record:
                # Blank lines (typically a trailing newline at end of file)
                # are not records; json.loads('') would raise on them.
                continue
            headers.update(json.loads(record).keys())

    return sorted(headers)

def print_column_headers(file_path):
    """Print the column headers found in ``file_path`` to stdout.

    Obtains the header list from ``extract_column_headers`` and writes two
    lines: the title ``Column Headers:`` followed by the header names joined
    with ``", "``.

    Args:
        file_path: Path passed through unchanged to ``extract_column_headers``.
    """
    column_names = extract_column_headers(file_path)
    # One print call with a newline separator emits the same two lines.
    print("Column Headers:", ", ".join(column_names), sep="\n")

if __name__ == "__main__":
    # Path to the input file whose keys are listed. Each line of the file must
    # hold one JSON object, because extract_column_headers parses it line by line.
    json_file_path = 'clean_trips.json'
    print_column_headers(json_file_path)


Column Headers:
dateID, dist, dist_gap, driverID, lats, lngs, time, timeID, time_diff, time_gap, trip_id, weekID


In [None]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle

# NOTE(review): `df` is not created anywhere in the visible cells, so this cell
# raises NameError on a fresh kernel. Load it in an earlier cell first, e.g.
# `df = pd.read_json('clean_trips.json', lines=True)` if the file is JSON Lines
# as the header-listing cell assumes -- TODO confirm.

# Decompose dateID into calendar parts. Parse the column once and reuse it
# instead of re-parsing for every derived feature.
# NOTE(review): assumes dateID holds values pd.to_datetime reads as real dates
# (not e.g. an integer day index) -- TODO confirm against the data.
trip_date = pd.to_datetime(df['dateID'])
df['year'] = trip_date.dt.year
df['month'] = trip_date.dt.month
df['day'] = trip_date.dt.day
df['day_of_week'] = trip_date.dt.dayofweek
# dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend.
df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')

# Speed from dist and time_gap. A time_gap of 0 would otherwise produce
# +/-inf, which poisons later means and models; treat those rows as missing.
# NOTE(review): assumes dist and time_gap are numeric scalars per row -- confirm.
df['speed'] = df['dist'] / df['time_gap'].replace(0, np.nan)

# Extract hour from time
# NOTE(review): assumes `time` is a timestamp rather than a duration -- confirm.
df['hour'] = pd.to_datetime(df['time']).dt.hour

# Cyclical encoding: map the hour onto a circle so that 23 and 0 are adjacent.
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Geospatial feature example: great-circle distance (km) from a fixed point
# given as (latitude, longitude).
# NOTE(review): assumes lats/lngs each hold a single coordinate per row, not a
# sequence of points -- TODO confirm.
fixed_point = (40.7128, -74.0060)  # Example: New York City coordinates
df['dist_to_nyc'] = df.apply(
    lambda row: great_circle((row['lats'], row['lngs']), fixed_point).kilometers,
    axis=1,
)

# Driver aggregation example: per-driver means, named explicitly so the output
# column names do not depend on the order of the aggregation spec.
driver_agg = df.groupby('driverID', as_index=False).agg(
    avg_dist_per_driver=('dist', 'mean'),
    avg_time_gap_per_driver=('time_gap', 'mean'),
)

# Drop aggregates left over from a previous run of this cell; otherwise the
# merge would emit *_x / *_y duplicates instead of refreshing the columns.
driver_feature_cols = [col for col in driver_agg.columns if col != 'driverID']
df = df.drop(columns=driver_feature_cols, errors='ignore')
df = df.merge(driver_agg, on='driverID', how='left')
