Please note that the feature engineering in this notebook is weak and yielded poor results. However, it still gives an idea of how the code can run using parallel tasks and GPU training, although this has not been tested.

In [None]:
!pip install lightgbm

In [None]:
import pandas as pd
import json
import numpy as np
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import lightgbm as lgb
import time
from joblib import Parallel, delayed

# Start timing the script
start_time = time.time()

# Load data from the JSON file. Each non-empty line is parsed as its own
# JSON document (JSON Lines layout).
# - The encoding is pinned to UTF-8 so decoding does not depend on the
#   platform's default locale encoding.
# - Blank / whitespace-only lines are skipped, because json.loads raises
#   JSONDecodeError on an empty document (e.g. a trailing empty line).
with open('clean_trips.json', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Convert to DataFrame
df = pd.DataFrame(data)

# Running geodesic distance along a sequence of GPS points
def calculate_cumulative_distance(lats, lngs):
    """Return the running path length, in kilometres, at every point.

    ``lats`` and ``lngs`` are indexable sequences read position by
    position; point ``i`` is ``(lats[i], lngs[i])``. The result always
    starts with ``0.0`` and gains one entry for each further index of
    ``lats``, each entry being the previous total plus the geodesic
    distance from the preceding point.
    """
    running_total = 0.0
    totals = [running_total]
    for idx in range(1, len(lats)):
        previous_point = (lats[idx - 1], lngs[idx - 1])
        current_point = (lats[idx], lngs[idx])
        running_total = running_total + geodesic(previous_point, current_point).kilometers
        totals.append(running_total)
    return totals

# Per-segment speed derived from cumulative distance and elapsed time
def calculate_segmented_speeds(cumulative_distance, time_gap):
    """Return the speed of each segment, aligned with the input points.

    Entry ``i`` (for ``i >= 1``) is the distance covered between points
    ``i - 1`` and ``i`` divided by the elapsed time between them, with the
    time difference divided by 3600 (seconds to hours) so the result is in
    distance units per hour. A segment whose time difference is zero gets
    a speed of ``0``. Entry ``0`` is always ``0.0`` because the first point
    has no preceding segment.
    """
    # Seed with the placeholder for the first point instead of inserting it
    # at the front afterwards.
    segment_speeds = [0.0]
    for idx in range(1, len(cumulative_distance)):
        travelled = cumulative_distance[idx] - cumulative_distance[idx - 1]
        elapsed = time_gap[idx] - time_gap[idx - 1]
        if elapsed == 0:
            # No time passed: avoid dividing by zero.
            segment_speeds.append(0)
            continue
        segment_speeds.append(travelled / (elapsed / 3600))
    return segment_speeds

# Locate consecutive samples that share the same time value
def find_zero_time_diff_indices(time_gap):
    """Return the positions ``i`` where ``time_gap[i + 1] == time_gap[i]``.

    The positions index into the first-order differences of ``time_gap``
    and are returned as a plain Python list.
    """
    is_zero_step = np.diff(time_gap) == 0
    (positions,) = np.nonzero(is_zero_step)[:1]
    return positions.tolist()

# Apply feature engineering
def _safe_stat(stat_fn, values):
    """Return ``stat_fn(values)``, or NaN when ``values`` is empty."""
    return stat_fn(values) if len(values) else np.nan


def feature_engineering(df):
    """Add per-trip distance, speed and calendar features to ``df``.

    Reads the columns ``lats``, ``lngs``, ``time_gap``, ``timeID``,
    ``dateID``, ``dist`` and ``time``. New columns are written onto ``df``
    itself (the input is modified in place) and the same object is
    returned.

    Trips with fewer than two points have no distance steps; their
    ``*_distance_diff`` statistics are NaN instead of raising
    ``ValueError`` from ``np.max`` / ``np.min`` on an empty array.
    Trips whose ``time`` is zero get a NaN ``average_speed`` rather than
    +/-inf.
    """
    # Iterate the needed columns directly; iterrows() would build a full
    # Series per row only to read two fields from it.
    df['cumulative_distance'] = Parallel(n_jobs=-1)(
        delayed(calculate_cumulative_distance)(lats, lngs)
        for lats, lngs in zip(df['lats'], df['lngs'])
    )
    df['segmented_speeds'] = Parallel(n_jobs=-1)(
        delayed(calculate_segmented_speeds)(cumulative, gaps)
        for cumulative, gaps in zip(df['cumulative_distance'], df['time_gap'])
    )
    df['zero_time_diff_indices'] = df['time_gap'].apply(find_zero_time_diff_indices)

    # Extracting statistical features from lists
    df['avg_segmented_speed'] = df['segmented_speeds'].apply(np.mean)
    df['max_segmented_speed'] = df['segmented_speeds'].apply(np.max)
    df['min_segmented_speed'] = df['segmented_speeds'].apply(np.min)
    df['std_segmented_speed'] = df['segmented_speeds'].apply(np.std)

    # Compute the per-step distances once per trip and reuse them for all
    # four statistics.
    step_distances = [np.diff(cumulative) for cumulative in df['cumulative_distance']]
    df['avg_distance_diff'] = [_safe_stat(np.mean, steps) for steps in step_distances]
    df['max_distance_diff'] = [_safe_stat(np.max, steps) for steps in step_distances]
    df['min_distance_diff'] = [_safe_stat(np.min, steps) for steps in step_distances]
    df['std_distance_diff'] = [_safe_stat(np.std, steps) for steps in step_distances]

    # NOTE(review): assumes timeID is a minute-of-day value and dateID is a
    # day counter -- confirm against the dataset description.
    df['time_of_day'] = df['timeID'] // 60  # Convert minutes to hours
    df['day_of_week'] = df['dateID'] % 7

    # Calculate average speed over the entire trip (distance per hour).
    # NOTE(review): this is derived from 'time', which the script below
    # uses as the regression target, so using it as a model input leaks
    # the label.
    trip_hours = df['time'] / 3600
    df['average_speed'] = (df['dist'] / trip_hours).replace([np.inf, -np.inf], np.nan)

    return df

df = feature_engineering(df)

# Define features and target variable
# NOTE(review): 'average_speed' is computed as dist / time, and 'time' is
# the target below, so this feature leaks the label into the inputs. The
# metrics printed by this script should not be trusted until it is dropped
# or replaced.
features = [
    'avg_segmented_speed', 'max_segmented_speed', 'min_segmented_speed', 'std_segmented_speed',
    'avg_distance_diff', 'max_distance_diff', 'min_distance_diff', 'std_distance_diff',
    'time_of_day', 'day_of_week', 'average_speed'
]
X = df[features]
y = df['time']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base LightGBM parameters; the device is decided when fitting.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'n_estimators': 100,
    'learning_rate': 0.1,
}


def _fit_with_gpu_fallback(base_params, X_fit, y_fit):
    """Fit an LGBMRegressor on GPU, falling back to CPU if that fails.

    lightgbm has no ``lgb.cuda.is_available()`` helper, so GPU support is
    probed by attempting to train with ``device='gpu'``; a LightGBMError
    (e.g. the build has no GPU support, or no device is found) triggers a
    CPU retry. Returns ``(fitted_model, device_name)``.
    """
    try:
        gpu_model = lgb.LGBMRegressor(**base_params, device='gpu')
        gpu_model.fit(X_fit, y_fit)
        return gpu_model, 'gpu'
    except lgb.basic.LightGBMError:
        cpu_model = lgb.LGBMRegressor(**base_params, device='cpu')
        cpu_model.fit(X_fit, y_fit)
        return cpu_model, 'cpu'


# Train the model
model, device_used = _fit_with_gpu_fallback(params, X_train, y_train)
params['device'] = device_used  # record which device was actually used

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

# Select an example from the test set for prediction
example_index = X_test.index[0]
# Keep the example as a one-row DataFrame so the model receives the same
# 2-D, named-column layout it was trained on.
example_X = X_test.iloc[[0]]
example_y_true = y_test.iloc[0]

# Make prediction
example_y_pred = model.predict(example_X)[0]

print(f'Example trip ID: {df.loc[example_index, "trip_id"]}')
print(f'Predicted ETA: {example_y_pred}')
print(f'Actual ETA: {example_y_true}')

# End timing the script
end_time = time.time()
print(f'Total script runtime: {end_time - start_time} seconds')
