#### Step 1: Import Libraries

In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.ensemble import RandomForestRegressor
import plotly.graph_objects as go
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


#### Step 2: Load and Preprocess Data

In [2]:
# Load the JSON file
data = []
with open('clean_trips.json', 'r') as file:
    for line in file:
        data.append(json.loads(line))

df = pd.DataFrame(data)

# Calculate trip time in seconds
df['trip_time'] = df['time']

# Drop rows with missing values (if any)
df.dropna(inplace=True)

#### Step 3: Prepare Features and Target

In [3]:
# Feature extraction functions
def extract_features(row):
    features = {}
    features['mean_time_gap'] = np.mean(row['time_gap'])
    features['std_time_gap'] = np.std(row['time_gap'])
    features['min_time_gap'] = np.min(row['time_gap'])
    features['max_time_gap'] = np.max(row['time_gap'])
    
    features['mean_lat'] = np.mean(row['lats'])
    features['std_lat'] = np.std(row['lats'])
    features['min_lat'] = np.min(row['lats'])
    features['max_lat'] = np.max(row['lats'])
    
    features['mean_lng'] = np.mean(row['lngs'])
    features['std_lng'] = np.std(row['lngs'])
    features['min_lng'] = np.min(row['lngs'])
    features['max_lng'] = np.max(row['lngs'])
    
    features['mean_dist_gap'] = np.mean(row['dist_gap'])
    features['std_dist_gap'] = np.std(row['dist_gap'])
    features['min_dist_gap'] = np.min(row['dist_gap'])
    features['max_dist_gap'] = np.max(row['dist_gap'])
    
    return pd.Series(features)

# Apply feature extraction to each row
feature_df = df.apply(extract_features, axis=1)

# Combine features with other relevant columns
X = pd.concat([feature_df, df[['driverID', 'weekID', 'timeID', 'dateID']]], axis=1)
y = df['trip_time']


#### Step 4: Split the Data

In [4]:
# Convert non-numeric columns to numeric or drop them
non_numeric_cols = X.select_dtypes(include=['object']).columns
X[non_numeric_cols] = X[non_numeric_cols].apply(lambda col: pd.factorize(col)[0])

# Ensure all features are numeric
assert X.select_dtypes(include=['object']).empty, "There are still non-numeric columns in X"

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#### Step 5: Hyperparameter Tuning and Model Training

In [5]:
# param_grid = {
#     'n_estimators': list(range(1, 202, 50)),
#     'max_depth': list(range(1, 8)),
#     'min_samples_leaf': list(range(1, 10)),
#     'max_leaf_nodes': [None] + list(range(10, 20, 2)),
#     'random_state': [42],
#     'criterion': ['squared_error', 'absolute_error']
# }

# grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, refit=True, verbose=1, scoring='neg_root_mean_squared_error')
# grid_search.fit(X_train, y_train)

# best_params = grid_search.best_params_
# model = RandomForestRegressor(**best_params)
# model.fit(X_train, y_train)

##### Narrow down the parameter grid search

##### Only one parameter

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Recommended initial hyperparameter values
n_estimators = 100
max_depth = None  # Let the tree grow until all leaves are pure or until they contain less than min_samples_split samples
min_samples_leaf = 1  # The minimum number of samples required to be at a leaf node
max_leaf_nodes = None  # Let the trees grow until all leaves are pure or until they contain less than min_samples_split samples
random_state = 42  # Ensures reproducibility
criterion = 'squared_error'  # Default criterion for regression tasks

# Initialize and train the model with the recommended values
model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    max_leaf_nodes=max_leaf_nodes,
    random_state=random_state,
    criterion=criterion
)

model.fit(X_train, y_train)

# Evaluate the model
from sklearn.metrics import mean_squared_error

y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'Root Mean Squared Error (RMSE): {rmse}')


Root Mean Squared Error (RMSE): 11.367639492976384




#### Step 6: Evaluate the Model with Learning Curves

In [7]:
train_sizes, train_scores, valid_scores = learning_curve(
    model, X_train, y_train, train_sizes=np.linspace(0.1, 1, 10), cv=5, scoring='neg_root_mean_squared_error')

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_sizes, y=-np.mean(train_scores, axis=1), name='Training error'))
fig.add_trace(go.Scatter(x=train_sizes, y=-np.mean(valid_scores, axis=1), name='Validation error'))
fig.show()

#### Step 7: Make Predictions and Calculate Errors

In [11]:
# def estimations(test_data, model):
#     test_data['trip_time_estimate'] = model.predict(test_data.drop(columns=['trip_time']))
#     test_data['error'] = test_data['trip_time'] - test_data['trip_time_estimate']
#     test_data = test_data.query("error < 3600")
#     return test_data

# plot_df = estimations(df.loc[X_test.index], model)

# # Plotting and metrics functions can be reused as is
# plot_graphs(plot_df)
# get_metrics(plot_df)

def estimations(test_data, model, feature_names):
    test_data = test_data[feature_names]
    test_data['trip_time_estimate'] = model.predict(test_data)
    test_data['error'] = test_data['trip_time'] - test_data['trip_time_estimate']
    test_data = test_data.query("error < 3600")
    return test_data

# Get the feature names used during training
feature_names = X_train.columns.tolist()

plot_df = estimations(df.loc[X_test.index], model, feature_names)

# Assuming plot_graphs and get_metrics are defined elsewhere
plot_graphs(plot_df)
get_metrics(plot_df)

KeyError: "['mean_time_gap', 'std_time_gap', 'min_time_gap', 'max_time_gap', 'mean_lat', 'std_lat', 'min_lat', 'max_lat', 'mean_lng', 'std_lng', 'min_lng', 'max_lng', 'mean_dist_gap', 'std_dist_gap', 'min_dist_gap', 'max_dist_gap'] not in index"