In [319]:
import pandas as pd
import json

# Show every row whenever a DataFrame is displayed (disables row truncation).
pd.set_option('display.max_rows', None)

# Load the JSON file. The encoding is passed explicitly so the read does not
# depend on the platform's default text encoding.
with open('public_cases.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the nested structure: each record is the entry's 'input' mapping plus
# its 'expected_output'. A fresh dict is built per entry so the objects held in
# `data` are not modified as a side effect.
records = [
    {**entry['input'], 'expected_output': entry['expected_output']}
    for entry in data
]

# Create DataFrame (one row per case, one column per key)
df = pd.DataFrame(records)

def ends_with_49(x):
    """Report whether the fractional part of ``x`` is .49.

    The fractional part is taken with ``x % 1`` and rounded to two decimals
    before comparing, which absorbs binary floating-point noise
    (``21.49 % 1`` evaluates to ``0.4899999...``).
    """
    fractional_part = round(x % 1, 2)
    return fractional_part == 0.49

def ends_with_99(x):
    """Report whether the fractional part of ``x`` is .99.

    The fractional part is taken with ``x % 1`` and rounded to two decimals
    before comparing, which absorbs binary floating-point noise.
    """
    fractional_part = round(x % 1, 2)
    return fractional_part == 0.99

def ends_with_6(x):
    """Report whether the integer part of ``x`` has 6 as its last digit.

    ``x`` is truncated with ``int`` first, so ``76.0`` and ``6.9`` both match.
    """
    last_digit = int(x) % 10
    return last_digit == 6

# --- Ratio features ----------------------------------------------------------
# Column-wise division replaces the per-row Python loops. A zero denominator
# now produces inf (or NaN for 0/0) in that row instead of aborting the whole
# cell with ZeroDivisionError.
df['miles_traveled_per_day'] = df['miles_traveled'] / df['trip_duration_days']
df['total_receipts_amount_per_day'] = df['total_receipts_amount'] / df['trip_duration_days']
df['total_receipts_amount_per_mile'] = df['total_receipts_amount'] / df['miles_traveled']

# --- Receipt-ending flags (1/0) ----------------------------------------------
# The helpers are applied element-wise so the rounding is done by Python's
# round(), exactly as inside ends_with_49 / ends_with_99.
df['ends_with_49'] = df['total_receipts_amount'].map(ends_with_49).astype('int64')
df['ends_with_99'] = df['total_receipts_amount'].map(ends_with_99).astype('int64')
# Both operands are 0/1 integer columns, so bitwise OR is the logical "either".
df['ends_with_49_99'] = df['ends_with_49'] | df['ends_with_99']

# --- Trip-length indicator flags (1/0) ---------------------------------------
# Columns are created in the same order as before so the displayed frame keeps
# its layout: day_5, day_6, day_5_or_6, day_7 ... day_13, day_7_or_8.
df['day_5'] = (df['trip_duration_days'] == 5).astype('int64')
df['day_6'] = (df['trip_duration_days'] == 6).astype('int64')
df['day_5_or_6'] = df['trip_duration_days'].isin([5, 6]).astype('int64')
for day_count in range(7, 14):
    df[f'day_{day_count}'] = (df['trip_duration_days'] == day_count).astype('int64')
df['day_7_or_8'] = df['trip_duration_days'].isin([7, 8]).astype('int64')

# Remainder of the receipt amount modulo 0.50, scaled by 100.
# The column name is spelled 'recidual' (sic); it is kept because the name is
# part of the frame's schema.
df['recidual'] = df['total_receipts_amount'].map(lambda amount: round(amount % 0.5, 2) * 100)

# --- Range / threshold flags (1/0); all bounds are exclusive -----------------
df['sweet_miles_traveled_per_day'] = (
    (df['miles_traveled_per_day'] > 180) & (df['miles_traveled_per_day'] < 220)
).astype('int64')
df['medium_high_spending'] = (
    (df['total_receipts_amount'] > 600) & (df['total_receipts_amount'] < 800)
).astype('int64')
df['bonus'] = (
    (df['miles_traveled_per_day'] > 180) & (df['total_receipts_amount_per_day'] < 100)
).astype('int64')
# NOTE(review): the column is named 900_1050 but the upper bound tested is 1100.
# One of the two is presumably a leftover; confirm which range is intended.
df['miles_900_1050'] = (
    (df['miles_traveled'] > 900) & (df['miles_traveled'] < 1100)
).astype('int64')
df['ends_with_6'] = df['miles_traveled'].map(ends_with_6).astype('int64')


# Display first few rows
df.head(10)

Unnamed: 0,trip_duration_days,miles_traveled,total_receipts_amount,expected_output,miles_traveled_per_day,total_receipts_amount_per_day,total_receipts_amount_per_mile,ends_with_49,ends_with_99,ends_with_49_99,...,day_11,day_12,day_13,day_7_or_8,recidual,sweet_miles_traveled_per_day,medium_high_spending,bonus,miles_900_1050,ends_with_6
0,3,93.0,1.42,364.51,31.0,0.473333,0.015269,0,0,0,...,0,0,0,0,42.0,0,0,0,0,0
1,1,55.0,3.6,126.06,55.0,3.6,0.065455,0,0,0,...,0,0,0,0,10.0,0,0,0,0,0
2,1,47.0,17.97,128.91,47.0,17.97,0.38234,0,0,0,...,0,0,0,0,47.0,0,0,0,0,0
3,2,13.0,4.67,203.52,6.5,2.335,0.359231,0,0,0,...,0,0,0,0,17.0,0,0,0,0,0
4,3,88.0,5.78,380.37,29.333333,1.926667,0.065682,0,0,0,...,0,0,0,0,28.0,0,0,0,0,0
5,1,76.0,13.74,158.35,76.0,13.74,0.180789,0,0,0,...,0,0,0,0,24.0,0,0,0,0,1
6,3,41.0,4.52,320.12,13.666667,1.506667,0.110244,0,0,0,...,0,0,0,0,2.0,0,0,0,0,0
7,1,140.0,22.71,199.68,140.0,22.71,0.162214,0,0,0,...,0,0,0,0,21.0,0,0,0,0,0
8,3,121.0,21.17,464.07,40.333333,7.056667,0.174959,0,0,0,...,0,0,0,0,17.0,0,0,0,0,0
9,3,117.0,21.99,359.1,39.0,7.33,0.187949,0,1,1,...,0,0,0,0,49.0,0,0,0,0,0


In [321]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np


# Define features (X) and target (y)
X = df[[
    'trip_duration_days',
    'miles_traveled',
    'total_receipts_amount',
    'miles_traveled_per_day',
    'total_receipts_amount_per_day',
    'day_5',
    'day_6',
    'day_7_or_8',
    'ends_with_49_99',
    'sweet_miles_traveled_per_day',
]]
y = df['expected_output']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a GradientBoostingRegressor on the training split only
model = GradientBoostingRegressor(
    n_estimators=350,             # Number of boosting stages (trees) to perform
    learning_rate=0.05,           # Shrinks the contribution of each tree
    max_depth=3,                  # Maximum depth of the individual regression estimators
    subsample=1.0,                # Fraction of samples to be used for fitting the individual base learners
    min_samples_split=2,          # The minimum number of samples required to split an internal node
    min_samples_leaf=1,           # The minimum number of samples required to be at a leaf node
    random_state=42               # For reproducibility
)
model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Evaluate the model. The square root of the MSE is taken, so the reported
# number is the RMSE and the message is labelled accordingly.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error on test set: {rmse:.2f}")

# Store a prediction for every row. This covers the training rows as well, so
# 'predicted_output' mixes in-sample and held-out predictions.
predicted_y = model.predict(X)
df['predicted_output'] = predicted_y

Mean Squared Error on test set: 70.95


In [277]:
# Re-initialize the GradientBoostingRegressor with the same hyperparameters and
# train it on ALL rows (X, y). This rebinds `model`, `y_pred` and `rmse`.
model = GradientBoostingRegressor(
    n_estimators=350,             # Number of boosting stages (trees) to perform
    learning_rate=0.05,           # Shrinks the contribution of each tree
    max_depth=3,                  # Maximum depth of the individual regression estimators
    subsample=1.0,                # Fraction of samples to be used for fitting the individual base learners
    min_samples_split=2,          # The minimum number of samples required to split an internal node
    min_samples_leaf=1,           # The minimum number of samples required to be at a leaf node
    random_state=42               # For reproducibility
)
model.fit(X, y)

# Predict the same rows the model was just fitted on
y_pred = model.predict(X)

# The score is computed on the rows used for fitting, so it is an in-sample
# (training) RMSE, not a test-set error; the message says so explicitly.
rmse = np.sqrt(mean_squared_error(y, y_pred))
print(f"Root Mean Squared Error on training set: {rmse:.2f}")

Mean Squared Error on test set: 52.19


In [278]:
import joblib

# Persist the fitted estimator to disk; the call's return value (the list of
# written file names) is the cell's displayed output.
joblib.dump(value=model, filename='gbr_model.pkl')


['gbr_model.pkl']

In [279]:
import joblib

# Load the model from the file.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load model files that were produced locally / come from a trusted source.
loaded_model = joblib.load('gbr_model.pkl')

# Use it for prediction. This cell appends 'prediction' and 'label' to X_test
# below, so on a re-run those columns are dropped first; otherwise the frame no
# longer matches the columns the model was fitted on and predict() fails.
# NOTE(review): in file order the saved model was fitted on all rows (X, y),
# which would make these X_test predictions in-sample; confirm before reading
# them as held-out performance.
predictions = loaded_model.predict(
    X_test.drop(columns=['prediction', 'label'], errors='ignore')
)

# Attach the predictions and the true targets (aligned on the index) for inspection
X_test['prediction'] = predictions
X_test['label'] = y_test
X_test.head()

Unnamed: 0,trip_duration_days,miles_traveled,total_receipts_amount,miles_traveled_per_day,total_receipts_amount_per_day,day_5,day_6,day_7_or_8,ends_with_49_99,sweet_miles_traveled_per_day,prediction,label
521,10,1192.0,23.47,119.2,2.347,0,0,0,0,0,1215.114243,1157.87
737,12,296.0,326.83,24.666667,27.235833,0,0,0,0,0,957.006376,981.72
740,10,532.0,1223.36,53.2,122.336,0,0,0,0,0,1703.836534,1631.49
660,4,1065.0,119.34,266.25,29.835,0,0,0,0,0,814.327939,781.82
411,2,933.0,1589.58,466.5,794.79,0,0,0,0,0,1480.860471,1489.99


In [313]:
# One hand-built case: the raw inputs plus the derived feature values, listed
# in the same column order as the feature frame the model is used with.
data = [
    dict(
        trip_duration_days=10,
        miles_traveled=1192,
        total_receipts_amount=23.47,
        miles_traveled_per_day=119.2,
        total_receipts_amount_per_day=2.347,
        day_5=0,
        day_6=0,
        day_7_or_8=0,
        ends_with_49_99=0,
        sweet_miles_traveled_per_day=0,
    )
]
df_test = pd.DataFrame(data)

# predict() returns one value per row; show the single result as a plain float
float(loaded_model.predict(df_test)[0])


1215.1142430610398

In [318]:
# Print every test row whose prediction misses the label by more than the threshold.
# The loop variables use descriptive names on purpose: a `for` loop leaks its
# variables into the notebook namespace, and a loop variable called `y` would
# overwrite the target Series `y` used for splitting and fitting.
ERROR_THRESHOLD = 99  # absolute difference between prediction and label

for days, receipts, miles, miles_per_day, predicted, actual in zip(
    X_test['trip_duration_days'],
    X_test['total_receipts_amount'],
    X_test['miles_traveled'],
    X_test['miles_traveled_per_day'],
    X_test['prediction'],
    X_test['label'],
):
    if abs(predicted - actual) > ERROR_THRESHOLD:
        print(f"trip_duration_days: {days}, total_receipts_amount: {receipts}, miles_traveled: {miles}, miles_traveled_per_day: {miles_per_day}, prediction: {predicted}, actual: {actual}")


trip_duration_days: 10, total_receipts_amount: 23.47, miles_traveled: 1192.0, miles_traveled_per_day: 119.2, prediction: 1375.5916921473975, actual: 1157.87
trip_duration_days: 11, total_receipts_amount: 502.37, miles_traveled: 273.0, miles_traveled_per_day: 24.818181818181817, prediction: 961.7762016002175, actual: 862.61
trip_duration_days: 9, total_receipts_amount: 1764.97, miles_traveled: 885.0, miles_traveled_per_day: 98.33333333333333, prediction: 1796.4169030841974, actual: 1694.37
trip_duration_days: 9, total_receipts_amount: 62.12, miles_traveled: 868.0, miles_traveled_per_day: 96.44444444444444, prediction: 1159.9111371016925, actual: 1022.81
trip_duration_days: 12, total_receipts_amount: 1300.05, miles_traveled: 104.0, miles_traveled_per_day: 8.666666666666666, prediction: 1643.791857220815, actual: 1779.92
trip_duration_days: 11, total_receipts_amount: 269.95, miles_traveled: 198.0, miles_traveled_per_day: 18.0, prediction: 856.3122004998593, actual: 695.66
trip_duration_da