In [22]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/int20h-2023-hackathon/nodes.csv
/kaggle/input/int20h-2023-hackathon/nodes_test.csv
/kaggle/input/int20h-2023-hackathon/orders.csv
/kaggle/input/int20h-2023-hackathon/test.csv
/kaggle/input/int20h-2023-hackathon/final_test.csv


In [23]:
orders = pd.read_csv("/kaggle/input/int20h-2023-hackathon/orders.csv")
nodes  = pd.read_csv("/kaggle/input/int20h-2023-hackathon/nodes.csv")
nodes = nodes[nodes["Id"].isin(orders.Id)]

In [24]:
#Fill NA with mean
nodes.speed.fillna(nodes.speed.mean(), inplace=True)

nodes.speed = nodes.speed / 3.6 # From km/h to m/s
nodes['expected_node_time'] = nodes.distance / nodes.speed

orders = orders.merge(nodes.groupby('Id').sum()['expected_node_time'], left_on='Id', right_index=True).rename({'expected_node_time': 'expected_time'}, axis=1)

In [25]:
orders.running_time = pd.to_datetime(orders.running_time)
orders.completed_time = pd.to_datetime(orders.completed_time)

In [26]:
orders.running_time = orders.running_time.dt.hour * 60 + orders.running_time.dt.minute
orders.completed_time = orders.completed_time.dt.hour * 60 + orders.completed_time.dt.minute

apply_rbf = lambda X: rbf_kernel(X, [[450]], gamma=0.00002) + rbf_kernel(X, [[1050]], gamma=0.000006)

In [27]:
orders = orders.sort_values('running_time')
orders.loc[(orders['running_time'] > 1300) & (orders['completed_time'] < 1000), 'completed_time'] = 1440 + orders.loc[(orders['running_time'] > 1300) & (orders['completed_time'] < 1000), 'completed_time']

In [28]:
# Using the information about latest 8 rides (at the time of a ride start)
# Comparing the true target and the expected target calculated using distance/speed from nodes file
# This will improve the prediction of a speed

latest8_list = []

for ind, row in orders.iterrows():    
    latest8 = orders.loc[orders.completed_time < row.running_time].tail(8).mean()
    latest8_value = (latest8.delta_time - latest8.expected_time) / latest8.delta_time + 1
    latest8_list.append(latest8_value)


latest8_list[:8] = [1] * 8
orders['latest8'] = latest8_list
orders['expected_time'] = orders.latest8 * orders.expected_time

In [29]:
df = pd.merge(orders, nodes, left_on='Id', right_on='Id')
df.drop(['node_start', 'node_finish'], axis=1, inplace=True)

# One order has 0 distance, removed it from the set
df = df.loc[df['route_distance_km'] != 0]

In [30]:
df['node_part_time'] = df['expected_node_time'] / df['expected_time']
df['node_part_distance'] = df['distance'] / df['route_distance_km'] / 1000

In [31]:
df['current_time'] = df.groupby('Id').expected_node_time.transform(np.cumsum) / 60 + df.running_time

df['time_rbf'] = apply_rbf(df[['current_time']])

In [32]:
df['delta_time'] = df['delta_time'] * df['node_part_time']
df.set_index('Id', inplace=True)

In [33]:
X = df.drop(['expected_time', 'delta_time', 'running_time', 'completed_time' ,'current_time'], axis=1)
y = df['delta_time']

In [13]:
test_ind = np.random.choice(X.index.unique(), size=1000, replace=False)
X_test = X.loc[test_ind]
y_test = y.loc[test_ind]
X_train = X.loc[~X.index.isin(test_ind)]
y_train = y.loc[~X.index.isin(test_ind)]

In [14]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor()

In [15]:
y_pred_node = rf.predict(X_test)
mean_squared_error(y_pred_node, y_test, squared=False)

2.3565900305778023

In [16]:
y_pred_node = pd.Series(y_pred_node, index=y_test.index)
y_pred_total = y_pred_node.groupby('Id').sum()
y_test_total = y_test.groupby('Id').sum()
mean_squared_error(y_pred_total, y_test_total, squared=False)

89.02716025029565

We can see that the RMSE is quite good(hopefully)
So we can

In [59]:
test_orders = pd.read_csv('/kaggle/input/int20h-2023-hackathon/final_test.csv')
test_nodes = pd.read_csv('/kaggle/input/int20h-2023-hackathon/nodes_test.csv')

In [60]:
test_nodes

Unnamed: 0,Id,node_start,node_finish,distance,speed
0,6198,8952394129,8952394128,138.795710,37.0
1,6198,2059503754,4548172320,95.273001,33.0
2,6198,2059504508,2059503754,137.647881,35.0
3,6198,1570776534,1977018578,4.383708,30.0
4,6198,1977018580,1977018576,24.195593,31.0
...,...,...,...,...,...
102491,527850,290891780,5957304897,40.933260,45.0
102492,527850,5957304897,3902949792,63.421598,34.0
102493,527850,3902949792,290404192,4.303810,16.0
102494,527850,290404192,5957304888,5.250640,37.0


In [61]:
#Fill NA with mean
test_nodes.speed.fillna(test_nodes.speed.mean(), inplace=True)

test_nodes.speed = test_nodes.speed / 3.6 # From km/h to m/s
test_nodes['expected_node_time'] = test_nodes.distance / test_nodes.speed

test_orders = test_orders.merge(test_nodes.groupby('Id').sum()['expected_node_time'], left_on='Id', right_index=True).rename({'expected_node_time': 'expected_time'}, axis=1)

test_orders.running_time = pd.to_datetime(test_orders.running_time)

test_orders.running_time = test_orders.running_time.dt.hour * 60 + test_orders.running_time.dt.minute

test_orders = test_orders.sort_values('running_time')

latest8_list = []

for ind, row in test_orders.iterrows():    
    latest8 = orders.loc[orders.completed_time < row.running_time].tail(8).mean()
    latest8_value = (latest8.delta_time - latest8.expected_time) / latest8.delta_time + 1
    latest8_list.append(latest8_value)


latest8_list[:8] = [1] * 8
test_orders['latest8'] = latest8_list
test_orders['expected_time'] = test_orders.latest8 * test_orders['expected_time']

test_df = pd.merge(test_orders, test_nodes, left_on='Id', right_on='Id')
test_df.drop(['node_start', 'node_finish'], axis=1, inplace=True)

# One order has 0 distance, removed it from the set
test_df = test_df.loc[test_df['route_distance_km'] != 0]

test_df['node_part_time'] = test_df['expected_node_time'] / test_df['expected_time']
test_df['node_part_distance'] = test_df['distance'] / test_df['route_distance_km'] / 1000


test_df['current_time'] = test_df.groupby('Id').expected_node_time.transform(np.cumsum) / 60 + test_df.running_time
test_df['time_rbf'] = apply_rbf(test_df[['current_time']])

test_df.set_index('Id', inplace=True)

In [62]:
X_nodes = df.drop(['expected_time', 'running_time' ,'current_time', 'completed'], axis=1)

In [63]:
X_nodes

Unnamed: 0_level_0,completed_time,route_distance_km,delta_time,latest8,distance,speed,expected_node_time,node_part_time,node_part_distance,time_rbf
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1232212655921391683,40,8.023,1.138437,1.000000,8.071906,7.222222,1.117649,0.001751,0.001006,0.031318
1232212655921391683,40,8.023,6.260332,1.000000,54.631245,8.888889,6.146015,0.009631,0.006809,0.031371
1232212655921391683,40,8.023,6.153574,1.000000,85.583772,14.166667,6.041207,0.009467,0.010667,0.031424
1232212655921391683,40,8.023,15.087167,1.000000,74.058342,5.000000,14.811668,0.023211,0.009231,0.031552
1232212655921391683,40,8.023,0.242481,1.000000,3.174041,13.333333,0.238053,0.000373,0.000396,0.031554
...,...,...,...,...,...,...,...,...,...,...
4378890784988522255,1444,4.739,14.759764,1.250794,80.225910,10.000000,8.022591,0.028439,0.016929,0.405027
4378890784988522255,1444,4.739,13.561506,1.250794,71.665261,9.722222,7.371284,0.026130,0.015122,0.404795
4378890784988522255,1444,4.739,30.910351,1.250794,154.010596,9.166667,16.801156,0.059558,0.032499,0.404267
4378890784988522255,1444,4.739,13.212848,1.250794,61.843045,8.611111,7.181773,0.025458,0.013050,0.404041
