In [1]:
import pandas as pd
from pathlib import Path
import json
from pprint import pprint
import math
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm

DATA_DIR = Path("data/benchmarking_2")

In [2]:
def get_in_road_percentage(steps_df: pd.DataFrame) -> float:
    ret = steps_df["out_of_road"].value_counts(normalize=True).at[False]
    return ret

In [3]:
def get_n_sidewalk_crashes(steps_df: pd.DataFrame) -> int:
    """Count number of crash episodes to not count same crash multiple times"""
    try:
        # count number of times "crash" becomes True for some time
        n_crashes = steps_df["crash_sidewalk"].diff().value_counts().at[True]

        # need to divide by 2 beacouse diff counts twice
        n_crashes /= 2

        # just in case crash is last episode and we have 3.5 crash episodes make it 4
        n_crashes = math.ceil(n_crashes)
    except KeyError:
        n_crashes = 0

    return n_crashes

In [4]:
def process_steps(steps_infos: list) -> dict:
    """Accepts a list of steps and returns a dict of interesting data"""

    steps_df = pd.DataFrame(steps_infos)
    steps_data = {}
    last_step = steps_df.iloc[-1]

    steps_data = {
        "termination.arrive_dest": last_step["arrive_dest"],
        "termination.timeout": last_step["max_step"],
        "route_completion": last_step["route_completion"],
        "seed": last_step["env_seed"],
        "in_road_percentage": get_in_road_percentage(steps_df),
        "n_sidewalk_crashes": get_n_sidewalk_crashes(steps_df),
    }

    return steps_data

In [5]:
def get_scenarios_df(dir: Path):
    paths = list(dir.glob("*/*.json"))

    scenarios = []
    for file_path in tqdm(paths):

        with open(file_path, "r") as f:
            scenario_data = json.load(f)

        dir = file_path.__str__().split("/")[-2]
        _, dr, _, dt = dir.split("_")
        scenario_data["decision_repeat"] = int(dr)
        scenario_data["dt"] = float(dt)

        steps_infos = scenario_data.pop("steps_infos")
        scenario_data.update(process_steps(steps_infos))
        scenarios.append(scenario_data)
    return pd.DataFrame(scenarios)

In [6]:
df = get_scenarios_df(DATA_DIR)

100%|██████████| 1200/1200 [00:06<00:00, 189.66it/s]


In [7]:
df = df.set_index(
    [
        "dt",
        "decision_repeat",
        "seed",
    ],
    verify_integrity=True,
)

df = df.sort_index()

In [8]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,def.map_seq,def.bv_data,def.max_steps,start_ts,initialized_ts,scenario_done_ts,init_time,scenario_time,total_time,n_crashed_vehicles,termination.arrive_dest,termination.timeout,route_completion,in_road_percentage,n_sidewalk_crashes
dt,decision_repeat,seed,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0.02,5,0,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",3364,14297.994562,14298.385102,14335.377040,0.390540,36.991938,37.382478,0,True,False,0.993165,1.000000,0
0.02,5,1,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",1892,14497.187548,14497.460049,14568.143965,0.272501,70.683916,70.956416,2,False,True,0.347422,1.000000,0
0.02,5,2,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",3574,14664.514194,14664.816701,14703.971781,0.302506,39.155080,39.457586,0,True,False,0.993434,1.000000,0
0.02,5,3,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",2092,14936.599530,14936.911753,14964.727519,0.312222,27.815767,28.127989,0,True,False,0.989208,1.000000,0
0.02,5,4,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",2815,15083.427330,15083.768677,15122.644177,0.341347,38.875499,39.216847,1,True,False,0.990902,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.04,20,95,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",423,11750.692365,11751.003348,11751.365077,0.310982,0.361729,0.672712,0,False,False,0.159905,0.973684,1
0.04,20,96,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",330,11779.477297,11779.781817,11780.598203,0.304520,0.816386,1.120907,1,True,False,0.991643,1.000000,3
0.04,20,97,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",352,11812.076223,11812.620152,11812.769872,0.543929,0.149720,0.693649,0,False,False,0.132175,0.937500,1
0.04,20,98,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",342,11851.082107,11851.520293,11852.703743,0.438186,1.183450,1.621636,3,True,False,0.995902,1.000000,0


In [9]:
# calculate driving score
# ! Problem changing the values here in the analysis can change % error
def calculate_driving_score(df):
    df["driving_score"] = (
        df["route_completion"]
        * df["in_road_percentage"]
        * (0.65) ** df["n_sidewalk_crashes"]
        * (0.60) ** df["n_crashed_vehicles"]
    )
    return df


df = calculate_driving_score(df)

In [10]:
df["driving_score"].describe()

count    1200.000000
mean        0.557781
std         0.332933
min         0.001882
25%         0.265636
50%         0.594773
75%         0.990249
max         1.000638
Name: driving_score, dtype: float64

## Getting scenario featuers


In [11]:
def get_features(col_name):
    x = df[col_name]
    x = x.reset_index(drop=True)
    x = pd.json_normalize(x)
    x = pd.concat(
        [pd.json_normalize(x[col]).add_prefix(f"{col}.") for col in x], axis=1
    )
    # convert lists to strings
    x = x.map(lambda x: ",".join(x) if isinstance(x, list) else x)
    x = pd.get_dummies(x)
    x = x.add_prefix(f"{col_name}.")
    return x

In [12]:
def_columns = ["def.map_seq", "def.bv_data"]


map_feat = get_features("def.map_seq")
bv_feat = get_features("def.bv_data")

S = map_feat.join(bv_feat).to_numpy()

In [13]:
S.shape

(1200, 1309)

## Get fid vector


In [14]:
import numpy as np

fid = df.index.droplevel("seed")
fid = fid.to_numpy()

fid = np.vstack(fid)

fid.shape, fid,

((1200, 2),
 array([[ 0.02,  5.  ],
        [ 0.02,  5.  ],
        [ 0.02,  5.  ],
        ...,
        [ 0.04, 20.  ],
        [ 0.04, 20.  ],
        [ 0.04, 20.  ]]))

### Adding to scenario data


In [15]:
X = np.hstack((fid, S))
X.shape

(1200, 1311)

## Get DScore vector


In [16]:
y = df["driving_score"].to_numpy()
y.shape, y

((1200,),
 array([0.99316503, 0.12507196, 0.99343402, ..., 0.08054419, 0.21511478,
        0.38666393]))

# Fitting GPR


### Handling NaNs


In [47]:
X[pd.isna(X)] = -1
X

array([[0.02, 5.0, 75.58282470703125, ..., False, False, False],
       [0.02, 5.0, -1, ..., False, False, False],
       [0.02, 5.0, -1, ..., False, False, False],
       ...,
       [0.04, 20.0, -1, ..., False, False, False],
       [0.04, 20.0, -1, ..., False, False, False],
       [0.04, 20.0, -1, ..., False, False, False]], dtype=object)

In [48]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import train_test_split

In [60]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=0, shuffle=True
)

y_test.shape

(120,)

In [61]:
kernel = 1 * RBF()
gaussian_process = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
gaussian_process.fit(X_train, y_train)
gaussian_process.kernel_

0.638**2 * RBF(length_scale=0.0135)

In [69]:
mean_prediction, std_prediction = gaussian_process.predict(X_test, return_std=True)

mean_prediction, std_prediction

(array([ 0.0240111 ,  0.01662964,  0.05744131,  0.16288696,  0.46188753,
         0.54357702,  0.61222045,  0.14169585,  0.36843489,  0.69329385,
         0.31340557, -0.14531149,  0.04643631,  0.06223434,  0.08347742,
         0.24441599, -0.14333465,  0.66742837, -0.03917267,  0.23515189,
         0.        ,  0.19858977,  0.16288696, -0.14481927,  0.50746854,
         0.08664729,  0.70910537,  0.63827417,  0.61454013, -0.36047707,
         0.50652703,  0.75432066,  0.61623991,  0.54279063,  0.45428789,
         0.3295611 ,  0.37006182, -0.10750587,  0.61466575,  0.71424538,
         0.19759225,  1.07701895,  0.40686704,  0.55503983,  0.54485993,
         0.22718029,  0.34112886, -0.31622496, -0.02404514,  0.69872637,
         0.67776344,  0.1980185 , -0.18746139,  0.75432066,  0.61687844,
         0.31340557,  0.22069013,  0.17563622,  0.75486968,  0.1413926 ,
         0.84314034,  0.9048062 ,  0.04112913,  0.45298107,  0.11886325,
         0.        ,  0.76743365,  0.39620822,  0.4

In [70]:
gaussian_process.score(X_test, y_test)

-0.7127280235620792

In [71]:
from sklearn.model_selection import cross_val_score


scores = cross_val_score(
    GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9), X, y, cv=10
)
scores

In [53]:
print(
    "%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std())
)

-1.82 accuracy with a standard deviation of 2.51
