In [1]:
import pandas as pd
from pathlib import Path
import json
from pprint import pprint
import math
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np

DATA_DIR = Path("data/benchmarking/nogif_1000")
DATA_DIR = Path("data/benchmarking/nocars")
SEED = 2137
np.random.seed(SEED)

In [2]:
def get_in_road_percentage(steps_df: pd.DataFrame) -> float:
    ret = steps_df["out_of_road"].value_counts(normalize=True).at[False]
    return ret

In [3]:
def get_n_sidewalk_crashes(steps_df: pd.DataFrame) -> int:
    """Count number of crash episodes to not count same crash multiple times"""
    try:
        # count number of times "crash" becomes True for some time
        n_crashes = steps_df["crash_sidewalk"].diff().value_counts().at[True]

        # need to divide by 2 beacouse diff counts twice
        n_crashes /= 2

        # just in case crash is last episode and we have 3.5 crash episodes make it 4
        n_crashes = math.ceil(n_crashes)
    except KeyError:
        n_crashes = 0

    return n_crashes

In [4]:
def process_steps(steps_infos: list) -> dict:
    """Accepts a list of steps and returns a dict of interesting data"""

    steps_df = pd.DataFrame(steps_infos)
    steps_data = {}
    last_step = steps_df.iloc[-1]

    steps_data = {
        "termination.arrive_dest": last_step["arrive_dest"],
        "termination.timeout": last_step["max_step"],
        "route_completion": last_step["route_completion"],
        "seed": last_step["env_seed"],
        "in_road_percentage": get_in_road_percentage(steps_df),
        "n_sidewalk_crashes": get_n_sidewalk_crashes(steps_df),
    }

    return steps_data

In [5]:
def get_scenarios_df(dir: Path):
    paths = list(dir.glob("*/*.json"))

    scenarios = []
    for file_path in tqdm(paths):

        with open(file_path, "r") as f:
            scenario_data = json.load(f)

        dir = file_path.__str__().split("/")[-2]
        _, dr, _, dt = dir.split("_")
        scenario_data["decision_repeat"] = int(dr)
        scenario_data["dt"] = float(dt)

        steps_infos = scenario_data.pop("steps_infos")
        scenario_data.update(process_steps(steps_infos))
        scenarios.append(scenario_data)
    return pd.DataFrame(scenarios)

In [6]:
df = get_scenarios_df(DATA_DIR)

100%|██████████| 8031/8031 [00:52<00:00, 151.77it/s]


In [7]:
df = df.set_index(
    [
        "dt",
        "decision_repeat",
        "seed",
    ],
    verify_integrity=True,
)

df = df.sort_index()

In [8]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,def.map_seq,def.bv_data,def.max_steps,start_ts,initialized_ts,scenario_done_ts,init_time,scenario_time,total_time,n_crashed_vehicles,termination.arrive_dest,termination.timeout,route_completion,in_road_percentage,n_sidewalk_crashes
dt,decision_repeat,seed,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0.02,5,0,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",3364,295.398451,295.756016,296.732468,0.357566,0.976452,1.334017,0,True,False,0.992768,1.000000,0
0.02,5,1,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",1892,305.808808,306.018085,306.512072,0.209277,0.493987,0.703264,0,True,False,0.989633,1.000000,0
0.02,5,2,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",3574,312.702427,312.918462,313.648016,0.216035,0.729554,0.945589,0,True,False,0.993424,1.000000,0
0.02,5,3,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",2092,321.409763,321.608535,322.189560,0.198772,0.581025,0.779797,0,True,False,0.989005,1.000000,0
0.02,5,4,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",2815,329.054510,329.256271,329.887509,0.201761,0.631238,0.832999,0,True,False,0.991921,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0.04,20,664,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",305,17467.084677,17467.420044,17467.856325,0.335367,0.436280,0.771647,0,True,False,0.993178,1.000000,3
0.04,20,665,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",245,17498.128562,17498.434849,17498.812548,0.306286,0.377699,0.683985,0,True,False,0.984222,1.000000,0
0.04,20,666,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",351,17529.623422,17529.960517,17530.012054,0.337095,0.051537,0.388632,0,False,False,0.159883,0.947368,1
0.04,20,667,"[{'id': 'I', 'pre_block_socket_index': None}, ...","[{'length': 4.515, 'width': 1.852, 'height': 1...",459,17563.773948,17564.276805,17564.507056,0.502857,0.230251,0.733108,0,False,False,0.416300,0.985075,3


In [9]:
# calculate driving score
# ! Problem changing the values here in the analysis can change % error
def calculate_driving_score(df):
    df["driving_score"] = (
        df["route_completion"]
        * df["in_road_percentage"]
        * (0.65) ** df["n_sidewalk_crashes"]
        * (0.60) ** df["n_crashed_vehicles"]
    )
    return df


df = calculate_driving_score(df)

In [10]:
df["driving_score"].describe()

count    8031.000000
mean        0.679699
std         0.391191
min         0.000180
25%         0.250473
50%         0.990056
75%         0.992989
max         1.002060
Name: driving_score, dtype: float64

# Getting featuers


## Getting scenario featuers


In [11]:
def get_features(col_name):
    x = df[col_name]
    x = x.reset_index(drop=True)
    x = pd.json_normalize(x)
    x = pd.concat(
        [pd.json_normalize(x[col]).add_prefix(f"{col}.") for col in x], axis=1
    )
    # convert lists to strings
    x = x.map(lambda x: ",".join(x) if isinstance(x, list) else x)
    x = pd.get_dummies(x)
    x = x.add_prefix(f"{col_name}.")
    return x

In [12]:
def_columns = ["def.map_seq", "def.bv_data"]


map_feat = get_features("def.map_seq")
bv_feat = get_features("def.bv_data")

S = map_feat.join(bv_feat).to_numpy()

In [13]:
S.shape

(8031, 163)

## Get fid vector


In [14]:
fid = df.index.droplevel("seed")
fid = fid.to_numpy()

fid = np.vstack(fid)

fid.shape, fid,

((8031, 2),
 array([[ 0.02,  5.  ],
        [ 0.02,  5.  ],
        [ 0.02,  5.  ],
        ...,
        [ 0.04, 20.  ],
        [ 0.04, 20.  ],
        [ 0.04, 20.  ]]))

### Adding to scenario data


In [15]:
X = np.hstack((fid, S))
X.shape

(8031, 165)

## Get DScore vector


In [16]:
y = df["driving_score"].to_numpy()
y.shape, y

((8031,),
 array([0.99276847, 0.98963269, 0.99342371, ..., 0.09845456, 0.11262007,
        0.17744145]))

# Fitting GPR


### Handling NaNs


In [17]:
X[pd.isna(X)] = -1
X.shape

(8031, 165)

In [18]:
X.shape[0] / 12

669.25

In [19]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut, ShuffleSplit
from sklearn.metrics import r2_score

## LOO testing


In [20]:
# splitter = LeaveOneOut()
# splitter.get_n_splits(X)

splitter = ShuffleSplit(test_size=100, n_splits=5, random_state=SEED)
splitter.get_n_splits(X)

5

In [21]:
data = []

for i, (train_index, test_index) in enumerate(splitter.split(X)):
    kernel = RBF(length_scale_bounds=(1e-5, 1e5))
    gaussian_process = GaussianProcessRegressor(
        kernel=kernel, n_restarts_optimizer=9, random_state=SEED
    )

    gaussian_process.fit(X[train_index], y[train_index])

    y_pred, y_std = gaussian_process.predict(X[test_index], return_std=True)
    y_true = y[test_index]
    # print(
    #     f"For index: \n{test_index} we predicted \n{y_pred} +- \n{y_std} but True value is: \n{y_true}"
    # )
    print(".")
    data.append([test_index, y_true, y_pred, y_std])

.
.
.
.
.


In [22]:
r2_scores = []

for d in data:
    array = np.squeeze(d).T
    # print(array)
    results = pd.DataFrame(array, columns=["index", "y_true", "y_pred", "std"])
    results["y_pred"] = results["y_pred"].clip(0, 1)
    r2_scores.append(r2_score(results["y_true"], results["y_pred"]))


r2_scores = np.array(r2_scores)
print(f"r2_scores = {r2_scores}")
print(f"{r2_scores.mean() = :.2f}")

r2_scores = [ 0.25069163  0.37942173  0.39596091 -0.12199026  0.21485918]
r2_scores.mean() = 0.22


In [23]:
results = pd.DataFrame(
    np.squeeze(data[0]).T, columns=["index", "y_true", "y_pred", "std"]
)
results["y_pred"] = results["y_pred"].clip(0, 1)
# results

In [24]:
r2_score(results["y_true"], results["y_pred"])

0.2506916281883248

In [25]:
rmse = (results["y_true"] - results["y_pred"]).abs()
rmse.mean()

0.25358348689548127

In [26]:
results

Unnamed: 0,index,y_true,y_pred,std
0,3000.0,0.988883,1.000000,0.238961
1,2496.0,0.075014,0.000000,0.382445
2,6712.0,0.994878,0.727332,0.382445
3,1422.0,0.273446,0.049487,0.865862
4,4014.0,0.995162,0.575014,0.238961
...,...,...,...,...
95,842.0,0.646564,0.478707,0.382445
96,3982.0,0.115380,0.620673,0.238961
97,6229.0,0.986503,0.719486,0.382445
98,7288.0,0.031577,0.000000,0.382445
