In [1]:
import pandas as pd
from pathlib import Path
import json
from pprint import pprint
import math
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
from functools import cache


DATA_DIR = Path("data/benchmarking/nogif_1000")
DATA_DIR = Path("data/benchmarking/nocars")
SEED = 123
np.random.seed(SEED)

In [2]:
def get_in_road_percentage(steps_df: pd.DataFrame) -> float:
    ret = steps_df["out_of_road"].value_counts(normalize=True).at[False]
    return ret

In [3]:
def get_n_sidewalk_crashes(steps_df: pd.DataFrame) -> int:
    """Count number of crash episodes to not count same crash multiple times"""
    try:
        # count number of times "crash" becomes True for some time
        n_crashes = steps_df["crash_sidewalk"].diff().value_counts().at[True]

        # need to divide by 2 beacouse diff counts twice
        n_crashes /= 2

        # just in case crash is last episode and we have 3.5 crash episodes make it 4
        n_crashes = math.ceil(n_crashes)
    except KeyError:
        n_crashes = 0

    return n_crashes

In [4]:
def process_steps(steps_infos: list) -> dict:
    """Accepts a list of steps and returns a dict of interesting data"""

    steps_df = pd.DataFrame(steps_infos)
    steps_data = {}
    last_step = steps_df.iloc[-1]

    steps_data = {
        "termination.arrive_dest": last_step["arrive_dest"],
        "termination.timeout": last_step["max_step"],
        "route_completion": last_step["route_completion"],
        "seed": last_step["env_seed"],
        "in_road_percentage": get_in_road_percentage(steps_df),
        "n_sidewalk_crashes": get_n_sidewalk_crashes(steps_df),
    }

    return steps_data

In [5]:
@cache
def get_scenarios_df(dir: Path):
    paths = list(dir.glob("*/*.json"))

    scenarios = []
    for file_path in tqdm(paths):

        with open(file_path, "r") as f:
            scenario_data = json.load(f)

        dir = file_path.__str__().split("/")[-2]
        _, dr, _, dt = dir.split("_")
        scenario_data["decision_repeat"] = int(dr)
        scenario_data["dt"] = float(dt)

        steps_infos = scenario_data.pop("steps_infos")
        scenario_data.update(process_steps(steps_infos))
        scenarios.append(scenario_data)
    return pd.DataFrame(scenarios)

In [6]:
df = get_scenarios_df(DATA_DIR)

100%|██████████| 5202/5202 [00:23<00:00, 217.00it/s]


In [7]:
df = df.set_index(
    [
        "dt",
        "decision_repeat",
        "seed",
    ],
    verify_integrity=True,
)

df = df.sort_index()

In [8]:
idx = pd.IndexSlice

In [9]:
# MAX_SCENARIOS = 100
# df = df.loc[idx[:, :, :MAX_SCENARIOS]]

In [10]:
# calculate driving score
# ! Problem changing the values here in the analysis can change % error
def calculate_driving_score(df):
    df["driving_score"] = (
        df["route_completion"]
        * df["in_road_percentage"]
        * (0.65) ** df["n_sidewalk_crashes"]
        * (0.60) ** df["n_crashed_vehicles"]
    )
    return df


df = calculate_driving_score(df)

In [11]:
df["driving_score"].describe()

count    5202.000000
mean        0.561996
std         0.343338
min         0.001882
25%         0.233103
50%         0.595016
75%         0.990899
max         1.000638
Name: driving_score, dtype: float64

# Getting featuers


In [12]:
def drop_boring_columns(df):
    # if a column has one elements its boring lets drop it
    for series_name, series in df.items():
        if len(series.unique()) == 1:
            df = df.drop(series_name, axis=1)
            print(f"Dropped: {series_name}")

    return df

In [13]:
scenario_columns = ["def.map_seq", "def.bv_data"]
scenario_df = df[scenario_columns]

map_df = pd.json_normalize(scenario_df["def.map_seq"])
normalized_map_df = pd.concat(
    [pd.json_normalize(map_df[col]).add_prefix(f"{col}.") for col in map_df], axis=1
)

normalized_map_df = drop_boring_columns(normalized_map_df)
normalized_map_df

Dropped: 0.id
Dropped: 0.pre_block_socket_index
Dropped: 1.pre_block_socket_index


Unnamed: 0,1.angle,1.radius,1.length,1.dir,1.id,1.change_lane_num,1.t_type,1.decrease_increase,1.inner_radius,1.exit_radius,...,5.radius,5.length,5.dir,5.id,5.pre_block_socket_index,5.change_lane_num,5.t_type,5.decrease_increase,5.inner_radius,5.exit_radius
0,125.061348,56.134972,75.582825,1.0,C,,,,,,...,40.848209,58.112240,0.0,C,4X-socket0,,,,,
1,,10.000000,,,T,0.0,1.0,0.0,,,...,,65.340332,,S,4T-socket1,,,,,
2,,10.000000,,,T,0.0,0.0,0.0,,,...,,26.290545,,r,4C-socket0,,,,,
3,,10.000000,,,X,0.0,,1.0,,,...,10.000000,,,T,4C-socket0,0.0,1.0,1.0,,
4,,10.000000,,,T,0.0,2.0,1.0,,,...,,28.557396,,R,4T-socket2,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5197,,10.000000,,,T,0.0,0.0,0.0,,,...,29.888020,45.586308,0.0,C,4X-socket0,,,,,
5198,60.000000,,,,O,,,,36.541653,12.180551,...,,,,O,4R-socket0,,,,20.94009,6.98003
5199,69.473236,34.517368,50.876991,0.0,C,,,,,,...,,28.330769,,R,4T-socket2,,,,,
5200,103.999573,47.944279,66.222031,1.0,C,,,,,,...,10.000000,,,T,4C-socket0,0.0,2.0,1.0,,


In [14]:
n_occurences = normalized_map_df.count()
best_featuers = n_occurences.sort_values(ascending=False)

N_FEATURES = 20
best_featuers = best_featuers.iloc[:N_FEATURES].index.to_list()
best_featuers

best_featuers
#pick best featuers
S = normalized_map_df[best_featuers] 

# leave as it is
S = normalized_map_df

In [15]:
fid = df.reset_index()[["dt", "decision_repeat"]]
# fid

## Input factorisation


In [16]:
X = pd.concat([fid, S], axis=1)
X


X = X.apply(lambda c: pd.factorize(c)[0] if c.dtype == object else c, axis=0)

In [17]:
y = df.reset_index()["driving_score"]
y

0       0.993165
1       0.125072
2       0.993434
3       0.989208
4       0.594541
          ...   
5197    0.213857
5198    0.595677
5199    0.073317
5200    0.063543
5201    0.214906
Name: driving_score, Length: 5202, dtype: float64

# Fitting GPR


### Handling NaNs


In [18]:
X[pd.isna(X)] = -1
X.shape, y.shape

((5202, 56), (5202,))

In [19]:
X.shape[0] / 12

433.5

In [20]:
X = X.to_numpy()
y = y.to_numpy()

In [21]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut, ShuffleSplit
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error

# Offline testing

## Shuffle Splits


In [22]:
# splitter = LeaveOneOut()
# splitter.get_n_splits(X)

# splitter = ShuffleSplit(test_size=100, n_splits=5, random_state=SEED)
splitter = ShuffleSplit(random_state=SEED)
splitter.get_n_splits(X)

10

In [23]:
data = []

for i, (train_index, test_index) in enumerate(splitter.split(X)):
    kernel = RBF(length_scale_bounds=(1e-5, 1e5))
    # kernel = SequenceKernel()
    model = KNeighborsRegressor()
    # model = GaussianProcessRegressor(
    #     kernel=kernel, n_restarts_optimizer=9, random_state=SEED
    # )

    model.fit(X[train_index], y[train_index])

    y_pred = model.predict(X[test_index])
    # y_pred, y_std = model.predict(X[test_index], return_std=True)
    y_true = y[test_index]
    # print(
    #     f"For index: \n{test_index} we predicted \n{y_pred} +- \n{y_std} but True value is: \n{y_true}"
    # )
    print(".")
    data.append([test_index, y_true, y_pred])

.
.
.
.
.
.
.
.
.
.


In [24]:
r2_scores = []
rmses = []
root_mean_squared_errors = []
for d in data:
    array = np.squeeze(d).T
    # print(array)
    results = pd.DataFrame(array, columns=["index", "y_true", "y_pred"])
    results["y_pred"] = results["y_pred"].clip(0, 1)
    r2_scores.append(r2_score(results["y_true"], results["y_pred"]))
    rmses.append(root_mean_squared_error(results["y_true"], results["y_pred"]))



r2_scores = np.array(r2_scores)
rmses = np.array(rmses)

print(f"r2_scores = {r2_scores}")
print(f"{r2_scores.mean() = :.2f}")
print("\n")
print(f"rmses = {rmses}")
print(f"{rmses.mean() = :.2f}")

r2_scores = [0.29777642 0.24710212 0.3160739  0.20855195 0.29614549 0.26564753
 0.34796699 0.34372989 0.27654605 0.29064565]
r2_scores.mean() = 0.29


rmses = [0.27840309 0.29947348 0.28508318 0.29504751 0.2833709  0.2879548
 0.27023735 0.28035435 0.28964066 0.28710182]
rmses.mean() = 0.29


In [25]:
results = pd.DataFrame(
    np.squeeze(data[0]).T, columns=["index", "y_true", "y_pred", "std"]
)
results["y_pred"] = results["y_pred"].clip(0, 1)
# results

ValueError: Shape of passed values is (521, 3), indices imply (521, 4)

In [26]:
r2_score(results["y_true"], results["y_pred"])

0.32112108424819574

In [27]:
root_mean_squared_error(results["y_true"], results["y_pred"])
mean_absolute_error(results["y_true"], results["y_pred"])

0.25064369599179526

In [28]:
results

Unnamed: 0,index,y_true,y_pred,std
0,933.0,0.420382,0.000000,0.363885
1,365.0,0.644080,1.000000,0.363885
2,537.0,0.001012,0.235281,0.224693
3,524.0,0.993222,1.000000,0.224693
4,1128.0,0.244153,0.312478,0.363885
...,...,...,...,...
117,552.0,0.074724,0.402227,0.224693
118,986.0,0.273171,0.734015,0.363885
119,97.0,0.648457,0.339592,0.852709
120,172.0,0.993334,1.000000,0.363885
