In [1]:
import pandas as pd
from pathlib import Path
import json
from pprint import pprint
import math
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np

# Benchmark run to analyse. The previous cell assigned DATA_DIR twice in a row,
# so only the second value was ever used; the unused one is kept as a note:
# DATA_DIR = Path("data/benchmarking/nogif_1000")
DATA_DIR = Path("data/benchmarking/nocars")

# Single seed shared by NumPy's global RNG and every `random_state=` below.
SEED = 2137
np.random.seed(SEED)

In [2]:
def get_in_road_percentage(steps_df: pd.DataFrame) -> float:
    """Return the fraction (0.0-1.0) of steps in which the vehicle was on the road.

    Args:
        steps_df: One row per simulation step; must contain an ``out_of_road``
            column of boolean flags.

    Returns:
        Share of rows whose ``out_of_road`` flag is false. This is ``0.0`` when
        the vehicle was off the road on every step (a ``value_counts()`` lookup
        of ``False`` would raise ``KeyError`` there) and NaN for an empty frame.
    """
    # Averaging the negated flag is the same ratio as the normalized count of
    # False values, but it does not depend on False actually occurring.
    on_road = ~steps_df["out_of_road"].astype(bool)
    return float(on_road.mean())

In [3]:
def get_n_sidewalk_crashes(steps_df: pd.DataFrame) -> int:
    """Count sidewalk-crash episodes so that one long crash is counted once.

    An episode is a run of consecutive steps whose ``crash_sidewalk`` flag is
    set. Episodes are counted by their first step (a rising edge), which also
    handles an episode that is already active on the very first step or that
    is still active on the last one. Halving the number of flag changes
    miscounts those cases (``[True, False, True]`` would give 1 instead of 2).

    Args:
        steps_df: One row per simulation step; must contain a
            ``crash_sidewalk`` column of boolean flags.

    Returns:
        Number of distinct crash episodes; ``0`` when the flag is never set.
    """
    crashed = steps_df["crash_sidewalk"].astype(bool)

    # State one step earlier; the step before the first one counts as "no crash".
    crashed_before = crashed.shift(1, fill_value=False)

    # An episode starts wherever the flag is set now but was not set before.
    episode_starts = crashed & ~crashed_before
    return int(episode_starts.sum())

In [4]:
def process_steps(steps_infos: list) -> dict:
    """Condense the per-step records of one run into run-level metrics.

    Args:
        steps_infos: Sequence of per-step records, in chronological order.
            Each record must provide ``arrive_dest``, ``max_step``,
            ``route_completion``, ``env_seed``, ``out_of_road`` and
            ``crash_sidewalk``.

    Returns:
        Dict with the termination flags, route completion and seed taken from
        the final step, plus the on-road share and the sidewalk-crash count
        computed over all steps.
    """
    steps_df = pd.DataFrame(steps_infos)

    # Termination state and cumulative values are read from the final step.
    final_step = steps_df.iloc[-1]

    summary = {}
    summary["termination.arrive_dest"] = final_step["arrive_dest"]
    summary["termination.timeout"] = final_step["max_step"]
    summary["route_completion"] = final_step["route_completion"]
    summary["seed"] = final_step["env_seed"]

    # Whole-run aggregates.
    summary["in_road_percentage"] = get_in_road_percentage(steps_df)
    summary["n_sidewalk_crashes"] = get_n_sidewalk_crashes(steps_df)
    return summary

In [5]:
def get_scenarios_df(dir: Path):
    """Load every ``<dir>/<run_dir>/<file>.json`` result into one DataFrame.

    The simulation settings are recovered from the name of the directory that
    contains each file. That name must split on ``_`` into exactly four parts,
    of which the second is the decision repeat (int) and the fourth is dt
    (float); any other shape raises ``ValueError``.

    Args:
        dir: Root directory holding one sub-directory per settings combination.

    Returns:
        DataFrame with one row per JSON file: the file's own keys (minus
        ``steps_infos``), ``decision_repeat``, ``dt`` and the metrics produced
        by ``process_steps``.
    """
    scenarios = []
    for file_path in tqdm(list(dir.glob("*/*.json"))):
        with open(file_path, "r", encoding="utf-8") as f:
            scenario_data = json.load(f)

        # Use pathlib for the parent directory name: splitting str(path) on "/"
        # breaks on Windows separators. A separate local name also avoids
        # rebinding the `dir` argument in the middle of the loop.
        run_dir_name = file_path.parent.name
        _, dr, _, dt = run_dir_name.split("_")
        scenario_data["decision_repeat"] = int(dr)
        scenario_data["dt"] = float(dt)

        # Replace the bulky per-step log with its run-level summary.
        steps_infos = scenario_data.pop("steps_infos")
        scenario_data.update(process_steps(steps_infos))
        scenarios.append(scenario_data)
    return pd.DataFrame(scenarios)

In [6]:
# Load every run found under DATA_DIR into a single row-per-run DataFrame.
df = get_scenarios_df(DATA_DIR)

  0%|          | 0/10032 [00:00<?, ?it/s]

100%|██████████| 10032/10032 [00:30<00:00, 328.68it/s]


In [7]:
# Key each run by (dt, decision_repeat, seed). verify_integrity=True makes
# pandas raise if two runs share the same key. Sorting the index afterwards
# gives a stable row order.
df = df.set_index(["dt", "decision_repeat", "seed"], verify_integrity=True).sort_index()

In [8]:
# Shorthand for slicing the (dt, decision_repeat, seed) MultiIndex via .loc.
idx = pd.IndexSlice

In [9]:
# MAX_SCENARIOS = 100
# df = df.loc[idx[:, :, :MAX_SCENARIOS]]

In [10]:
# calculate driving score
# ! Problem changing the values here in the analysis can change % error
def calculate_driving_score(df, sidewalk_crash_penalty=0.65, vehicle_crash_penalty=0.60):
    """Add a ``driving_score`` column to ``df`` and return the same frame.

    The score is ``route_completion * in_road_percentage`` multiplied by one
    penalty factor per crash: ``sidewalk_crash_penalty`` for each sidewalk
    crash and ``vehicle_crash_penalty`` for each crashed vehicle.

    Args:
        df: Frame with ``route_completion``, ``in_road_percentage``,
            ``n_sidewalk_crashes`` and ``n_crashed_vehicles`` columns.
            It is modified in place.
        sidewalk_crash_penalty: Multiplier applied once per sidewalk crash.
        vehicle_crash_penalty: Multiplier applied once per crashed vehicle.

    Returns:
        The input ``df`` (same object) with ``driving_score`` added or replaced.
    """
    df["driving_score"] = (
        df["route_completion"]
        * df["in_road_percentage"]
        * sidewalk_crash_penalty ** df["n_sidewalk_crashes"]
        * vehicle_crash_penalty ** df["n_crashed_vehicles"]
    )
    return df


# Adds the "driving_score" column; the function mutates df and returns it.
df = calculate_driving_score(df)

In [11]:
# Summary statistics of the driving score over all loaded runs.
df["driving_score"].describe()

count    10032.000000
mean         0.688012
std          0.387943
min          0.000180
25%          0.271996
50%          0.990132
75%          0.993000
max          1.003083
Name: driving_score, dtype: float64

# Getting features


In [12]:
def drop_boring_columns(df):
    """Return ``df`` without the columns that hold a single distinct value.

    Distinct values are taken from ``Series.unique()``, so NaN counts as a
    value of its own. One ``Dropped: <column>`` line is printed per removed
    column. The input frame itself is left untouched.
    """
    constant_columns = [
        name for name, column in df.items() if len(column.unique()) == 1
    ]

    for name in constant_columns:
        print(f"Dropped: {name}")

    # Nothing to remove: hand back the very same object.
    if not constant_columns:
        return df
    return df.drop(constant_columns, axis=1)

In [13]:
# Columns holding the scenario definition.
scenario_columns = ["def.map_seq", "def.bv_data"]
scenario_df = df[scenario_columns]

# First pass: spread each run's map sequence over positional columns
# (presumably each "def.map_seq" value is a list of per-block dicts — TODO confirm).
map_df = pd.json_normalize(scenario_df["def.map_seq"])
# Second pass: flatten every positional column separately and prefix its fields
# with the position, giving names such as "1.radius" or "5.length".
normalized_map_df = pd.concat(
    [pd.json_normalize(map_df[col]).add_prefix(f"{col}.") for col in map_df], axis=1
)

# Columns with a single distinct value cannot help to tell runs apart.
normalized_map_df = drop_boring_columns(normalized_map_df)
normalized_map_df

Dropped: 0.id
Dropped: 0.pre_block_socket_index
Dropped: 1.pre_block_socket_index


Unnamed: 0,1.radius,1.length,1.dir,1.angle,1.id,1.change_lane_num,1.decrease_increase,1.t_type,1.exit_radius,1.inner_radius,...,5.length,5.dir,5.angle,5.id,5.pre_block_socket_index,5.change_lane_num,5.decrease_increase,5.t_type,5.exit_radius,5.inner_radius
0,56.134972,75.582825,1.0,125.061348,C,,,,,,...,58.112240,0.0,85.752541,C,4X-socket0,,,,,
1,10.000000,,,,T,0.0,0.0,1.0,,,...,65.340332,,,S,4T-socket1,,,,,
2,10.000000,,,,T,0.0,0.0,0.0,,,...,26.290545,,,r,4C-socket0,,,,,
3,10.000000,,,,X,0.0,1.0,,,,...,,,,T,4C-socket0,0.0,1.0,1.0,,
4,10.000000,,,,T,0.0,1.0,2.0,,,...,28.557396,,,R,4T-socket2,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10027,,78.279800,,,S,,,,,,...,53.760792,0.0,75.961784,C,4X-socket0,,,,,
10028,,38.739941,,,R,,,,,,...,,,,T,4C-socket0,0.0,1.0,2.0,,
10029,46.643707,64.735664,1.0,100.655251,C,,,,,,...,35.488621,,,R,4C-socket0,,,,,
10030,10.000000,,,,T,0.0,0.0,0.0,,,...,,,,T,4X-socket2,0.0,0.0,0.0,,


In [14]:
# Rank the flattened map columns by their number of non-null entries.
n_occurences = normalized_map_df.count()
best_featuers = n_occurences.sort_values(ascending=False)

# Keep the names of the N_FEATURES most frequently populated columns.
N_FEATURES = 20
best_featuers = best_featuers.iloc[:N_FEATURES].index.to_list()
# NOTE(review): a bare expression in the middle of a cell displays nothing.
best_featuers

# S: scenario (map) features that go into the model input.
S = normalized_map_df[best_featuers]
best_featuers

['3.pre_block_socket_index',
 '4.id',
 '4.pre_block_socket_index',
 '1.id',
 '5.pre_block_socket_index',
 '5.id',
 '2.pre_block_socket_index',
 '2.id',
 '3.id',
 '5.length',
 '1.length',
 '3.length',
 '2.radius',
 '4.radius',
 '3.radius',
 '2.length',
 '5.radius',
 '1.radius',
 '4.length',
 '3.angle']

In [15]:
# The two simulation settings as ordinary columns on a fresh 0..n-1 index.
fid = df.reset_index()[["dt", "decision_repeat"]]
# fid

## Input factorisation


In [16]:
# Model input: simulation settings (fid) side by side with the map features (S).
X = pd.concat([fid, S], axis=1)
X


# Replace every object (string) column by integer category codes; numeric
# columns pass through unchanged. pd.factorize encodes missing values as -1.
X = X.apply(lambda c: pd.factorize(c)[0] if c.dtype == object else c, axis=0)

In [17]:
# Regression target: the driving score, on the same 0..n-1 index as X.
y = df.reset_index()["driving_score"]
y

0        0.992768
1        0.989633
2        0.993424
3        0.989005
4        0.991921
           ...   
10027    0.321407
10028    0.405315
10029    0.644866
10030    0.993334
10031    0.228466
Name: driving_score, Length: 10032, dtype: float64

# Fitting GPR


### Handling NaNs


In [18]:
# Mark every remaining missing value with the sentinel -1.
X = X.fillna(-1)
X.shape, y.shape

((10032, 22), (10032,))

In [19]:
# Rows per group when the data is split into 12 equal parts
# (presumably 12 = number of (dt, decision_repeat) combinations — TODO confirm).
X.shape[0] / 12

836.0

In [20]:
# Switch to plain NumPy arrays so rows can be selected with index arrays.
# NOTE: this cell is not re-runnable; ndarray has no .to_numpy().
X = X.to_numpy()
y = y.to_numpy()

In [21]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut, ShuffleSplit
from sklearn.metrics import r2_score

## Shuffle Splits


In [22]:
import numpy as np

from sklearn.base import clone
from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
from sklearn.gaussian_process.kernels import GenericKernelMixin, Hyperparameter, Kernel


class SequenceKernel(GenericKernelMixin, Kernel):
    """Small convolutional kernel for sequences of differing lengths.

    Every pair of elements taken from the two sequences contributes ``1.0``
    when the elements are equal and ``baseline_similarity`` otherwise; the
    kernel value is the sum over all such pairs.
    """

    def __init__(self, baseline_similarity=0.5, baseline_similarity_bounds=(1e-5, 1)):
        self.baseline_similarity = baseline_similarity
        self.baseline_similarity_bounds = baseline_similarity_bounds

    @property
    def hyperparameter_baseline_similarity(self):
        """Describe ``baseline_similarity`` as a numeric hyperparameter."""
        bounds = self.baseline_similarity_bounds
        return Hyperparameter("baseline_similarity", "numeric", bounds)

    def _f(self, s1, s2):
        """Kernel value for one pair of sequences."""
        total = 0
        for c1 in s1:
            for c2 in s2:
                total += 1.0 if c1 == c2 else self.baseline_similarity
        return total

    def _g(self, s1, s2):
        """Kernel derivative for one pair of sequences (count of unequal pairs)."""
        total = 0
        for c1 in s1:
            for c2 in s2:
                total += 0.0 if c1 == c2 else 1.0
        return total

    def __call__(self, X, Y=None, eval_gradient=False):
        """Return the kernel matrix of X against Y (X itself when Y is None).

        With ``eval_gradient=True`` a ``(matrix, gradient)`` pair is returned,
        where the gradient carries one trailing axis of length 1.
        """
        other = X if Y is None else Y
        gram = np.array([[self._f(x, y) for y in other] for x in X])
        if not eval_gradient:
            return gram

        gradient = np.array([[[self._g(x, y)] for y in other] for x in X])
        return gram, gradient

    def diag(self, X):
        """Kernel value of every sequence with itself."""
        self_similarity = [self._f(x, x) for x in X]
        return np.array(self_similarity)

    def is_stationary(self):
        """This kernel is reported as non-stationary."""
        return False

    def clone_with_theta(self, theta):
        """Return a clone of this kernel whose ``theta`` is set to the given value."""
        duplicate = clone(self)
        duplicate.theta = theta
        return duplicate


# NOTE(review): `kernel` is rebound to an RBF kernel inside the training loop
# further down, so this SequenceKernel instance is not the one that gets fitted.
kernel = SequenceKernel()

In [23]:
# Alternatives tried earlier. The ShuffleSplit(test_size=100, ...) variant used
# to be assigned here and was overwritten on the very next line, so it never
# took effect; it is kept as a comment instead of dead code.
# splitter = LeaveOneOut()
# splitter.get_n_splits(X)
# splitter = ShuffleSplit(test_size=100, n_splits=5, random_state=SEED)

# ShuffleSplit defaults: 10 random splits, each holding out 10% of the rows.
splitter = ShuffleSplit(random_state=SEED)
splitter.get_n_splits(X)

10

In [24]:
# One entry per split: [test_index, y_true, y_pred, y_std].
data = []

for i, (train_index, test_index) in enumerate(splitter.split(X)):
    # A fresh RBF kernel and regressor for every split, so no fitted state
    # carries over from one split to the next.
    kernel = RBF(length_scale_bounds=(1e-5, 1e5))
    # kernel = SequenceKernel()
    gaussian_process = GaussianProcessRegressor(
        kernel=kernel, n_restarts_optimizer=9, random_state=SEED
    )

    gaussian_process.fit(X[train_index], y[train_index])

    # Predictive mean and standard deviation for the held-out rows.
    y_pred, y_std = gaussian_process.predict(X[test_index], return_std=True)
    y_true = y[test_index]
    # print(
    #     f"For index: \n{test_index} we predicted \n{y_pred} +- \n{y_std} but True value is: \n{y_true}"
    # )
    # Progress marker: one dot per finished split.
    print(".")
    data.append([test_index, y_true, y_pred, y_std])

In [25]:
# R² of the clipped predictions, one value per split.
r2_scores = []

for d in data:
    # d is [test_index, y_true, y_pred, y_std]; transposing yields one row
    # per held-out sample with those four values as columns.
    array = np.squeeze(d).T
    # print(array)
    results = pd.DataFrame(array, columns=["index", "y_true", "y_pred", "std"])
    # Clamp predictions to [0, 1] before scoring.
    results["y_pred"] = results["y_pred"].clip(0, 1)
    r2_scores.append(r2_score(results["y_true"], results["y_pred"]))


r2_scores = np.array(r2_scores)
print(f"r2_scores = {r2_scores}")
print(f"{r2_scores.mean() = :.2f}")

r2_scores = [0.32112108 0.39518821 0.27563    0.09064001 0.07562609 0.02367214
 0.22799263 0.02897976 0.30235733 0.28358121]
r2_scores.mean() = 0.20


In [26]:
# Per-sample results of the first split only (data[0]), with predictions
# clamped to [0, 1] in the same way as in the R² loop.
results = pd.DataFrame(
    np.squeeze(data[0]).T, columns=["index", "y_true", "y_pred", "std"]
)
results["y_pred"] = results["y_pred"].clip(0, 1)
# results

In [27]:
# R² of the first split.
r2_score(results["y_true"], results["y_pred"])

0.32112108424819574

In [28]:
# NOTE(review): despite its name, `rmse` holds the per-sample ABSOLUTE errors,
# so the value displayed below is the mean absolute error (MAE), not the RMSE.
# An RMSE would be the square root of the mean squared difference.
rmse = (results["y_true"] - results["y_pred"]).abs()
rmse.mean()

0.25064369599179526

In [29]:
# Show the per-sample table of the first split.
results

Unnamed: 0,index,y_true,y_pred,std
0,933.0,0.420382,0.000000,0.363885
1,365.0,0.644080,1.000000,0.363885
2,537.0,0.001012,0.235281,0.224693
3,524.0,0.993222,1.000000,0.224693
4,1128.0,0.244153,0.312478,0.363885
...,...,...,...,...
117,552.0,0.074724,0.402227,0.224693
118,986.0,0.273171,0.734015,0.363885
119,97.0,0.648457,0.339592,0.852709
120,172.0,0.993334,1.000000,0.363885
