# Additional features


## Dependencies


The dependencies used are as follows:


In [1]:
from sklearn.preprocessing import LabelEncoder, RobustScaler

import numpy as np
import pandas as pd

# Lift pandas' display limits so frames are shown with every column and row
# instead of being truncated.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings that can point at real
# problems (e.g. chained assignment) — consider narrowing the filter.
warnings.simplefilter("ignore")

## Preprocessing


Next, we add several additional features, such as:

- Average, best and worst qualifying times produced by each driver in each race
- Driver experience by year and by race
- Driver podiums by season
- Driver positions by season (number of times the driver has finished in each position as the season progresses)


In [2]:
# Load the base table that every feature below is derived from.
# NOTE(review): presumably written by an earlier notebook in this series — confirm.
df = pd.read_csv("../assets/data/processed/circuit.csv")

In [3]:
# Per-row summary of the three qualifying sessions.
#   qMin - smallest of q1/q2/q3
#   qMax - largest of q1/q2/q3
#   qAvg - mean of q1/q2/q3
# NOTE(review): assuming q1-q3 hold lap times (lower is faster), qMin is the
# driver's *best* time and qMax the *worst*; the helpers this replaces were
# labelled the other way round, although the columns they fed were correct.
#
# The reductions run column-wise over the whole frame rather than through a
# Python-level callback per row; missing sessions (NaN) are skipped, and a row
# with no session at all yields NaN.
q_cols = ["q1", "q2", "q3"]
q_times = df[q_cols]

df["qMin"] = q_times.min(axis=1)
df["qMax"] = q_times.max(axis=1)
df["qAvg"] = q_times.mean(axis=1)

In [4]:
# Driver experience features.
#
# driverAgeStarted - the smallest driverAgeAtRace recorded for the driver,
#                    broadcast to each of that driver's rows.
# driverExpYear    - difference between the age at this race and that
#                    starting age.
# driverExpRace    - number of rows for this driver up to and including the
#                    current one, in the frame's row order.
#
# `transform("min")` aggregates only the column that is needed and writes the
# result straight back, so re-running the cell simply overwrites the column
# instead of producing `_x`/`_y` duplicates the way a repeated merge would.
by_driver = df.groupby("driverRef")

df["driverAgeStarted"] = by_driver["driverAgeAtRace"].transform("min")
df["driverExpYear"] = df["driverAgeAtRace"] - df["driverAgeStarted"]

# cumcount numbers each driver's rows 0, 1, 2, ... in a single pass; the +1
# makes the tally include the current row, which is what the former
# row-by-row rescan of the frame (quadratic in the number of rows) produced.
df["driverExpRace"] = by_driver.cumcount() + 1

In [5]:
# driverPodiums: for every row, how many podium finishes (positionFinal 1, 2
# or 3) the same driver already has in the same raceYear on the rows *above*
# it. The current row is excluded, so the feature does not contain the result
# of the race it describes.
# NOTE(review): "earlier in the season" therefore relies on the rows being in
# chronological order within each season — confirm for the input file.
#
# The flag is read directly from the row's own positionFinal. Testing whether
# driverRef occurs inside a comma-joined string of podium names is a substring
# match (a short reference such as "hill" also matches "phil_hill") and fails
# with a TypeError when a race has no podium rows to merge in.
is_podium = df["positionFinal"].isin([1, 2, 3]).astype(int)
season_driver = [df["raceYear"], df["driverRef"]]

# Inclusive running total per (season, driver) minus the current row's flag
# gives the exclusive running total in one pass, without looping over every
# year/driver combination.
df["driverPodiums"] = is_podium.groupby(season_driver).cumsum() - is_podium

In [6]:
# driverPos2 ... driverPos18: for every row, how many times the same driver
# has already finished in that exact position in the same raceYear on the rows
# *above* it (the current row is excluded).
# NOTE(review): as with any running count, this relies on the rows being in
# chronological order within each season — confirm for the input file.
#
# Each flag is read directly from the row's own positionFinal. Matching
# driverRef against a comma-joined string of names is a substring test (so one
# reference can match inside another) and raises a TypeError for any race in
# which nobody finished in the position being counted.
positions = range(2, 19)
features = [f"driverPos{pos}" for pos in positions]
season_driver = [df["raceYear"], df["driverRef"]]

for pos, column in zip(positions, features):
    finished_here = df["positionFinal"].eq(pos).astype(int)
    # Inclusive running total per (season, driver) minus the current row's
    # flag = number of earlier finishes in this position.
    df[column] = finished_here.groupby(season_driver).cumsum() - finished_here

In [7]:
# Print each column's non-null count and dtype to check the engineered features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7235 entries, 0 to 7234
Data columns (total 70 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   carNumber                7235 non-null   float64
 1   positionGrid             7235 non-null   int64  
 2   positionFinal            7235 non-null   int64  
 3   pointsDriverEarned       7235 non-null   float64
 4   lapsCompleted            7235 non-null   int64  
 5   timeTakenInMillisec      3581 non-null   float64
 6   fastestLap               6914 non-null   float64
 7   fastestLapRank           7130 non-null   float64
 8   fastestLapTime           6914 non-null   float64
 9   maxSpeed                 6914 non-null   float64
 10  driverRef                7235 non-null   object 
 11  driverNumber             7235 non-null   float64
 12  driverNationality        7235 non-null   object 
 13  constructorRef           7235 non-null   object 
 14  constructorNationality  

Since the datatypes are already correct and there are no nulls in the new columns (they are derived attributes), we can continue with encoding and normalization.


## Encoding and normalization


Once preprocessed, we proceed to re-encode and re-normalize. In addition, we remove the data added in previous sections so that the effect of the new features can be compared in isolation.


In [8]:
# Columns excluded from the model matrix. Judging by their names these are the
# race-result fields plus the weather and circuit attributes; the notebook text
# says previously added data is removed for comparison purposes.
excluded_columns = [
    "positionFinal",
    "pointsDriverEarned",
    "lapsCompleted",
    "timeTakenInMillisec",
    "fastestLap",
    "fastestLapRank",
    "fastestLapTime",
    "maxSpeed",
    "driverStatus",
    "pointsConstructorEarned",
    "constructorPosition",
    "weather",
    "weatherWarm",
    "weatherCold",
    "weatherDry",
    "weatherWet",
    "weatherCloudy",
    "circuitType",
    "circuitDirection",
    "circuitLength",
    "circuitLaps",
    "circuitDist",
]
X = df.drop(columns=excluded_columns)

# Replace every object-typed column with integer labels. The single encoder is
# re-fitted on each column in turn, so afterwards it only remembers the last
# column it saw.
enc = LabelEncoder()
text_columns = [name for name in X.columns if X[name].dtype == "object"]
for name in text_columns:
    X[name] = enc.fit_transform(X[name])

# Scale all columns with RobustScaler and rebuild the frame, because
# fit_transform hands back a bare array without the index or column labels.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

Finally, we write both dataframes to disk for use in the following sections.


In [9]:
# Persist the enriched frame and its encoded/scaled counterpart.
# index=False keeps the row index out of the files.
df.to_csv("../assets/data/processed/additional.csv", index=False)
X.to_csv("../assets/data/processed/additional_X.csv", index=False)