# Driver ratings data


## Dependencies


The dependencies used are as follows:


In [1]:
from sklearn.preprocessing import LabelEncoder, RobustScaler

import pandas as pd
import numpy as np

# Disable pandas' display truncation: show every column and every row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import warnings

# Suppress every warning category from this point on.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings;
# consider narrowing the filter to the specific warnings that are noisy.
warnings.simplefilter("ignore")

## Preprocessing


Next, we add driver ratings. These ratings are taken from EA's official F1 game, whose ratings only start in 2020, so no earlier historical values are available. Therefore, to handle the resulting nulls, we first rescale the game's driver experience rating to the range of the previously calculated experience based on the number of races, and then fill in the ratings of drivers without data using the average of the nearest neighbors by that experience.

If you want to consult the URLs used, you can check the script, as there are several of them.


In [2]:
# Input files: the processed dataset and the scraped EA driver ratings.
additional_csv = "../assets/data/processed/additional.csv"
driver_ratings_csv = "../assets/data/scraping/driver_ratings_ea.csv"

df = pd.read_csv(additional_csv)
driver_ratings_df = pd.read_csv(driver_ratings_csv)

In [3]:
# Rescale the ratings' experience score onto the race-count experience scale
# of df, i.e. onto the interval [1, max(df["driverExpRace"])].
rating_exp = driver_ratings_df["driverExp"]

# Series.min()/max() skip NaN. The builtin min()/max() iterate element by
# element and return an order-dependent result as soon as a NaN is present.
exp_min = rating_exp.min()
exp_range = rating_exp.max() - exp_min

# Min-max scale to [0, 1], then stretch to [1, max race experience].
# NOTE(review): if every rating shares the same experience, exp_range is 0 and
# the scaled values become NaN; this is unchanged from the previous behaviour.
target_range = df["driverExpRace"].max() - 1
normalized = (rating_exp - exp_min) / exp_range * target_range + 1

# Experience levels are used as whole numbers when matching neighbours later.
driver_ratings_df["driverExp"] = normalized.round()

In [4]:
# Copy every ratings row onto the matching rows of df.
# A df row matches when its raceYear equals the rating's year and its
# driverRef passes the `in` test against the rating's driver field
# (a substring match if that field is a string).
for season, name, exp, rac, awa, pac, ovr in driver_ratings_df.itertuples(index=False):
    same_season = df["raceYear"] == int(season)
    ref_in_name = df["driverRef"].apply(lambda ref: ref in name)
    mask = same_season & ref_in_name

    # Assigned one column at a time, in this order, so that a column missing
    # from df is created on first assignment.
    rating_values = {
        "driverExp": exp,
        "driverRac": rac,
        "driverAwa": awa,
        "driverPac": pac,
        "driverOvr": ovr,
    }
    for column, value in rating_values.items():
        df.loc[mask, column] = value

In [5]:
# Impute ratings for rows of df that have no rating: for every race-count
# experience level, average the ratings rows whose experience is closest to it.
n_features = 5  # rating values averaged per row: positions 2..6 of a ratings row
n_neighbors = 5  # number of closest-experience ratings rows to average

# Target columns in df, in the same order as positions 2..6 of a ratings row
# (exp, rac, awa, pac, ovr).
rating_columns = ["driverExp", "driverRac", "driverAwa", "driverPac", "driverOvr"]

# Convert once: the ratings table does not change between experience levels.
ratings_rows = driver_ratings_df.to_numpy()

# Maps an experience level to the rounded average ratings,
# ordered like rating_columns.
driver_exp = dict()

for e in range(1, df["driverExpRace"].max() + 1):
    # sorted() is stable, so ties keep the table's row order; position 2 of a
    # row is its (rescaled) experience.
    close_exp = sorted(ratings_rows, key=lambda x: np.abs(x[2] - e))[:n_neighbors]
    # Divide by the number of rows actually found, so a ratings table with
    # fewer than n_neighbors rows is averaged instead of raising IndexError.
    driver_exp[e] = [
        round(sum(row[i] for row in close_exp) / len(close_exp))
        for i in range(2, n_features + 2)
    ]

# Rows are considered unrated when their overall rating is missing. The mask is
# taken once, before any column is filled, and reused for all five columns.
m = df["driverOvr"].isnull()
p = "driverExpRace"
missing_exp = df.loc[m, p]

for k, column in enumerate(rating_columns):
    # fillna aligns on the index, so only rows that are both selected by m and
    # still null in this column receive the imputed value.
    df[column] = df[column].fillna(missing_exp.apply(lambda x, k=k: driver_exp[x][k]))

Once the mapping is done, let's check the datatypes.


In [6]:
# Print each column's dtype and non-null count after the ratings were mapped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7235 entries, 0 to 7234
Data columns (total 75 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   carNumber                7235 non-null   float64
 1   positionGrid             7235 non-null   int64  
 2   positionFinal            7235 non-null   int64  
 3   pointsDriverEarned       7235 non-null   float64
 4   lapsCompleted            7235 non-null   int64  
 5   timeTakenInMillisec      3581 non-null   float64
 6   fastestLap               6914 non-null   float64
 7   fastestLapRank           7130 non-null   float64
 8   fastestLapTime           6914 non-null   float64
 9   maxSpeed                 6914 non-null   float64
 10  driverRef                7235 non-null   object 
 11  driverNumber             7235 non-null   float64
 12  driverNationality        7235 non-null   object 
 13  constructorRef           7235 non-null   object 
 14  constructorNationality  

Now check if there are any nulls.


In [7]:
# Count the missing values in every column.
df.isna().sum()

carNumber                     0
positionGrid                  0
positionFinal                 0
pointsDriverEarned            0
lapsCompleted                 0
timeTakenInMillisec        3654
fastestLap                  321
fastestLapRank              105
fastestLapTime              321
maxSpeed                    321
driverRef                     0
driverNumber                  0
driverNationality             0
constructorRef                0
constructorNationality        0
raceYear                      0
raceRound                     0
grandPrix                     0
raceTime                      0
circuitRef                    0
circuitLocation               0
circuitCountry                0
circuitLat                    0
circuitLng                    0
circuitAlt                    0
driverStatus                  0
driverWins                    0
pointsConstructorEarned       0
constructorPosition           0
constructorWins               0
q1                            0
q2      

## Encoding and normalization


We proceed to re-encode and re-normalize. In addition, we will remove previously added data for comparison purposes.


In [8]:
# Columns excluded from the feature matrix. They are grouped by name prefix
# purely for readability; the order has no effect on the result.
result_columns = [
    "positionFinal",
    "pointsDriverEarned",
    "lapsCompleted",
    "timeTakenInMillisec",
    "fastestLap",
    "fastestLapRank",
    "fastestLapTime",
    "maxSpeed",
    "driverStatus",
    "pointsConstructorEarned",
    "constructorPosition",
]
weather_columns = [
    "weather",
    "weatherWarm",
    "weatherCold",
    "weatherDry",
    "weatherWet",
    "weatherCloudy",
]
circuit_columns = [
    "circuitType",
    "circuitDirection",
    "circuitLength",
    "circuitLaps",
    "circuitDist",
]
qualifying_columns = [
    "qMin",
    "qMax",
    "qAvg",
]
driver_history_columns = [
    "driverAgeStarted",
    "driverExpYear",
    "driverExpRace",
    "driverPodiums",
    "driverPos2",
    "driverPos3",
    "driverPos4",
    "driverPos5",
    "driverPos6",
    "driverPos7",
    "driverPos8",
    "driverPos9",
    "driverPos10",
    "driverPos11",
    "driverPos12",
    "driverPos13",
    "driverPos14",
    "driverPos15",
    "driverPos16",
    "driverPos17",
    "driverPos18",
]

X = df.drop(
    columns=(
        result_columns
        + weather_columns
        + circuit_columns
        + qualifying_columns
        + driver_history_columns
    )
)

# Integer-encode every object-dtype column. A single encoder is refitted per
# column, so after the loop `enc` holds the classes of the last encoded column.
enc = LabelEncoder()
object_columns = [name for name in X.columns if X[name].dtype == "object"]
for column in object_columns:
    X[column] = enc.fit_transform(X[column])

# Scale all features, then rebuild a DataFrame with the original index and
# column labels (fit_transform returns a bare array).
scaler = RobustScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

Finally, we write both dataframes to disk for the following sections.


In [9]:
# Persist the enriched frame and its encoded/scaled feature matrix,
# both without the index column.
for frame, path in (
    (df, "../assets/data/processed/driver_ratings.csv"),
    (X, "../assets/data/processed/driver_ratings_X.csv"),
):
    frame.to_csv(path, index=False)