In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
import os

In [2]:
# Locate the data folder relative to the notebook's working directory: the
# parent of the current directory is treated as the project root.
# The working directory is held in `notebook_dir`; the previous name `dir`
# shadowed Python's builtin `dir()` for every later cell in the kernel.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
data_root = os.path.join(project_root, 'data')

# Telemetry aggregates plus the train / test lap tables.
tel = pd.read_csv(os.path.join(data_root, 'f1_telemetry_agg.csv'))
df_train = pd.read_csv(os.path.join(data_root, 'f1_train.csv'))
df_test = pd.read_csv(os.path.join(data_root, 'f1_test.csv'))

# Attach telemetry to each lap. A left join keeps laps that have no telemetry
# match; their telemetry columns are left as NaN.
# NOTE(review): this assumes `tel` has at most one row per key combination;
# duplicates would multiply rows. Consider validate='many_to_one' once confirmed.
merge_keys = ['RoundNumber', 'LapNumber', 'DriverNumber']
df_train = df_train.merge(tel, how='left', on=merge_keys)
df_test = df_test.merge(tel, how='left', on=merge_keys)

df_train.shape

(17127, 31)

In [3]:
# Fork the merged training frame into two variants for the comparison below:
#   * df_train_imputed - independent copy that still contains the rows with gaps
#   * df_train         - only the rows without any missing value
# NOTE(review): `df_train` is rebound here, so re-running this cell on its own
# would copy the already-filtered frame; re-run from the loading cell instead.
df_train_imputed = df_train.copy(deep=True)
df_train = df_train.dropna(axis=0, how='any')

In [4]:
# Columns passed to the imputer, grouped as: merge keys, lap-level features,
# telemetry aggregates.
impute_cols = [
    'RoundNumber', 'LapNumber', 'DriverNumber',
    'TyreLife', 'TrackTemp', 'FuelLevel', 'SpeedST',
    'AvgCorneringSpeed', 'AvgRPM', 'AvgThrottle', 'BrakePct', 'GearShifts', 'MaxSpeed',
]

# Iterative imputation with a Bayesian ridge regressor as the per-column
# estimator; the seed is fixed so the filled values are reproducible.
imputer_settings = dict(
    estimator=BayesianRidge(),
    max_iter=15,
    random_state=42,
    verbose=0,
)
imputer = IterativeImputer(**imputer_settings)

# Fit on the training rows only and write the filled values back in place of
# the original columns (fit_transform returns a plain array in column order).
data_to_impute = df_train_imputed.loc[:, impute_cols].copy()
imputed_matrix = imputer.fit_transform(data_to_impute)
df_train_imputed[impute_cols] = imputed_matrix

In [5]:
def eval_model(df_train, df_test, col_to_drop, model):
    """Fit ``model`` on the training frame and print its errors on the test frame.

    The model is trained to predict the ``Target`` column. Predictions are
    then multiplied by the test frame's ``QualiBest`` column and compared
    against ``LapTime`` (presumably Target is LapTime / QualiBest - confirm
    against the feature-engineering step).

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing the feature columns plus ``Target``; ``df_test``
        must also contain ``QualiBest`` and ``LapTime``.
    col_to_drop : list of str
        Columns removed from both frames before fitting (non-feature columns).
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's instance is modified.

    Returns
    -------
    tuple
        ``(df_test['LapTime'], predictions * df_test['QualiBest'])``.

    Prints the MAE on ``Target`` and the MAE / MAPE on ``LapTime``.
    """
    train_features = df_train.drop(columns=col_to_drop)
    # Force the test features into the training column layout; any column the
    # test frame lacks is created and filled with 0.
    test_features = df_test.drop(columns=col_to_drop).reindex(
        columns=train_features.columns, fill_value=0
    )
    train_target, test_target = df_train['Target'], df_test['Target']

    model.fit(train_features, train_target)
    target_pred = model.predict(test_features)
    target_mae = mean_absolute_error(test_target, target_pred)
    print(f'MAE on target: {target_mae}')

    # Rescale the predicted target by QualiBest to compare against LapTime.
    actual_lap_time = df_test['LapTime']
    lap_time_pred = target_pred * df_test['QualiBest']
    mae = mean_absolute_error(actual_lap_time, lap_time_pred)
    mape = mean_absolute_percentage_error(actual_lap_time, lap_time_pred)
    print(f'MAE: {mae}')
    print(f'MAPE: {mape}')
    return actual_lap_time, lap_time_pred

In [6]:
# Sanity check: number of missing values left in each column of the imputed
# training frame (every count should be 0 if imputation covered all gaps).
df_train_imputed.isna().sum()

DriverNumber         0
LapTime              0
LapNumber            0
SpeedST              0
TyreLife             0
TrackTemp            0
Rainfall             0
Position             0
RoundNumber          0
FuelLevel            0
QualiBest            0
Target               0
DriverPower          0
TeamPace             0
C_INTERMEDIATE       0
Length               0
Traction             0
Abrasion             0
TrackEvolution       0
TyreStress           0
Lateral              0
Downforce            0
CompoundRating       0
GapAhead             0
GapBehind            0
AvgCorneringSpeed    0
MaxSpeed             0
AvgRPM               0
GearShifts           0
AvgThrottle          0
BrakePct             0
dtype: int64

In [7]:
# Size of the training frame after rows with missing values were dropped;
# compare with the shape shown right after the merge to see how many rows were lost.
df_train.shape

(17104, 31)

In [8]:
# Columns excluded from the feature matrix: the merge keys, the target itself,
# and the two columns used to convert predictions back to lap time.
col_to_drop = ['LapTime', 'DriverNumber', 'RoundNumber', 'LapNumber', 'Target', 'QualiBest']

# One random-forest configuration shared by both runs so that the training
# data is the only thing that differs between them.
forest_params = {
    'n_estimators': 100,
    'min_samples_leaf': 25,
    'max_depth': 15,
    'n_jobs': -1,
    'max_features': 'sqrt',
    'random_state': 42,
}
model = RandomForestRegressor(**forest_params)

# Run 1: train on the imputed frame (returned values are not needed here).
print('--- Model with imputed data ---')
_, _ = eval_model(df_train_imputed, df_test, col_to_drop, model)

# Run 2: refit the same estimator object on the frame with incomplete rows
# removed; its actual / predicted lap times are kept for later use.
print('--- Model with deleted ---')
y_test, y_pred = eval_model(df_train, df_test, col_to_drop, model)

--- Model with imputed data ---
MAE on target: 0.010828160546820882
MAE: 0.8085907032209383
MAPE: 0.010014105207687514
--- Model with deleted ---
MAE on target: 0.010459369821280547
MAE: 0.7822369672753097
MAPE: 0.009662962975866113


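As we can see, training on the imputed data gives a slightly worse test error than simply dropping the incomplete rows (MAE 0.809 vs. 0.782; MAPE 1.00% vs. 0.97%). Imputation therefore only introduces noise, and since just 23 of the 17,127 training rows have missing values, we will delete the records without telemetry.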

In [9]:
# Stack the training and test frames row-wise and persist them as one CSV
# (row index omitted from the file).
# NOTE(review): the conclusion above says records without telemetry will be
# deleted, yet the *imputed* training frame is what gets exported here, and
# df_test is written as-is. Confirm whether df_train was intended instead.
full_df = pd.concat([df_train_imputed, df_test])
output_path = os.path.join(data_root, 'full_f1_data.csv')
full_df.to_csv(output_path, index=False)