In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import os

In [14]:
dir = os.getcwd()
project_root = os.path.abspath(os.path.join(dir, '..'))
data_root = os.path.join(project_root, 'data')
df = pd.read_csv(filepath_or_buffer=os.path.join(data_root, 'f1_data.csv'))

df.head()

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,TrackTemp,Rainfall,WindSpeed,RoundNumber
0,0 days 01:12:57.726000,NOR,4,0 days 00:01:57.099000,1.0,1.0,,,,0 days 00:00:20.913000,...,124,1.0,False,,False,False,19.2,False,3.9,1
1,0 days 01:12:57.726000,DOO,7,,1.0,1.0,,,,,...,124,,False,,True,False,19.2,False,3.9,1
2,0 days 01:12:57.726000,HAD,6,,1.0,1.0,,,,,...,124,,False,,True,False,19.2,False,3.9,1
3,0 days 01:12:57.726000,SAI,55,,1.0,1.0,,,,,...,124,,False,,True,False,19.2,False,3.9,1
4,0 days 01:13:00.002000,VER,1,0 days 00:01:59.392000,1.0,1.0,,,,0 days 00:00:20.705000,...,124,2.0,False,,False,False,19.2,False,3.9,1


In [15]:
df.columns

Index(['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint',
       'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
       'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest',
       'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime',
       'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason',
       'FastF1Generated', 'IsAccurate', 'TrackTemp', 'Rainfall', 'WindSpeed',
       'RoundNumber'],
      dtype='str')

In [16]:
cols_to_save = ['DriverNumber', 'LapTime', 'LapNumber', 'SpeedST', 'Compound', 'TyreLife', 'Team', 'TrackStatus', 'TrackTemp', 'Rainfall', 'WindSpeed', 'RoundNumber']
df = df[cols_to_save].copy()
df['LapTime'] = pd.to_timedelta(df['LapTime'])
df['LapTime'] = df['LapTime'].dt.total_seconds()
df['SpeedST'] = df.groupby('DriverNumber')['SpeedST'].transform(lambda x: x.ffill().bfill())

In [17]:
df.head()

Unnamed: 0,DriverNumber,LapTime,LapNumber,SpeedST,Compound,TyreLife,Team,TrackStatus,TrackTemp,Rainfall,WindSpeed,RoundNumber
0,4,117.099,1.0,227.0,INTERMEDIATE,1.0,McLaren,124,19.2,False,3.9,1
1,7,,1.0,318.0,INTERMEDIATE,1.0,Alpine,124,19.2,False,3.9,1
2,6,,1.0,319.0,INTERMEDIATE,1.0,Racing Bulls,124,19.2,False,3.9,1
3,55,,1.0,321.0,INTERMEDIATE,1.0,Williams,124,19.2,False,3.9,1
4,1,119.392,1.0,215.0,INTERMEDIATE,1.0,Red Bull Racing,124,19.2,False,3.9,1


In [18]:
df = df.dropna()

In [19]:
max_laps = df.groupby('RoundNumber')['LapNumber'].transform('max')
df['FuelLevel'] = max_laps - df['LapNumber']
df.head()

Unnamed: 0,DriverNumber,LapTime,LapNumber,SpeedST,Compound,TyreLife,Team,TrackStatus,TrackTemp,Rainfall,WindSpeed,RoundNumber,FuelLevel
0,4,117.099,1.0,227.0,INTERMEDIATE,1.0,McLaren,124,19.2,False,3.9,1,56.0
4,1,119.392,1.0,215.0,INTERMEDIATE,1.0,Red Bull Racing,124,19.2,False,3.9,1,56.0
5,81,120.807,1.0,226.0,INTERMEDIATE,1.0,McLaren,124,19.2,False,3.9,1,56.0
6,63,124.644,1.0,207.0,INTERMEDIATE,1.0,Mercedes,124,19.2,False,3.9,1,56.0
7,16,128.48,1.0,191.0,INTERMEDIATE,1.0,Ferrari,124,19.2,False,3.9,1,56.0


In [20]:
df.dtypes

DriverNumber      int64
LapTime         float64
LapNumber       float64
SpeedST         float64
Compound            str
TyreLife        float64
Team                str
TrackStatus       int64
TrackTemp       float64
Rainfall           bool
WindSpeed       float64
RoundNumber       int64
FuelLevel       float64
dtype: object

In [21]:
df['DriverNumber'] = df['DriverNumber'].astype('category')
df['Compound'].unique()

<StringArray>
['INTERMEDIATE', 'MEDIUM', 'HARD', 'SOFT']
Length: 4, dtype: str

In [22]:
# deleting all laps with yellow or red flags (significantly impacts time)
df = df[df['TrackStatus'] == 1]
df['TrackStatus'].unique()

array([1])

In [None]:
quali = pd.read_csv(filepath_or_buffer=os.path.join(data_root, 'f1_quali_data.csv'))

quali = pd.read_csv('./f1_quali_data.csv')
quali

FileNotFoundError: [Errno 2] No such file or directory: './f1_quali_data.csv'

In [None]:
quali['LapTime'] = pd.to_timedelta(quali['LapTime'])
quali['LapTime'] = quali['LapTime'].dt.total_seconds()
best_quali = quali.groupby('RoundNumber')['LapTime'].min().rename('QualiBest')
best_quali

In [None]:
df = df.merge(best_quali, on='RoundNumber', how='left')
df['Target'] = df['LapTime'] / df['QualiBest']

In [None]:
df = df.sort_values(by=['RoundNumber', 'LapNumber', 'DriverNumber'])
df

In [None]:
df = df.drop(columns='TrackStatus')
# first lap of the race is abnormally slow so we will get rid of it
df = df[df['LapNumber'] > 1].copy()

In [None]:
df["Target"].describe()
df.sort_values("Target", ascending=False).head(20)

In [None]:
num_cols = ["LapTime","Target","SpeedST","TyreLife","TrackTemp","WindSpeed","FuelLevel","QualiBest","LapNumber"]
print(df[num_cols].describe(percentiles=[.001,.01,.05,.5,.95,.99,.999]).T)

In [None]:
df_pre_shape = df.shape
df = df[df["Target"] <= 1.30]
df = df[df["SpeedST"] >= 240]
print(df_pre_shape, "->", df.shape)

Poczatek wyscigu -> wolniejsze okrązenia, pod koniec szybciej

In [None]:
def trend(df, xcol, ycol="Target", bins=20, title=None):
    b = pd.cut(df[xcol], bins=bins)
    g = df.groupby(b, observed=True)[ycol].agg(["mean","count"])
    x = b.cat.categories.mid

    plt.figure()
    plt.plot(x, g["mean"].values)
    plt.title(f"Mean {ycol} vs {xcol}")
    plt.xlabel(xcol)
    plt.ylabel(f"mean {ycol}")
    plt.show()

trend(df, "FuelLevel")
trend(df, "TrackTemp")
trend(df, "WindSpeed")  

In [None]:
cols = ["FuelLevel","LapNumber","SpeedST","TyreLife","TrackTemp","WindSpeed","Target"]
C = df[cols].corr(method="spearman", numeric_only=True)
print(C)

plt.figure(figsize=(7,5))
plt.imshow(C, interpolation="nearest")
plt.xticks(range(len(cols)), cols, rotation=45, ha="right")
plt.yticks(range(len(cols)), cols)
plt.colorbar()
plt.title("Korelacje miedzy cechami")
plt.tight_layout()
plt.show()

df.drop(columns= "LapNumber", inplace=True)

Korelacje globalnie:
- Fuelevel dodatnie: wiecej paliwa -> wolniej
- TyreLife globalnie słabe

In [None]:
def per_round(df, col):
    out = []
    for rnd, g in df.groupby("RoundNumber"):
        r = g[[col, "Target"]].corr(method="spearman").iloc[0,1]
        out.append((rnd, r))
    return pd.DataFrame(out, columns=["RoundNumber", "Spearman"]).dropna()

for c in ["FuelLevel","SpeedST","TrackTemp","WindSpeed"]:
    t = per_round(df, c)
    print(c, "median:", t["Spearman"].median(), "mean:", t["Spearman"].mean(), "n_rounds:", len(t))

Korelacje per round:
- FuelLevel i Tracktemp wysoko
- SpeedST globalnie wysoko, tutaj nisko (czyli zalezne od toru)

In [None]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge

y = df["Target"].values
X = df.drop(columns=["LapTime", "Target", "QualiBest", "RoundNumber"], errors="ignore")

groups = df["RoundNumber"].values
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

qb_test = df.iloc[test_idx]["QualiBest"].values
lt_test = df.iloc[test_idx]["LapTime"].values

In [None]:
# elementary baseline: always predict mean of target

mean_tr = y_train.mean()

pred_base_target = np.full_like(y_test, mean_tr)
print("Baseline MAE:", mean_absolute_error(y_test, pred_base_target))

pred_base_laptime = pred_base_target * qb_test
print("Baseline MAE (LapTime) [s]:", mean_absolute_error(lt_test, pred_base_laptime))

In [None]:
pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols),
])

ridge = Pipeline([
    ("pre", pre),
    ("model", Ridge(alpha=1.0, random_state=42))
])
ridge.fit(X_train, y_train)
pred_r = ridge.predict(X_test)
print("Ridge MAE:", mean_absolute_error(y_test, pred_r))

lt_pred = pred_r * qb_test
print("Ridge MAE(LapTime) [s]:", mean_absolute_error(lt_test, lt_pred))

GroupShuffleSplit, bo jesli zrobimy zwykły losowy podział po wierszach to w train i test będą te same rundy i wynik bedzie zawyzony.

Przyklad:

In [None]:
from sklearn.model_selection import train_test_split

X2 = df.drop(columns=["LapTime","Target","QualiBest","RoundNumber"], errors="ignore")
y2 = df["Target"].values

Xtr, Xte, ytr, yte = train_test_split(X2, y2, test_size=0.2, random_state=42)
ridge.fit(Xtr, ytr)
pred = ridge.predict(Xte)
print("Random split MAE:", mean_absolute_error(yte, pred))
print("Group split  MAE:", mean_absolute_error(y_test, pred_r))

In [None]:
output_path = os.path.join(data_root, "f1_data_cleaned.csv")
df.to_csv(output_path, index=False)