In [1]:
%pip install pandas
%pip install xgboost
%pip install scikit-learn



In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Load data
all_data_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data.csv")
#all_data_df = pd.read_csv("data.csv")
all_data_df.drop(columns=['Starting Grid Position', 'Qualifying Position', 'Qualifying Time'], inplace=True)


In [5]:
# Create a RaceID to group drivers per race
all_data_df['RaceID'] = all_data_df['Season'].astype(str) + "_" + all_data_df['Round'].astype(str)
all_data_df['Driver actual name'] = all_data_df['Driver Name']
all_data_df['Team actual name'] = all_data_df['Team Name']


In [6]:
# Encode categorical columns
categorical_cols = [
    'Driver Name', 'Team Name', 'Engine Manufacturer',
    'Weather Forecast', 'Tire Strategy Estimate',
    'Race Name', 'Track Name'
]

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    all_data_df[col] = le.fit_transform(all_data_df[col])
    label_encoders[col] = le

In [7]:
all_data_df['Track Temperature'] = all_data_df['Track Temperature'].str.extract('(\d+)').astype(float)
all_data_df['Air Temperature'] = all_data_df['Air Temperature'].str.extract('(\d+)').astype(float)
all_data_df['Wind Speed'] = all_data_df['Wind Speed'].str.extract('(\d+)').astype(float)
all_data_df['Avg Top Speed'] = all_data_df['Avg Top Speed'].str.extract('(\d+)').astype(float)
all_data_df['Rain Probability'] = all_data_df['Rain Probability'].str.extract('(\d+)').astype(float)



In [8]:
def time_to_seconds(time_str):
  """Converts time in mm:ss.ms format to seconds."""
  try:
    minutes, rest = time_str.split(":")
    seconds, milliseconds = rest.split(".")
    total_seconds = int(minutes) * 60 + int(seconds) + int(milliseconds) / 1000
    return total_seconds
  except ValueError:
    return None

In [15]:
df = all_data_df.iloc[:-12]
predict_df = all_data_df.iloc[-12:]

In [16]:

df = df[df['Finish Position'] != 'RET']
df = df[df['Finish Position'] != 'DNF']
df = df[df['Finish Position'] != 'DSQ']
df = df[df['Finish Position'] != 'DNS']
df = df[df['Finish Position'] != 'Ret']
df.dropna(subset=['Finish Position'], inplace=True)
df['Finish Position'] = df['Finish Position'].astype(int)

In [17]:
df['Driver Experience'] = df['Driver Experience'].astype(int)
predict_df['Driver Experience'] = predict_df['Driver Experience'].astype(int)


In [18]:
# Feature columns
feature_cols = [
    'Driver Experience', 'Driver Avg Finish This Track', 'Team Avg Finish This Track',
    'Driver Championship Position', 'Driver Points Season',
    'Team Points Season', 'Track Temperature', 'Air Temperature',
    'Rain Probability', 'Wind Speed', 'Avg Top Speed',
    'Driver Name', 'Team Name', 'Engine Manufacturer', 'Weather Forecast', 'Race Name', 'Track Name'
]

In [19]:
# Train-test split by races
unique_races = df['RaceID'].unique()
train_races, test_races = train_test_split(unique_races, test_size=0.3, random_state=40)

train_df = df[df['RaceID'].isin(train_races)]
test_df = df[df['RaceID'].isin(test_races)]

X_train = train_df[feature_cols]
y_train = train_df['Finish Position']
group_train = train_df.groupby('RaceID').size().tolist()

X_test = test_df[feature_cols]
y_test = test_df['Finish Position']
group_test = test_df.groupby('RaceID').size().tolist()

X_predict = predict_df[feature_cols]
y_predict = predict_df['Finish Position']
group_predict = predict_df.groupby('RaceID').size().tolist()

In [20]:
# Convert to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(group_train)

dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(group_test)

# Correct the group_predict for dpredict
group_predict = [predict_df.shape[0]]  # Use the total number of rows in predict_df

dpredict = xgb.DMatrix(X_predict)  # Use X_predict instead of X_test


In [21]:
# Train the XGBoost model
params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'rmse',
    'eta': 0.3,
    'max_depth': 10,
    'verbosity': 1
}

model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10
)

[0]	test-rmse:8.93626
[1]	test-rmse:8.93688
[2]	test-rmse:8.91798
[3]	test-rmse:8.95517
[4]	test-rmse:8.92726
[5]	test-rmse:8.92430
[6]	test-rmse:8.92032
[7]	test-rmse:8.90813
[8]	test-rmse:8.91386
[9]	test-rmse:8.90679
[10]	test-rmse:8.91692
[11]	test-rmse:8.92149
[12]	test-rmse:8.91768
[13]	test-rmse:8.92301
[14]	test-rmse:8.91097
[15]	test-rmse:8.91160
[16]	test-rmse:8.90815
[17]	test-rmse:8.89701
[18]	test-rmse:8.90146
[19]	test-rmse:8.90813
[20]	test-rmse:8.90922
[21]	test-rmse:8.91369
[22]	test-rmse:8.91914
[23]	test-rmse:8.92439
[24]	test-rmse:8.92034
[25]	test-rmse:8.92018
[26]	test-rmse:8.91655
[27]	test-rmse:8.91597


In [22]:
# Predict and rank
test_df['Predicted Score'] = model.predict(dtest)
test_df['Predicted Rank'] = test_df.groupby('RaceID')['Predicted Score'].rank(ascending=False)

# View predictions
print(test_df[['RaceID', 'Driver actual name', 'Team Name', 'Predicted Rank', 'Finish Position']].sort_values(by=['RaceID', 'Predicted Rank']))

      RaceID Driver actual name  Team Name  Predicted Rank  Finish Position
320  2022_14     George Russell          9             1.0                4
322  2022_16     George Russell          9             1.0                3
325  2022_19     George Russell          9             1.0                5
312   2022_6     George Russell          9             1.0                3
403  2023_12       Lance Stroll          2             1.0                9
..       ...                ...        ...             ...              ...
162   2025_5       Pierre Gasly          0             1.0               12
124   2025_5       Lance Stroll          2             2.0               14
108   2025_5    Fernando Alonso          2             3.0               12
92    2025_5     Lewis Hamilton          3             4.0               12
76    2025_5    Charles Leclerc          3             5.0                3

[136 rows x 5 columns]


In [23]:
predict_df['Predicted Score'] = model.predict(dpredict)
predict_df['Predicted Rank'] = predict_df.groupby('RaceID')['Predicted Score'].rank(ascending=False)

# View predictions
print(predict_df[['RaceID', 'Driver actual name', 'Team Name', 'Predicted Rank', 'Finish Position']].sort_values(by=['RaceID', 'Predicted Rank']))

     RaceID Driver actual name  Team Name  Predicted Rank Finish Position
535  2025_8  Gabriel Bortoleto         13             1.0             TBD
533  2025_8       Pierre Gasly          0             2.0             TBD
534  2025_8    Nico Hülkenberg          4             3.0             TBD
529  2025_8      Oscar Piastri          7             4.0             TBD
532  2025_8       Yuki Tsunoda         12             5.0             TBD
527  2025_8     George Russell          9             6.0             TBD
531  2025_8    Carlos Sainz Jr         18             7.0             TBD
530  2025_8         Alex Albon         18             8.0             TBD
525  2025_8     Lewis Hamilton          3             9.0             TBD
528  2025_8       Lando Norris          7            10.0             TBD
526  2025_8    Charles Leclerc          3            11.0             TBD
524  2025_8     Max Verstappen         12            12.0             TBD


**Conclusion**

For first Race:
*   Carlos Sainz Jr. predicted position is 5
*   Alexander Albon predicted position is 10
*   Constructor point of Atlassian Williams Racins is 10 + 1 = 11


For second Race:
*   Carlos Sainz Jr. predicted position is 7
*   Alexander Albon predicted position is 8
*   Constructor point of Atlassian Williams Racins is 6 + 4 = 10