In [None]:
%pip install pandas
%pip install xgboost
%pip install scikit-learn

In [34]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [143]:
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [35]:
# Load data
#all_data_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data.csv")
all_data_df = pd.read_csv("data.csv")
all_data_df.drop(columns=['Starting Grid Position', 'Qualifying Position', 'Qualifying Time'], inplace=True)


In [36]:
# Create a RaceID to group drivers per race
all_data_df['RaceID'] = all_data_df['Season'].astype(str) + "_" + all_data_df['Round'].astype(str)
all_data_df['Driver actual name'] = all_data_df['Driver Name']
all_data_df['Team actual name'] = all_data_df['Team Name']


In [37]:
# Encode categorical columns
categorical_cols = [
    'Driver Name', 'Team Name', 'Engine Manufacturer',
    'Weather Forecast', 'Tire Strategy Estimate',
    'Race Name', 'Track Name'
]

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    all_data_df[col] = le.fit_transform(all_data_df[col])
    label_encoders[col] = le

In [38]:
all_data_df['Track Temperature'] = all_data_df['Track Temperature'].str.extract('(\d+)').astype(float)
all_data_df['Air Temperature'] = all_data_df['Air Temperature'].str.extract('(\d+)').astype(float)
all_data_df['Wind Speed'] = all_data_df['Wind Speed'].str.extract('(\d+)').astype(float)
all_data_df['Avg Top Speed'] = all_data_df['Avg Top Speed'].str.extract('(\d+)').astype(float)
all_data_df['Rain Probability'] = all_data_df['Rain Probability'].str.extract('(\d+)').astype(float)



In [39]:
def time_to_seconds(time_str):
  """Converts time in mm:ss.ms format to seconds."""
  try:
    minutes, rest = time_str.split(":")
    seconds, milliseconds = rest.split(".")
    total_seconds = int(minutes) * 60 + int(seconds) + int(milliseconds) / 1000
    return total_seconds
  except ValueError:
    return None

In [40]:
df = all_data_df.iloc[:-16]
predict_df = all_data_df.iloc[-16:]

In [41]:

df = df[df['Finish Position'] != 'RET']
df = df[df['Finish Position'] != 'DNF']
df = df[df['Finish Position'] != 'DSQ']
df = df[df['Finish Position'] != 'DNS']
df = df[df['Finish Position'] != 'Ret']
df.dropna(subset=['Finish Position'], inplace=True)
df['Finish Position'] = df['Finish Position'].astype(int)

In [42]:
df['Driver Experience'] = df['Driver Experience'].astype(int)
predict_df['Driver Experience'] = predict_df['Driver Experience'].astype(int)


In [43]:
# Feature columns
feature_cols = [
    'Driver Experience', 'Driver Avg Finish This Track', 'Team Avg Finish This Track',
    'Driver Championship Position', 'Driver Points Season',
    'Team Points Season', 'Track Temperature', 'Air Temperature',
    'Rain Probability', 'Wind Speed', 'Avg Top Speed',
    'Driver Name', 'Team Name', 'Engine Manufacturer', 'Weather Forecast', 'Race Name', 'Track Name'
]

In [44]:
# Train-test split by races
unique_races = df['RaceID'].unique()
train_races, test_races = train_test_split(unique_races, test_size=0.3, random_state=40)

train_df = df[df['RaceID'].isin(train_races)]
test_df = df[df['RaceID'].isin(test_races)]

X_train = train_df[feature_cols]
y_train = train_df['Finish Position']
group_train = train_df.groupby('RaceID').size().tolist()

X_test = test_df[feature_cols]
y_test = test_df['Finish Position']
group_test = test_df.groupby('RaceID').size().tolist()

X_predict = predict_df[feature_cols]
y_predict = predict_df['Finish Position']
group_predict = predict_df.groupby('RaceID').size().tolist()

In [45]:
# Convert to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.set_group(group_train)

dtest = xgb.DMatrix(X_test, label=y_test)
dtest.set_group(group_test)

# Correct the group_predict for dpredict
group_predict = [predict_df.shape[0]]  # Use the total number of rows in predict_df

dpredict = xgb.DMatrix(X_predict)  # Use X_predict instead of X_test


In [46]:
# Train the XGBoost model
params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'rmse',
    'eta': 0.3,
    'max_depth': 10,
    'verbosity': 1
}

model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=[(dtest, 'test')],
    early_stopping_rounds=10
)

[0]	test-rmse:8.64545
[1]	test-rmse:8.63108
[2]	test-rmse:8.60025
[3]	test-rmse:8.57104
[4]	test-rmse:8.54405
[5]	test-rmse:8.54302
[6]	test-rmse:8.53824
[7]	test-rmse:8.52010
[8]	test-rmse:8.51126
[9]	test-rmse:8.51157
[10]	test-rmse:8.49594
[11]	test-rmse:8.49150
[12]	test-rmse:8.48025
[13]	test-rmse:8.47258
[14]	test-rmse:8.47210
[15]	test-rmse:8.47775
[16]	test-rmse:8.47652
[17]	test-rmse:8.47044
[18]	test-rmse:8.46622
[19]	test-rmse:8.46248
[20]	test-rmse:8.44826
[21]	test-rmse:8.43765
[22]	test-rmse:8.44197
[23]	test-rmse:8.44299
[24]	test-rmse:8.43801
[25]	test-rmse:8.42872
[26]	test-rmse:8.42545
[27]	test-rmse:8.41747
[28]	test-rmse:8.41996
[29]	test-rmse:8.41899
[30]	test-rmse:8.41477
[31]	test-rmse:8.41792
[32]	test-rmse:8.41402
[33]	test-rmse:8.40656
[34]	test-rmse:8.40522
[35]	test-rmse:8.40304
[36]	test-rmse:8.40094
[37]	test-rmse:8.39553
[38]	test-rmse:8.39408
[39]	test-rmse:8.39877
[40]	test-rmse:8.39892
[41]	test-rmse:8.39786
[42]	test-rmse:8.40526
[43]	test-rmse:8.4073

In [47]:
# Predict and rank
test_df['Predicted Score'] = model.predict(dtest)
test_df['Predicted Rank'] = test_df.groupby('RaceID')['Predicted Score'].rank(ascending=False)

# View predictions
print(test_df[['RaceID', 'Driver actual name', 'Team Name', 'Predicted Rank', 'Finish Position']].sort_values(by=['RaceID', 'Predicted Rank']))

      RaceID Driver actual name  Team Name  Predicted Rank  Finish Position
456  2022_13        Jack Doohan         17             1.0                1
319  2022_13     George Russell          9             2.0                3
321  2022_15     George Russell          9             1.0                2
324  2022_18     George Russell          9             1.0                8
309   2022_3     George Russell          9             1.0                3
..       ...                ...        ...             ...              ...
162   2025_5       Pierre Gasly          0             1.0               12
124   2025_5       Lance Stroll          2             2.0               14
108   2025_5    Fernando Alonso          2             3.0               12
92    2025_5     Lewis Hamilton          3             4.0               12
76    2025_5    Charles Leclerc          3             5.0                3

[144 rows x 5 columns]


In [48]:
predict_df['Predicted Score'] = model.predict(dpredict)
predict_df['Predicted Rank'] = predict_df.groupby('RaceID')['Predicted Score'].rank(ascending=False)

# View predictions
print(predict_df[['RaceID', 'Driver actual name', 'Team Name', 'Predicted Rank', 'Finish Position']].sort_values(by=['RaceID', 'Predicted Rank']))

     RaceID Driver actual name  Team Name  Predicted Rank Finish Position
523  2025_7  Gabriel Bortoleto          5             1.0             TBD
521  2025_7   Franco iolapinto          0             2.0             TBD
526  2025_7       Pierre Gasly          0             3.0             TBD
519  2025_7       Esteban Ocon          4             4.0             TBD
517  2025_7   Carlos Sainz Jr.         19             5.0             TBD
518  2025_7    Fernando Alonso          2             6.0             TBD
520  2025_7    Nico Hülkenberg         14             7.5             TBD
524  2025_7    Nico Hülkenberg         14             7.5             TBD
522  2025_7       Yuki Tsunoda         12             9.0             TBD
525  2025_7    Alexander Albon         19            10.0             TBD
514  2025_7     George Russell          9            11.0             TBD
512  2025_7      Oscar Piastri          7            12.0             TBD
516  2025_7     Lewis Hamilton        

**Conclusion**

*   Carlos Sainz Jr. predicted position is 5
*   Alexander Albon predicted position is 10
*   Constructor point of Atlassian Williams Racins is 10 + 1 = 11

