In [1]:
# Team's nickname: Pretty Girls
#1 Thi Minh Ngoc Dao - 131112
#2 Joanna Marie Corpuz - 131120
#3 LI XINYI - 131082

In [2]:
# Data source: "https://github.com/toUpperCase78/formula1-datasets/tree/master"
# Data of formula1_season_raceResults from 2021 to 2024

# Predicted results of the race:
# Predicted top-5 finishers of the Formula 1 race that takes place on May 19:
     #1 Max Verstappen
     #2 Lewis Hamilton
     #3 Charles Leclerc
     #4 Sergio Perez
     #5 Carlos Sainz

# The model used in this approach is a Random Forest Classifier.
# Variables:
#1 avg_start_pos: The driver's mean Starting Grid over all of their rows in the combined 2021-2024 data (after rows with missing values are dropped).
#2 avg_finish_pos: The driver's mean finishing Position over all of their rows in the combined 2021-2024 data (after rows with missing values are dropped).
#3 team_performance: A performance metric for the driver's team: the rolling mean of Points over the team's 5 most recent rows, the current row included.


In [3]:
# Import Necessary Libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [4]:
## Dataset preprocessing

In [5]:
def load_data(data_dir=r'/Users/macbook/Downloads', seasons=(2021, 2022, 2023, 2024)):
    """Read the per-season race-result CSV files and stack them into one DataFrame.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Folder that holds the ``formula1_<year>season_raceResults.csv`` files.
        The default is the folder the notebook originally read from, so a bare
        ``load_data()`` call is unchanged; pass another folder to run the
        notebook on a different machine.
    seasons : iterable of int, optional
        Season years to load, in the order they are stacked.

    Returns
    -------
    pandas.DataFrame
        The rows of every requested season, concatenated in ``seasons`` order.
        Each file keeps its own row labels, so index values repeat across seasons.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when a season file is not in ``data_dir``.
    ValueError
        Propagated from ``pd.concat`` when ``seasons`` is empty.
    """
    data_dir = Path(data_dir)
    season_frames = [
        pd.read_csv(data_dir / f'formula1_{year}season_raceResults.csv')
        for year in seasons
    ]
    return pd.concat(season_frames)

# Combine the season files into one frame and preview its first five rows
df = load_data()
df.head(5)

Unnamed: 0,Track,Position,No,Driver,Team,Starting Grid,Laps,Time/Retired,Points,+1 Pt,Fastest Lap,Set Fastest Lap,Fastest Lap Time
0,Bahrain,1,44,Lewis Hamilton,Mercedes,2.0,56,1:32:03.897,25.0,No,1:34.015,,
1,Bahrain,2,33,Max Verstappen,Red Bull Racing Honda,1.0,56,+0.745,18.0,No,1:33.228,,
2,Bahrain,3,77,Valtteri Bottas,Mercedes,3.0,56,+37.383,16.0,Yes,1:32.090,,
3,Bahrain,4,4,Lando Norris,McLaren Mercedes,7.0,56,+46.466,12.0,No,1:34.396,,
4,Bahrain,5,11,Sergio Perez,Red Bull Racing Honda,11.0,56,+52.047,10.0,No,1:33.970,,


In [6]:
# Summary statistics restricted to the numeric columns
numeric_columns = df.select_dtypes(include=[np.number])
numeric_columns.describe()

Unnamed: 0,No,Starting Grid,Laps,Points
count,1439.0,1438.0,1439.0,1439.0
mean,27.350938,10.488873,53.977067,5.062196
std,23.906252,5.765984,17.610613,7.224167
min,1.0,1.0,0.0,0.0
25%,10.0,5.25,51.0,0.0
50%,20.0,10.0,57.0,0.5
75%,44.0,15.0,66.0,9.0
max,99.0,20.0,78.0,26.0


In [7]:
# Convert 'Position' to a numeric column; entries that cannot be parsed as a
# number become NaN because of errors='coerce'
position_numeric = pd.to_numeric(df['Position'], errors='coerce')
df['Position'] = position_numeric
df.shape

(1439, 13)

In [8]:
# Summary (count / unique / top / freq) of the object-dtype columns
df.describe(include=['object'])

Unnamed: 0,Track,Driver,Team,Time/Retired,+1 Pt,Fastest Lap,Set Fastest Lap,Fastest Lap Time
count,1439,1439,1439,1439,880,829,559,543
unique,29,30,19,878,2,815,2,536
top,Bahrain,Lewis Hamilton,Mercedes,+1 lap,No,1:31.488,No,1:16.666
freq,80,72,144,304,842,2,531,2


In [9]:
# Remove the bonus-point and fastest-lap columns, none of which is used as a feature
bonus_and_lap_columns = ['+1 Pt', 'Set Fastest Lap', 'Fastest Lap Time', 'Fastest Lap']
df.drop(columns=bonus_and_lap_columns, inplace=True)

In [10]:
# Count the missing values in every column
df.isna().sum()

Track              0
Position         177
No                 0
Driver             0
Team               0
Starting Grid      1
Laps               0
Time/Retired       0
Points             0
dtype: int64

In [11]:
# Discard every row that has at least one missing value, then report the new size
df.dropna(axis=0, how='any', inplace=True)
df.shape

(1261, 9)

In [12]:
# Per-column flag: True if the column still contains a missing value
df.isnull().any()

Track            False
Position         False
No               False
Driver           False
Team             False
Starting Grid    False
Laps             False
Time/Retired     False
Points           False
dtype: bool

In [13]:
# Feature engineering: per-driver averages, a rolling team metric and the binary target
df = df.copy()  # independent copy, so the column assignments below cannot raise SettingWithCopyWarning

# NOTE(review): both driver means are taken over every row of the combined data,
# i.e. they include the row's own race and all later races. Because the target is
# derived from 'Position', avg_finish_pos carries information about the label.
# Confirm this is intended, or restrict the averages to earlier races.
by_driver = df.groupby('Driver')
driver_mean_grid = by_driver['Starting Grid'].transform('mean')
driver_mean_finish = by_driver['Position'].transform('mean')
df['avg_start_pos'] = driver_mean_grid
df['avg_finish_pos'] = driver_mean_finish


def _recent_points_mean(team_points):
    """Rolling mean over the latest 5 rows of one team (fewer rows while the window fills)."""
    return team_points.rolling(window=5, min_periods=1).mean()


# NOTE(review): the rolling window includes the current row's own Points.
df['team_performance'] = df.groupby('Team')['Points'].transform(_recent_points_mean)

# Binary target: 1 when the finishing position is 5 or better, otherwise 0
df['finish_in_top_5'] = df['Position'].le(5).astype(int)

# Model inputs and target; Driver, Team and Track travel along as labels
features = ['avg_start_pos', 'avg_finish_pos', 'team_performance','Driver','Team','Track']
target = 'finish_in_top_5'

X = df[features]
y = df[target]

df

Unnamed: 0,Track,Position,No,Driver,Team,Starting Grid,Laps,Time/Retired,Points,avg_start_pos,avg_finish_pos,team_performance,finish_in_top_5
0,Bahrain,1.0,44,Lewis Hamilton,Mercedes,2.0,56,1:32:03.897,25.0,5.686567,4.611940,25.0,1
1,Bahrain,2.0,33,Max Verstappen,Red Bull Racing Honda,1.0,56,+0.745,18.0,3.058824,2.147059,18.0,1
2,Bahrain,3.0,77,Valtteri Bottas,Mercedes,3.0,56,+37.383,16.0,10.627119,10.186441,20.5,1
3,Bahrain,4.0,4,Lando Norris,McLaren Mercedes,7.0,56,+46.466,12.0,7.294118,7.000000,12.0,1
4,Bahrain,5.0,11,Sergio Perez,Red Bull Racing Honda,11.0,56,+52.047,10.0,6.818182,4.833333,14.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,Miami,15.0,3,Daniel Ricciardo,RB Honda RBPT,20.0,57,+50.956,0.0,11.490196,10.509804,2.6,0
114,Miami,16.0,77,Valtteri Bottas,Kick Sauber Ferrari,16.0,57,+52.356,0.0,10.627119,10.186441,0.0,0
115,Miami,17.0,18,Lance Stroll,Aston Martin Aramco Mercedes,11.0,57,+55.173,0.0,13.435484,10.677419,1.8,0
116,Miami,18.0,23,Alexander Albon,Williams Mercedes,14.0,57,+76.091,0.0,13.475000,12.100000,0.0,0


In [14]:
# Train/test split
# NOTE(review): this is a random row-level split. avg_start_pos and avg_finish_pos
# are the same on every row of a driver, and each driver's rows can land in both
# sets, so the reported accuracy is likely optimistic. Consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Driver, Team and Track stay in X_train / X_test as labels (they are used later to
# rank drivers); the model itself is fitted on the numeric columns only.
X_train_features = X_train.drop(columns=['Driver', 'Team', 'Track'])
X_test_features = X_test.drop(columns=['Driver', 'Team', 'Track'])

# The model used in this approach is a Random Forest Classifier.
# Variables (as computed in the feature-engineering cell):
#1 avg_start_pos: The driver's mean Starting Grid over all of their rows in the data.
#2 avg_finish_pos: The driver's mean finishing Position over all of their rows in the data.
#3 team_performance: Rolling mean of Points over the team's 5 most recent rows.

# Model training
# The max_features candidates are derived from the frame that is actually fitted.
# Using X_train here would also count the Driver/Team/Track label columns and
# produce candidates larger than the number of model features; depending on the
# scikit-learn version those either fail to fit or just repeat the "all features"
# setting. np.unique removes the duplicates created by the integer cast.
n_model_features = X_train_features.shape[1]
grid = {'n_estimators': [50, 100, 200, 300],
        'max_features': np.unique(np.linspace(1, n_model_features, 5).astype(int)).tolist()}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=grid,
    scoring='accuracy',
    n_jobs=1,
    cv=5,
)
grid_search.fit(X_train_features, y_train)

# Best model (refitted on the whole training split by GridSearchCV)
best_model = grid_search.best_estimator_

# Predictions
y_pred = best_model.predict(X_test_features)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8853754940711462
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       179
           1       0.84      0.76      0.79        74

    accuracy                           0.89       253
   macro avg       0.87      0.85      0.86       253
weighted avg       0.88      0.89      0.88       253



In [15]:
# Predicting probabilities on the test set
# predict_proba returns one column per class in best_model.classes_ order; with the
# 0/1 target, column 1 is the probability of finish_in_top_5 == 1.
# assign() builds a new frame instead of writing into the frame handed back by
# train_test_split, which avoids pandas' SettingWithCopyWarning. X_test is rebound,
# so it still carries the 'probability' column afterwards.
X_test = X_test.assign(probability=best_model.predict_proba(X_test_features)[:, 1])

# Predict top 5 drivers based on the model's probabilities and average finish position
# NOTE(review): the rows ranked here come from the held-out test split, i.e. they are
# past races, and each driver is represented by their single highest-probability row.
# Confirm this is the intended basis for predicting an upcoming race.
# Ties on probability are broken by the lower (better) avg_finish_pos.
X_test_sorted = X_test.sort_values(by=['probability', 'avg_finish_pos'], ascending=[False, True])
top_5_unique_drivers = X_test_sorted.drop_duplicates(subset=['Driver']).head(5)
top_5_unique_drivers

Unnamed: 0,avg_start_pos,avg_finish_pos,team_performance,Driver,Team,Track,probability
380,3.058824,2.147059,21.0,Max Verstappen,Red Bull Racing RBPT,Mexico,1.0
222,5.686567,4.61194,15.5,Lewis Hamilton,Mercedes,Belgium,1.0
21,5.290323,4.806452,15.8,Charles Leclerc,Ferrari,Saudi Arabia,1.0
402,6.818182,4.833333,20.4,Sergio Perez,Red Bull Racing Honda RBPT,Las Vegas,1.0
103,6.111111,5.428571,15.8,Carlos Sainz,Ferrari,Spain,1.0


In [16]:
# Announce the prediction, then show the names of the five selected drivers
headline = "Drivers prediction who will take the first 5 places in the Formula 1 race,which will take place on May 19:"
print(headline)
top_5_unique_drivers.loc[:, 'Driver']

Drivers prediction who will take the first 5 places in the Formula 1 race,which will take place on May 19:


380     Max Verstappen
222     Lewis Hamilton
21     Charles Leclerc
402       Sergio Perez
103       Carlos Sainz
Name: Driver, dtype: object