In [2]:
pip install imblearn

Collecting imblearn
  Obtaining dependency information for imblearn from https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl.metadata
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Obtaining dependency information for imbalanced-learn from https://files.pythonhosted.org/packages/d8/0d/c3bfccc5d460eec8ff56889802aa88f5d07280d5282b307a74558e6edc44/imbalanced_learn-0.12.4-py3-none-any.whl.metadata
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0

[1m[[0

In [3]:
import pandas as pd
# comment in google colab
# from google.colab import drive
# drive.mount('/content/drive')
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score, precision_score

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Display settings: show up to 200 rows and never truncate cell contents.
pd.options.display.max_rows = 200
pd.options.display.max_colwidth = None

In [6]:
## load train, test, and future data
# The labelled splits drop the 'Points_R' column right after reading
# (presumably because it is derived from the race result -- TODO confirm).
train = pd.read_csv('data/F1_results_train_v2.csv').drop(columns='Points_R')
test = pd.read_csv('data/F1_results_test_v2.csv').drop(columns='Points_R')

# The future frame is read as-is; its columns are aligned to the training features later.
future = pd.read_csv('data/F1_future_v2.csv')

In [8]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,FullName,TeamName,year,RoundNumber,Position_Q,Position_R,Q1_Q,Q2_Q,Q3_Q,has_rain_Q,has_rain_R,Country,Location,EventFormat,Q1_Q_Rank,Q2_Q_Rank,Q3_Q_Rank,DriverPointsBefore,TeamPointsBefore
0,Alexander Albon,Toro Rosso,2019,1,13,14.0,82.757,82.636,82.636,0.0,0.0,Australia,Melbourne,conventional,9.0,13.0,99.0,0.0,0.0
1,Antonio Giovinazzi,Alfa Romeo Racing,2019,1,14,15.0,82.431,82.714,82.714,0.0,0.0,Australia,Melbourne,conventional,4.0,14.0,99.0,0.0,0.0
2,Carlos Sainz,McLaren,2019,1,18,20.0,83.084,83.084,83.084,0.0,0.0,Australia,Melbourne,conventional,18.0,99.0,99.0,0.0,0.0
3,Charles Leclerc,Ferrari,2019,1,5,5.0,82.017,81.739,81.442,0.0,0.0,Australia,Melbourne,conventional,1.0,4.0,5.0,0.0,0.0
4,Daniel Ricciardo,Renault,2019,1,12,19.0,82.921,82.57,82.57,0.0,0.0,Australia,Melbourne,conventional,13.0,12.0,99.0,0.0,0.0


In [32]:
## Define features and target variable
# The target is the 'Position_R' column; every other column of `train` is a feature.
target = 'Position_R'
features = list(train.columns.drop(target))

In [33]:
## define models to train
# Maps a display name to an unfitted estimator; the training loop fits each one in place.
# The tree-based estimators are randomised, so they are seeded (same seed as SMOTE)
# to make the comparison table reproducible across runs.
# NOTE(review): LogisticRegression keeps its default iteration limit; because warnings
# are silenced globally, a ConvergenceWarning would go unnoticed -- consider raising max_iter.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'OVR': OneVsRestClassifier(LogisticRegression()),
    'SVC': SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'GaussianNB': GaussianNB(),
    'KNeighbors': KNeighborsClassifier()
}

In [34]:
# Empty results table: one row per model, with overall accuracy plus
# F1 / recall / precision for the classes labelled 1, 2 and 3.
result_columns = [
    'Model', 'Accuracy',
    'f1_score_1', 'f1_score_2', 'f1_score_3',
    'Recall_1', 'Recall_2', 'Recall_3',
    'Precision_1', 'Precision_2', 'Precision_3',
]
compare_models = pd.DataFrame(columns=result_columns)

In [39]:
categorical_features = ['FullName', 'TeamName', 'Country', 'Location', 'EventFormat']

## train test split
# Train and test come from the two separately loaded frames.
X_train = train[features]
y_train = train[target]
X_test = test[features]
y_test = test[target]
# Keep only the training feature columns, in training order, for the future frame.
future = future[X_train.columns.tolist()]

## separate categorical and numerical features
X_train_cat = X_train[categorical_features]
X_test_cat = X_test[categorical_features]
future_cat = future[categorical_features]

X_train_num = X_train.drop(categorical_features, axis = 1)
X_test_num = X_test.drop(categorical_features, axis = 1)
future_num = future.drop(categorical_features, axis = 1)

## encode categorical features with one hot encoder
# The encoder is fitted on the training rows only; handle_unknown='ignore' lets
# transform() accept categories that were not seen during fit instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output = False)
train_cat_array = ohe.fit_transform(X_train_cat)
encoded_columns = ohe.get_feature_names_out(categorical_features)

X_train_cat_encoded = pd.DataFrame(train_cat_array, columns=encoded_columns)
X_test_cat_encoded = pd.DataFrame(ohe.transform(X_test_cat), columns=encoded_columns)
future_cat_encoded = pd.DataFrame(ohe.transform(future_cat), columns=encoded_columns)

## normalize the numerical features
# The scaler is fitted on the training rows only and then applied to the test rows.
# NOTE(review): future_num is not scaled in this cell.
min_max_scaler = MinMaxScaler()
X_train_num_scaled = pd.DataFrame(min_max_scaler.fit_transform(X_train_num), columns=X_train_num.columns)
X_test_num_scaled = pd.DataFrame(min_max_scaler.transform(X_test_num), columns=X_test_num.columns)

## combine the encoded and normalized results
X_train_encoded = pd.concat((X_train_cat_encoded, X_train_num_scaled), axis = 1)
X_test_encoded = pd.concat((X_test_cat_encoded, X_test_num_scaled), axis = 1)

# Apply SMOTE to balance the dataset
# Only the training data is resampled; the test set is evaluated as loaded.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)

# Classes for which per-class F1 / recall / precision are reported.
podium_positions = [1, 2, 3]

# Rows are collected here and the table is rebuilt once after the loop, so
# re-running this cell replaces the results instead of appending duplicate rows.
results = []
for m, model in models.items():
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)

    # Make predictions on the test set
    y_pred = model.predict(X_test_encoded)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    # average=None returns one score per entry of `labels`, in the same order.
    f1_score_1, f1_score_2, f1_score_3 = f1_score(y_test, y_pred, labels=podium_positions, average=None)
    recall_1, recall_2, recall_3 = recall_score(y_test, y_pred, labels=podium_positions, average=None)
    precision_1, precision_2, precision_3 = precision_score(y_test, y_pred, labels=podium_positions, average=None)
    results.append([m, accuracy, f1_score_1, f1_score_2, f1_score_3, recall_1, recall_2, recall_3, precision_1, precision_2, precision_3])

    print(m + ' done')

# Reuse the column layout of the existing results table.
compare_models = pd.DataFrame(results, columns=compare_models.columns)

Random Forest done
Logistic Regression done
OVR done
SVC done
Decision Tree done
GaussianNB done
KNeighbors done


In [40]:
# Display the results table: one row per model with accuracy and the
# F1 / recall / precision scores for the classes labelled 1, 2 and 3.
compare_models

Unnamed: 0,Model,Accuracy,f1_score_1,f1_score_2,f1_score_3,Recall_1,Recall_2,Recall_3,Precision_1,Precision_2,Precision_3
0,Random Forest,0.139241,0.382979,0.313725,0.210526,0.45,0.4,0.2,0.333333,0.258065,0.222222
1,Logistic Regression,0.121519,0.390244,0.352941,0.153846,0.4,0.6,0.15,0.380952,0.25,0.157895
2,OVR,0.126582,0.380952,0.333333,0.15,0.4,0.5,0.15,0.363636,0.25,0.15
3,SVC,0.086076,0.372093,0.0,0.170213,0.4,0.0,0.2,0.347826,0.0,0.148148
4,Decision Tree,0.121519,0.3,0.243902,0.142857,0.3,0.25,0.15,0.3,0.238095,0.136364
5,GaussianNB,0.073418,0.208333,0.204082,0.0,0.75,0.5,0.0,0.120968,0.128205,0.0
6,KNeighbors,0.091139,0.225806,0.098361,0.16129,0.35,0.15,0.25,0.166667,0.073171,0.119048
