# Model training

## Import libraries

In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

from keras.models import Sequential
from keras.layers import LSTM, Dense


## Reading data

In [2]:
# Load the dataset from '../f1dataset.csv' (the path is relative to the notebook's working directory).
data = pd.read_csv('../f1dataset.csv')

In [3]:
# Binary target: 1 when the race position is 10 or lower, otherwise 0.
# A missing position compares as False and is therefore labelled 0.
target_data = data['racePosition']
Y_scores = pd.DataFrame((target_data <= 10).astype('int64').to_numpy())

# Columns removed before modelling: the target itself and the columns not used as features.
dropped_columns = [
    'racePosition',
    'raceIdOrdered',
    'RacingExperience',
    'maxPace',
    'meanPace',
    'nationality',
    'year',
    'yearStarted',
    'driverExpYears',
    'raceId',
    'startingPosition',
]
features_data = data.drop(columns=dropped_columns)

In [4]:
# Show the feature frame that remains after the drop (pandas elides the middle rows of a long frame).
features_data

Unnamed: 0,driverId,qualiResultPosition,q1Msec,q2Msec,q3Msec,circuitId,constructorId,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,21,16.0,95898,0,0,1,18,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
1,65,14.0,95453,0,0,1,15,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
2,84,19.0,96286,0,0,1,17,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
3,55,6.0,94257,0,0,1,22,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
4,50,12.0,95338,0,0,1,29,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7585,20,9.0,85523,84974,84961,24,117,28.763014,69.712329,1014.713699,0.0,33.942466,228.068493,1.635616
7586,846,7.0,85387,84903,84769,24,1,28.763014,69.712329,1014.713699,0.0,33.942466,228.068493,1.635616
7587,839,8.0,85735,85007,84830,24,214,28.763014,69.712329,1014.713699,0.0,33.942466,228.068493,1.635616
7588,840,14.0,85741,85359,0,24,117,28.763014,69.712329,1014.713699,0.0,33.942466,228.068493,1.635616


In [6]:
# Model inputs in a fixed column order.
feature_columns = [
    'driverId', 'constructorId', 'circuitId', 'qualiResultPosition',
    'q1Msec', 'q2Msec', 'q3Msec',
    'AirTemp', 'Humidity', 'Pressure', 'Rainfall', 'TrackTemp', 'WindDirection', 'WindSpeed',
]

# Descriptive names used from here on; columns not listed keep their original name.
column_aliases = {
    'driverId': 'DriverId',
    'constructorId': 'TeamId',
    'circuitId': 'CountryRaceId',
    'qualiResultPosition': 'QualificationPosition',
    'q1Msec': 'Q1',
    'q2Msec': 'Q2',
    'q3Msec': 'Q3',
}

# Select from the raw frame rather than from `features_data` itself: every selected
# column exists in `data`, so the result is the same, but the cell can now be re-run
# without failing on the already-renamed columns.
features_data = data[feature_columns].rename(columns=column_aliases)

## Classical models

In [7]:
# Collects one entry per model: model name -> (test F1 rounded to 2 decimals, parameter dict).
Scores_results = {}

In [8]:
# Hold out 30% of the rows as the test set. shuffle=False keeps the rows in their
# original order, so the test set is the tail of the dataset.
X_train, X_test, y_train, y_test = train_test_split(
    features_data,
    Y_scores,
    test_size=0.3,
    shuffle=False,
    random_state=42,
)

### Catboost

In [None]:
# Tune CatBoost with an exhaustive grid search (5-fold CV, F1 scoring), then score the
# refitted best estimator on the held-out test set.
# verbose=False silences CatBoost's per-iteration training log for every CV fit,
# matching the configuration of the final model trained at the end of the notebook.
catboost_model = CatBoostClassifier(verbose=False)

param_grid = {
    'learning_rate': [0.01, 0.1, 0.005],
    'depth': [2, 4, 6, 8],
    'iterations': [100, 200, 500],
    'loss_function': ['Logloss'],
}

grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=5, scoring='f1', verbose=0)
grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)

# GridSearchCV.predict delegates to the best estimator refitted on the full training set.
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy на тестовом наборе:", accuracy)
f1 = f1_score(y_test, y_pred)
Scores_results['CatBoost'] = (round(f1, 2), grid_search.best_params_)
print("F1 на тестовом наборе:", f1)

### AdaBoost

In [None]:
# Tune AdaBoost with an exhaustive grid search (3-fold CV, F1 scoring), then score the
# refitted best estimator on the held-out test set.
# A fixed random_state makes the search reproducible across kernel restarts.
adaboost_model = AdaBoostClassifier(random_state=42)

# NOTE(review): recent scikit-learn releases deprecate the 'SAMME.R' option; confirm it is
# still accepted by the installed version, otherwise those candidates fail during the search.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [1, 0.5],
    'algorithm': ['SAMME', 'SAMME.R'],
}

grid_search = GridSearchCV(estimator=adaboost_model, param_grid=param_grid, cv=3, scoring='f1', verbose=3)
# y_train is a single-column frame; scikit-learn expects a 1-D label array and warns on
# every fit otherwise, so it is flattened here.
grid_search.fit(X_train, y_train.values.ravel())

print("Лучшие параметры:", grid_search.best_params_)
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy на тестовом наборе:", accuracy)
f1 = f1_score(y_test, y_pred)
Scores_results['Ada Boost'] = (round(f1, 2), grid_search.best_params_)
print("F1 на тестовом наборе:", f1)

### Random Forest

In [None]:
# Tune a random forest with an exhaustive grid search (5-fold CV, F1 scoring), then score
# the refitted best estimator on the held-out test set.
# The forest is stochastic (bootstrap samples, random feature subsets), so it is seeded
# to make the selected parameters and the reported scores reproducible.
RF_model = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 5, 20],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2', None],
}

grid_search = GridSearchCV(estimator=RF_model, param_grid=param_grid, cv=5, scoring='f1', verbose=3)
# y_train is a single-column frame; scikit-learn expects a 1-D label array and warns on
# every fit otherwise, so it is flattened here.
grid_search.fit(X_train, y_train.values.ravel())

print("Лучшие параметры:", grid_search.best_params_)
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy на тестовом наборе:", accuracy)
f1 = f1_score(y_test, y_pred)
Scores_results['Random Forest'] = (round(f1, 2), grid_search.best_params_)
print("F1 на тестовом наборе:", f1)

### Logistic Regression

In [None]:
# Tune logistic regression with a grid search (5-fold CV, F1 scoring), then score the
# refitted best estimator on the held-out test set.
# random_state fixes the data shuffling used by the 'sag', 'saga' and 'liblinear' solvers.
LR_model = LogisticRegression(random_state=42)

# Not every solver supports every penalty, and 'elasticnet' additionally needs an
# l1_ratio. A plain cartesian grid therefore contains many candidates that can only
# fail, so the grid is expressed as a list of sub-grids holding valid combinations only.
shared_grid = {
    'C': [1, 2],
    'class_weight': ['balanced', None],
}
param_grid = [
    {**shared_grid, 'penalty': ['l2'],
     'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']},
    {**shared_grid, 'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
    {**shared_grid, 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]},
]

grid_search = GridSearchCV(estimator=LR_model, param_grid=param_grid, cv=5, scoring='f1', verbose=3)
# y_train is a single-column frame; scikit-learn expects a 1-D label array and warns on
# every fit otherwise, so it is flattened here.
grid_search.fit(X_train, y_train.values.ravel())

print("Лучшие параметры:", grid_search.best_params_)
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy на тестовом наборе:", accuracy)
f1 = f1_score(y_test, y_pred)
Scores_results['LR_model'] = (round(f1, 2), grid_search.best_params_)
print("F1 на тестовом наборе:", f1)

## Neural network models

In [14]:
# Convert the split frames to NumPy arrays. The feature matrices get an extra axis of
# length 1 in the middle, giving the shape (samples, 1, features).
y_train_ = y_train.values
y_test_ = y_test.values
X_train_ = np.expand_dims(X_train.values, axis=1)
X_test_ = np.expand_dims(X_test.values, axis=1)

# Dimensions of the test matrix, kept under the same names as before.
num_samples, num_features = X_test.shape

### LSTM

In [None]:
# Binary classifier: one LSTM layer with 50 units followed by a single sigmoid output.
# Each sample is fed as a sequence of shape (steps, features) taken from X_train_.
sequence_shape = X_train_.shape[1:]
model_lstm = Sequential([
    LSTM(50, input_shape=sequence_shape),
    Dense(1, activation='sigmoid'),
])
model_lstm.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# NOTE(review): the test split doubles as validation data here, so the per-epoch
# validation metrics are computed on the same rows as the final evaluation below.
model_lstm.fit(
    X_train_,
    y_train_,
    epochs=50,
    batch_size=32,
    validation_data=(X_test_, y_test_),
)

loss, accuracy = model_lstm.evaluate(X_test_, y_test_)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Threshold the predicted probabilities at 0.5 to obtain class labels.
y_pred = model_lstm.predict(X_test_)
y_pred_binary = (y_pred > 0.5).astype(int)

f1 = f1_score(y_test_, y_pred_binary)
Scores_results['LSTM'] = (round(f1, 2), {})


### MLP

In [None]:
# Fully connected binary classifier: 64 -> 32 -> 1 (sigmoid).
# Dense layers act on the last axis only. Feeding them the (samples, 1, features) tensor
# prepared for the LSTM yields predictions of shape (samples, 1, 1), which only line up
# with the (samples, 1) labels through implicit rank squeezing. The length-1 axis is
# therefore dropped so that the network emits exactly one probability per sample.
X_train_mlp = X_train_.reshape(len(X_train_), -1)
X_test_mlp = X_test_.reshape(len(X_test_), -1)

model_MLP = Sequential()
model_MLP.add(Dense(64, input_shape=(X_train_mlp.shape[1],), activation='relu'))
model_MLP.add(Dense(32, activation='relu'))
model_MLP.add(Dense(1, activation='sigmoid'))

model_MLP.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# validation_split holds out the last 30% of the training rows for per-epoch validation.
model_MLP.fit(X_train_mlp, y_train_, epochs=50, batch_size=32, validation_split=0.3)

loss, accuracy = model_MLP.evaluate(X_test_mlp, y_test_)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Threshold the predicted probabilities at 0.5 to obtain class labels.
y_pred = model_MLP.predict(X_test_mlp)
y_pred_binary = (y_pred > 0.5).astype(int)

# Flatten both arrays to 1-D before computing the F1 score.
y_test_flat = y_test_.flatten()
y_pred_binary_flat = y_pred_binary.flatten()

f1 = f1_score(y_test_flat, y_pred_binary_flat)
Scores_results['MLP'] = (round(f1, 2), {})


In [21]:
# Build the comparison table: one row per model, keeping only the F1 score
# (column 0 of each stored tuple) and ordering the models from best to worst.
results_scores = (
    pd.DataFrame(Scores_results)
    .T
    .drop(columns=[1])
    .rename(columns={0: 'F1'})
    .sort_values(by='F1', ascending=False)
)

In [22]:
# Display the F1 comparison table, best model first.
results_scores

Unnamed: 0,F1
CatBoost,0.77
Random Forest,0.76
LSTM,0.76
MLP,0.76
Ada Boost,0.75
LR_model,0.75


## Save the best model

In [24]:
# Hyper-parameters of the model that gets saved.
# NOTE(review): presumably the best configuration found by the CatBoost grid search;
# the values are hard-coded here, so confirm them against that search's output.
final_catboost_params = {
    'iterations': 100,
    'depth': 2,
    'learning_rate': 0.005,
    'loss_function': 'Logloss',
    'verbose': False,
}
model = CatBoostClassifier(**final_catboost_params)

# Train on the training split only; the test split is not used here.
model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x293d91960>

In [25]:
# Write the fitted CatBoost model to the file 'model_predict' (path relative to the working directory).
model.save_model('model_predict')