In [1]:
%matplotlib inline

import datetime as dt
import numpy as np
import pandas as pd

import urllib.request
import json

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils

from DBManager import *

import itertools


%load_ext autotime
%load_ext snakeviz

Using TensorFlow backend.


## Data

We are still using [ergast](http://ergast.com/mrd/db/) data, but now we've downloaded the database image to MySQL.

Before we build the interfacing class we will look at the features.

## Features

As a starting point we'll use the features listed [here](http://www.f1-predictor.com/building-an-f1-prediction-engine-feature-engineering-part-ii/). We will choose a subset of that list and document it, so that we have a feel for the type of data we'll need.


**Driver Features**

    Qualifying position
    Driver name
    Driver age at that time
    Years in F1
    Percentage difference in qualifying time from pole position * 100  (e.g. 102.3%)
    Starts in front-row
    Races won in career
    Races won in season till that race
    Races started
    Races finished
    Pole positions won
    Drivers championships won
    Driver championship classification last year
    Drivers championship position this season
    Max, min, avg positions gained/lost during last X races
    Max, min, avg finishing position in the last X races
    Correlation between qualifying and race results per driver
    Previous race final position
    Previous race qualifying position
    Positions gained in previous race
    Race and Qualifying position in same race last year
    Positions gained in same race last year
    Percentage difference from winner (in time) in the last race * 100  (e.g. 102.3%)
    Number of pit-stops in same race last year
    Avg lap-time excl. pit stops in last race
    Avg lap-time consistency excl. pit stops in last race
    Max/min/avg/std speed in previous race
    Rank on avg/std of speed in previous race

**Constructors Features**

    Constructors name
    Constructors championship won
    Constructors races won
    Constructors races won in that season
    Constructors championship won in last X years
    Constructors championship classification last year
    Constructors championship position at the time
    Max (Team-mate qualifying position, Driver qualifying position)
    Max, min, avg positions gained/lost during last X races
    Max, min, avg position in the last X races
    Percentage difference in top-speed from top in last-race * 100 (e.g. 99.5%)
    Times retired
    Times retired in last X races
    Max/min/avg speed in previous race
    Rank on avg/std of speed in previous race

**Other Features**

    Circuit name
    Race rank in season (i.e. 1-21)
    Year
    Average overtakes per race
    Correlation between race and qualifying results per circuit


# Building the dataset

We have to decide whether it shall be an overall comparison or a pairwise comparison. Pairs make sense but we need some way to ensemble them.

We will have a dummy for driver #1 (all drivers) and then another set of dummies for driver #2

A given observation is then given by:
* A driver #1
* A driver #2
* A circuit
* A year

So for each circuit-year (race) I need to get all combinations of driver pairs

First we need to get every circuit-year (race):

In [2]:
# Fetch the race table and key it by raceId so a race can be looked up by its id
races = DBManager().getRaces().set_index('raceId')

# Peek at a single row to confirm the layout
races.head(n=1)

Unnamed: 0_level_0,year,circuitId,name,date
raceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1009,2018,24,Abu Dhabi Grand Prix,2018-11-25


time: 158 ms


Now for each race we need to get all combinations of drivers. 

For each combination we add the circuit and the year as a feature.

Next we need to add the other features. Most features will be in terms of differences.

Lastly we need to add the result: 1 when Driver 1 wins, 0 when Driver 2 wins, and 0.5 when neither beats the other (e.g. both retired).

We'll start by loading some auxiliary tables so that querying is faster:

In [3]:
# Auxiliary lookup tables, loaded up front so later steps can index them
# in memory.

# Years in F1 are derived from a driverId -> first race year table
first_year = DBManager().getFirstYear_table()

# Race results: the raw table, plus a variant where every NaN becomes 1000
# (presumably a sentinel that ranks drivers without a position last - TODO confirm)
results_na = DBManager().getResults_table()
results = results_na.fillna(1000)

# Qualifying results: the raw table, plus an independent working copy
qualifying_na = DBManager().getQualifying_table()
qualifying = qualifying_na.copy()

# Drivers table
drivers_table = DBManager().getDrivers_table()

# Put the races in index order
races = races.sort_index()


time: 2.17 s


In [4]:
# NOTE: every line in this cell is commented out, so running it does nothing.
# The commented code builds the pairwise driver dataset race by race and pickles
# it to 'dataset.pkl', the file that is read back with pd.read_pickle further down.
# # %%snakeviz

# # Cut the sample
# sieve = 2018
# races.drop(races.index[races["year"]>=2018],inplace=True)


# data = []

# # for i in range(120,150): #replace for in races.index
# #     raceId = races.index[i]
# for raceId in races.index:
#     print(raceId)
#     drivers = DBManager().getDriversFromRace(raceId = raceId)
#     rows = pd.DataFrame(list(itertools.combinations(drivers['driverId'], 2)), columns = ["driver1","driver2"])
    
#     #NonFeatures
#     rows["raceId"] = raceId
#     rows["date"] = races.loc[raceId]["date"]
    
#     #Features
#     rows["year"] = races.loc[raceId]["year"]
#     rows["circuitId"] = races.loc[raceId]["circuitId"]
# #     rows["qualifying"] = rows.apply(DBManager().getQualifyingFromPandas,axis=1)

#     idx = pd.IndexSlice
    
#     if raceId in qualifying.index.get_level_values(0): #Many missing qualifyings, check "missingQuali.sql MISSING TREATMENT
#         rows["qualifying"] = qualifying.loc[idx[raceId,rows["driver2"].tolist()],:]["position"].reset_index(drop=True) - qualifying.loc[idx[raceId,rows["driver1"].tolist()],:]["position"].reset_index(drop=True)
#     else: 
#         rows["qualifying"] = 0

#     rows["driverAge"] = (drivers_table.loc[rows["driver1"]]["dob"].reset_index(drop=True)).sub(drivers_table.loc[rows["driver2"]]["dob"].reset_index(drop=True)).dt.days/365
#     rows["yearsF1"] = - first_year.loc[rows["driver2"]].reset_index(drop=True) + first_year.loc[rows["driver1"]].reset_index(drop=True)
    
    
#     #True Values
#     rows["output_1"] = rows.apply(lambda x: results.loc[x["raceId"],x["driver2"]]["position"].iloc[0] - results.loc[x["raceId"],x["driver1"]]["position"].iloc[0],axis=1)
#     rows["output"] = 0
#     rows["output"][rows["output_1"]==0] = 0.5 # Both retired
#     rows["output"][rows["output_1"]>0] = 1 # Driver 2 lost
#     rows["output"][rows["output_1"]<0] = 0 # Driver 2 won
    
    
#     data.append(rows)

# dataset = pd.concat(data)
# dataset.to_pickle('dataset.pkl')

time: 9.13 ms


In [8]:
    # Names of the dataset columns that are fed to the model as inputs
    feature_list = ["year", "circuitId", "qualifying", "driverAge", "yearsF1"]

time: 1.84 ms


In [9]:
# Load the pairwise dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
dataset = pd.read_pickle('dataset.pkl')

# Show the first five rows
dataset.head(5)

Unnamed: 0,driver1,driver2,raceId,date,year,circuitId,qualifying,driverAge,yearsF1,output_1,output
0,1,2,1,2009-03-29,2009,1,-4.0,7.668493,7,-990.0,0.0
1,1,3,1,2009-03-29,2009,1,-6.0,-0.468493,1,-994.0,0.0
2,1,4,1,2009-03-29,2009,1,7.0,3.446575,6,-995.0,0.0
3,1,5,1,2009-03-29,2009,1,2.0,3.221918,0,0.0,0.5
4,1,6,1,2009-03-29,2009,1,-1.0,-0.010959,0,0.0,0.5


time: 66.1 ms


In [10]:
# Missing-value treatment: a pair without a qualifying difference is treated
# as "no difference" (0). Only the "qualifying" column is filled.
dataset = dataset.fillna({"qualifying": 0})

time: 6.52 ms


# Feedforward Neural Network

In [11]:
# Inputs are the selected feature columns; the target is the single "output" column
xcols = feature_list
ycols = ["output"]

# Take copies so the encoding steps below work on frames separate from `dataset`
model_xdata = dataset.loc[:, xcols].copy()
model_ydata = dataset.loc[:, ycols].copy()

time: 62.2 ms


In [12]:
# Rows with year <= training_sieve are used for training, later years for testing
training_sieve = 2015

# Categorical inputs that are expanded into one-hot indicator columns
dummies_list = ["year", "circuitId"]

# Encode before splitting so that both splits end up with the same columns
model_data_dummies = pd.get_dummies(model_xdata, columns=dummies_list)
model_data_dummiesy = pd.get_dummies(model_ydata, columns=["output"])

# Row masks, built once from the un-encoded year column
is_training_row = model_xdata["year"] <= training_sieve
is_test_row = model_xdata["year"] > training_sieve

# Features
training_datax, test_datax = (
    model_data_dummies.loc[is_training_row],
    model_data_dummies.loc[is_test_row],
)

# Targets (same row masks, so features and targets stay aligned)
training_datay, test_datay = (
    model_data_dummiesy.loc[is_training_row],
    model_data_dummiesy.loc[is_test_row],
)

# Conventional X / y aliases used by the model cells
X_training, X_test = training_datax, test_datax
y_training, y_test = training_datay, test_datay

time: 527 ms


In [13]:
# (rows, columns) of the training features; the column count has to equal the
# input width of the network's first layer
X_training.shape

(280999, 143)

time: 5.17 ms


In [14]:
# (rows, columns) of the test features; the column count should match the training split
X_test.shape

(8651, 143)

time: 8.09 ms


In [15]:
# (rows, columns) of the one-hot encoded training target: one column per distinct "output" value
y_training.shape

(280999, 3)

time: 4.43 ms


In [40]:
# Feed-forward classifier: two hidden sigmoid layers of 512 units followed by a
# softmax over the one-hot outcome columns.
# The input and output widths are read from the training data instead of being
# hard-coded, so the network keeps matching the data when features are added
# or removed.
n_features = X_training.shape[1]
n_classes = y_training.shape[1]

model = Sequential()

# First hidden layer; its input width is the number of feature columns
model.add(Dense(512, input_shape=(n_features,)))
model.add(Activation('sigmoid'))

# Second hidden layer
model.add(Dense(512))
model.add(Activation('sigmoid'))

# Output layer: one unit per one-hot column of "output", normalised by softmax
model.add(Dense(n_classes))
model.add(Activation('softmax'))

# Categorical cross-entropy matches the one-hot target; accuracy is reported as a metric
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["accuracy"])

time: 116 ms


In [41]:
# Train silently for 5 epochs with batches of 25000 rows; the trailing
# semicolon keeps the value returned by fit() out of the cell output.
model.fit(
    x=X_training,
    y=y_training,
    batch_size=25000,
    epochs=5,
    verbose=0,
);

time: 4min 7s


In [42]:
# Score the trained model on the held-out test split (loss first, then accuracy)
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)

# Report accuracy to four decimal places
accuracy_message = "Accuracy = {:.4f}".format(accuracy)
print(accuracy_message)

Accuracy = 0.6107
time: 1.21 s


# Future Research

* Add the other features

* When encoding the output of a pair in a race we opted to send retired racers to the back of the line. Alternatively, we could discard results in which a racer retires.

* Check NaN treatment