# Modules
These are needed everywhere.

In [12]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

import time

from dfply import *

from scipy.cluster.hierarchy import dendrogram, linkage

# Data Files

In [13]:
# Lap-by-lap timing rows; used as the default `df` argument of the analysis functions below.
lap_times = pd.read_csv("f1db_csv/lap_times2018.csv")
# Driver table; driver_name() looks up the "surname" column by "driverId".
df_drivers = pd.read_csv("f1db_csv/driver.csv")
# Race table; race_name() looks up the "name" column by "raceId".
df_races = pd.read_csv("f1db_csv/races.csv")

# Small Auxiliary Functions

Recovers the driver's last name from his id.

In [14]:
def driver_name(driverId):
    """Return the surname stored in ``df_drivers`` for the given driver id.

    Raises IndexError when no row of ``df_drivers`` has that ``driverId``.
    """
    is_requested_driver = df_drivers["driverId"] == driverId
    surnames = df_drivers.loc[is_requested_driver, "surname"].values
    return surnames[0]

Recovers the race from its id. For example, 989 is Australia, 990 is Bahrain and 1001 is Belgium.

In [None]:
def race_name(raceId):
    """Return the name stored in ``df_races`` for the given race id.

    Raises IndexError when no row of ``df_races`` has that ``raceId``.
    """
    is_requested_race = df_races["raceId"] == raceId
    names = df_races.loc[is_requested_race, "name"].values
    return names[0]

Turns milliseconds into a string with minutes and seconds.

In [15]:
def ms_to_minutes(milliseconds):
    """Format a duration given in milliseconds as the string ``"M:SS.mmm"``.

    The value is rounded to a whole number of milliseconds *before* it is
    split into minutes, seconds and milliseconds. That way a fractional part
    that rounds up carries into the seconds (59999.6 -> "1:00.000") instead
    of producing a four-digit millisecond field.

    Parameters
    ----------
    milliseconds : int or float
        Duration in milliseconds; intended for non-negative values.

    Returns
    -------
    str
        Minutes (not padded), seconds padded to 2 digits and milliseconds
        padded to 3 digits, e.g. ``"1:37.001"``.
    """
    total_ms = int(round(milliseconds))
    minutes, remainder = divmod(total_ms, 60 * 1000)
    seconds, ms = divmod(remainder, 1000)

    # Zero padding: we want 1:02.005, not 1:2.5.
    return "%d:%02d.%03d" % (minutes, seconds, ms)

In [10]:
# Example
# print("Expected: 1:37.001")
# print("Obtained:", ms_to_minutes(97001.052632))

# Time differences
- delta_to_first calculates the gap to first place for each driver in each lap.
- plot_delta_to_first calls delta_to_first and shows a gap vs lap plot. It plots all drivers in the same image

### Notes
- delta_to_first works only with the drivers that completed the same number of laps as the winner (it drops the others). This drops a lot of drivers if we use the default database, lap_times. If you want to include the drivers with larger gaps or DNFs, you should fill in the missing laps with the methods below and pass the resulting data frame as the df parameter.
- delta_to_first calculates the gaps in milliseconds. plot_delta_to_first then formats the axes of the plot so that it shows minutes and seconds.

In [16]:
def delta_to_first(race, df=lap_times):
    """Compute, lap by lap, every finisher's cumulative gap to the leader.

    Only drivers that have a row for the last lap of the race are kept.
    The leader of each lap is the kept driver with the smallest ``position``
    in that lap. Picking the leader among *all* drivers would fail with a
    KeyError whenever the leader of some lap later retired, because running
    totals only exist for the kept drivers.

    Parameters
    ----------
    race : raceId of the race to analyse.
    df : data frame with the columns raceId, driverId, lap, position and
        milliseconds (defaults to ``lap_times``).

    Returns
    -------
    (diff, drivers) : ``drivers`` is the sorted array of kept driver ids and
        ``diff[driver]`` is a list with one absolute gap, in milliseconds,
        per lap.
    """
    race_df = (df >>
               mask(X.raceId == race) >>
               arrange(X.lap))

    # Number of laps
    num_laps = max(race_df.lap)

    # Drivers that finished the race
    finishers = (race_df >>
                 mask(X.lap == num_laps) >>
                 select(X.driverId))
    drivers = np.unique(finishers.driverId)

    # Keep only the finishers' rows so that the per-lap leader is always a
    # driver for whom we accumulate a total time.
    race_df = race_df[race_df["driverId"].isin(drivers)]

    diff = {driver: [] for driver in drivers}
    total_times = {driver: 0 for driver in drivers}

    for lap in range(1, num_laps + 1):
        # lt stands for lap times
        lt = (race_df >> mask(X.lap == lap))

        # Find first place
        first = lt.loc[lt["position"].idxmin(), "driverId"]

        # Accumulate every driver's race time up to this lap
        for driver in drivers:
            total_times[driver] += lt.loc[lt["driverId"] == driver, "milliseconds"].values[0]

        # Gap to the leader after this lap
        for driver in drivers:
            diff[driver].append(np.abs(total_times[first] - total_times[driver]))

    return (diff, drivers)

def plot_delta_to_first(race, df=lap_times):
    """Plot the gap to the leader against the lap number, one line per driver.

    The gaps come from ``delta_to_first(race, df=df)``, so the plot shows the
    race passed in ``race`` rather than a fixed race id. Gaps are in
    milliseconds; the y axis is formatted as minutes:seconds.

    Parameters
    ----------
    race : raceId of the race to plot.
    df : lap-time data frame forwarded to ``delta_to_first``.

    Returns
    -------
    None. The figure is created through ``plt.subplots()``.
    """
    diff2, drivers = delta_to_first(race, df=df)

    # Tick values are milliseconds; show them as MM:SS.
    formatter = FuncFormatter(lambda ms, _pos: time.strftime('%M:%S', time.gmtime(ms // 1000)))
    fig, ax = plt.subplots()

    for driver in drivers:
        ax.plot(diff2[driver], label=driver_name(driver))

    # The legend and the axis format only need to be set once, after all lines exist.
    ax.legend(loc='center right', bbox_to_anchor=(1.35, 0.5))
    ax.yaxis.set_major_formatter(formatter)

# Define our distance
Given a race, we define a metric between two drivers as: $\frac{1}{n} \sqrt{\sum_{i=1}^n (t_i - s_i)^2}$, where:
- $t_i$ is the lap time of the first driver in lap $i$
- $s_i$ is the lap time of the second driver in lap $i$
- $n$ is the number of laps in the race.

In other words, we're treating a driver's lap times in a single race as a vector that has $n$ entries. Then we calculate the $\ell_2$ metric between the vectors of two different drivers.

### Normalization
We divide by $n$ to normalize by the number of laps. The underlying assumption here is that races with more laps have larger gaps. Hopefully, this $\frac{1}{n}$ should eliminate that variation. Unfortunately, we haven't verified this. Maybe we should not normalize or use a completely different factor altogether. We could investigate this in the future.

## Functions
- lap_differences: Calculates $v-w$, where $v$ is the vector of lap times of driver1 in raceId ($w$ for driver2).
    - It returns differences in seconds.
    - If $v$ and $w$ have different lengths, it eliminates entries from the longer vector so that they have the same length.

- delta_per_lap: Calculates the actual gap between two drivers in each lap. The gap in lap $k$  is the sum of differences in lap times up to lap $k$. For example, if driver1 was faster than driver2 by 1 second in lap 1, and 3 in lap 2, the gap after lap 2 will be of 4 seconds.
    - It calls lap_differences to get the differences in lap times. For this reason, delta_per_lap returns results in seconds.

delta_per_lap and delta_to_first do very similar things. The difference is that delta_to_first automatically detects the first place driver and computes the gap for all the drivers. delta_per_lap, on the other hand, calculates the gap between two specific drivers.

In [18]:
def lap_differences(driver1, driver2, raceId, df=lap_times):
    """Return the per-lap time differences (driver1 minus driver2) in seconds.

    If the two drivers completed a different number of laps, only the laps up
    to the smaller of their last lap numbers are compared, so both vectors
    have the same length before they are subtracted.

    Parameters
    ----------
    driver1, driver2 : driverId of the two drivers to compare.
    raceId : race in which to compare them.
    df : data frame with the columns raceId, driverId, lap and milliseconds
        (defaults to ``lap_times``).

    Returns
    -------
    numpy array with one entry per compared lap, in seconds.

    Raises
    ------
    ValueError if one of the drivers has no laps in this race (``max`` of an
    empty sequence).
    """
    race_times = (df >>
                  mask(X.raceId == raceId, (X.driverId == driver1) | (X.driverId == driver2)) >>
                  arrange(X.lap))

    # Get times for each driver
    times1 = (race_times >>
              mask(X.driverId == driver1) >>
              select(X.milliseconds, X.lap))

    times2 = (race_times >>
              mask(X.driverId == driver2) >>
              select(X.milliseconds, X.lap))

    # Last lap that both drivers completed
    minlaps = min(max(times1.lap), max(times2.lap))

    # Drop the extra laps of whoever went further, then subtract lap by lap
    ms1 = (times1 >> mask(X.lap <= minlaps)).milliseconds.values
    ms2 = (times2 >> mask(X.lap <= minlaps)).milliseconds.values
    diff = np.array(ms1 - ms2)

    # Turn it into seconds (not milliseconds) before returning
    return (diff / 1000)

def delta_per_lap(driver1, driver2, raceId, df=lap_times):
    """Return the on-track gap between two drivers after each lap.

    The gap is the running sum of the lap-time differences produced by
    ``lap_differences``. The returned array starts with a 0 (the gap before
    the first lap), so it has one more entry than ``lap_differences`` returns.
    """
    lap_gaps = lap_differences(driver1, driver2, raceId, df=df)

    # Accumulate the per-lap differences into a running gap
    running = 0
    gaps = [running]
    for lap_gap in lap_gaps:
        running = running + lap_gap
        gaps.append(running)

    return (np.array(gaps))

# Adds up the squares of all the entries of a vector
def sum_of_squares(vector):
    """Return the sum of the squared entries of ``vector``."""
    squared = np.square(np.array(vector))
    return (np.sum(squared))

# Averages the squares of all the entries of a vector
def mse(vector):
    """Return the mean of the squared entries of ``vector``."""
    squared = np.square(np.array(vector))
    return (np.mean(squared))

# Euclidean (l2) norm of a vector, scaled by 1 / (number of entries)
def l2(vector):
    """Return ``sqrt(sum(v_i ** 2)) / n`` where ``n`` is the number of entries."""
    entries = np.array(vector)
    norm = np.sqrt(np.sum(np.square(entries)))
    return (norm / entries.size)

race_distances gives the distance matrix of all drivers in a single race. In other words, the $i,j$ entry of the matrix returned by race_distances is the distance between drivers $i$ and $j$. The method parameter says how we will measure that distance. The default choice is the l2 metric described above.

The season variable indicates if we're doing results over the whole season or only from a single race. In the first case, we need all distance matrices to be ordered in a common way so that we can compare and add matrices from different races. We will sort them in a more meaningful way later, but for now we need a standard. For that, we sort them by the driverId provided by our datasets (np.unique sorts numbers in ascending order).

If we only want results on a single race, we order them by their finishing position. This makes heatmaps look nice.

In [23]:
def race_distances(raceId, df=lap_times, method=l2, season=False):
    """Build the matrix of pairwise driver distances for one race.

    Only drivers with a row for the last lap of the race are included.
    Entry (i, j) is ``method`` applied to ``delta_per_lap`` of drivers i and j.

    Parameters
    ----------
    raceId : race to analyse.
    df : lap-time data frame (defaults to ``lap_times``).
    method : function that maps the vector of gaps to a number.
    season : if True, drivers are ordered by ascending driverId so that
        matrices of different races share an ordering; if False, they keep
        the order of their position on the last lap.

    Returns
    -------
    (distances, drivers) : the distance matrix as a numpy array and the
        driver ids in the order used for its rows and columns.
    """
    race_times = (df >>
                  mask(X.raceId == raceId) >>
                  arrange(X.lap))

    # The largest lap number tells us who completed the full distance
    final_lap = max(race_times.lap)
    finishers = (race_times >>
                 mask(X.lap == final_lap) >>
                 arrange(X.position))
    finisher_ids = finishers.driverId.values

    if season:
        # Common ordering across races: ascending driverId (np.unique sorts)
        drivers = np.unique(finisher_ids)
    else:
        # Single race: drop repeated ids but keep the finishing order
        _, first_seen = np.unique(finisher_ids, return_index=True)
        drivers = finisher_ids[np.sort(first_seen)]

    # One row per driver, one column per driver
    distances = np.array([
        [method(delta_per_lap(row_driver, col_driver, raceId, df=df))
         for col_driver in drivers]
        for row_driver in drivers
    ])
    return (distances, drivers)

# Methods for filling missing laps
The metrics above require that all drivers have the same number of laps. This rarely happens in a race, so we need to fill in missing laps. There are two cases:
 - A driver finished more than a lap behind the leader.
 - A driver crashed, had a mechanical failure, or otherwise failed to finish the race.

## Big gaps
This case is easy to fix. We just copy the last available lap time enough times.

Expanding on that description, say a certain race is 60 laps long. Let $w$ be the winning driver and suppose another driver, say $d$, is 2 laps behind $w$ when the race finished. $d$ is not allowed to finish the 2 remaining laps because it will not change the result: he is way behind $w$. Because of this, our database has 60 lap times for $w$, but only 58 for $d$. To fix this, we take $d$'s time on lap 58 and copy it into laps 59 and 60. So if $d$'s last time was 1:30, we copy that 1:30 two more times.

In [None]:
def fill_laps_behind(data, race, driver):
    """Pad a lapped driver's race with copies of his last recorded lap.

    Every lap between the driver's last recorded lap and the last lap of the
    race gets a copy of the driver's last row with only ``lap`` changed.

    Parameters
    ----------
    data : lap-time data frame (columns raceId, driverId, lap, ...).
    race : raceId of the race to pad.
    driver : driverId of the driver to pad.

    Returns
    -------
    A new data frame (``data`` itself is not modified) with the filler rows
    added, sorted by raceId, driverId and lap.
    """
    # Length of the race and how far our driver got
    race_rows = (data >> mask(X.raceId == race))
    total_laps = max(race_rows.lap)

    driver_rows = (data >> mask(X.raceId == race, X.driverId == driver))
    completed = max(driver_rows.lap)

    # The row that will be repeated for each missing lap
    template = (data >> mask(X.raceId == race, X.driverId == driver, X.lap == completed))

    padded = data.copy()
    for missing_lap in range(completed + 1, total_laps + 1):
        extra = template.copy()
        extra.lap = missing_lap
        padded = pd.concat([padded, extra])

    # Keep a predictable row order so that later analyses are not affected.
    padded = (padded >>
              arrange(X.raceId, X.driverId, X.lap))

    return (padded)

## DNF's
Fixing DNF's is trickier than big gaps. On the one hand, it is a big deal when a driver does not finish a race. DNF's are penalized in real competition because the driver does not score points. On the other hand, we don't want to assign too harsh a penalty in a single race because it might bias a season-wide metric. In other words, if we are not careful, one DNF will ruin a driver's complete season. For example, let's say the distance between a driver that finished the race and a driver with a DNF is $\infty$. This might make sense in a single race, but if we add the distances in two different races, it will still be $\infty$. It doesn't matter if the DNF'd driver outperforms the competition in the following races. His distance will always be $\infty$ (and don't even get me started with two or more DNF's).

### Our approach
To fix this, we try to find a middle ground. In general, we try to use other drivers' lap times by applying a certain function to the lap times of a specific lap.
- slowest_time: After a driver leaves the race, we assign the slowest lap time from all the other drivers.
- same_position: If our driver was in $n$-th position when he crashed, we will assign the lap time of the driver that is currently in $n$-th position (or the slowest, if there are fewer than $n$ drivers in the race).
- average_time: We assign the average time in each lap.

For example, let's say driver 1 has lap times of 1:01, 1:01 and 1:00, and driver 2 did 1:05, 1:02 and 0:59. Suppose driver 3 was in 2nd place when he crashed in the second lap, after posting a time of 1:03 in the first lap. In the next laps, driver 1 stayed in 1st place and driver 2, in second. According to our methods, the lap times for driver 3 would be:
- slowest_time:  1:03, 1:02, and 1:00.
- same_position: 1:03, 1:02, and 0:59.
- average_time:  1:03, 1:01.5, and 0:59.5.

Naturally, same_position and average_time don't penalize DNF's. However, they are coded already and they might be useful if we refine this penalty system.

In [22]:
def slowest_time(data, race, driver, lap_index):
    """Return the largest ``milliseconds`` value recorded in one lap of a race.

    ``driver`` is not used; it is accepted so that this function has the same
    signature as the other filling methods.
    """
    lap_rows = (data >> mask(X.raceId == race, X.lap == lap_index))
    lap_times_ms = lap_rows.milliseconds.values

    # Slowest means the biggest time
    return (max(lap_times_ms))

def same_position(data, race, driver, lap_index):
    """Return the lap time of whoever holds the retired driver's last position.

    The driver's position on his last recorded lap is looked up, and the
    ``milliseconds`` of the driver holding that position in lap ``lap_index``
    is returned. Rows with position 21 (the marker given to drivers that
    already retired) are ignored when deciding which positions still exist.

    Falls back to ``slowest_time`` when:
    - the driver has no laps in this race,
    - his last position is beyond the last running position of this lap, or
    - nobody holds that position in this lap.

    Parameters
    ----------
    data : lap-time data frame (columns raceId, driverId, lap, position,
        milliseconds).
    race : raceId of the race.
    driver : driverId of the retired driver.
    lap_index : lap for which a time is needed.
    """
    # Finds the position of our driver in his last lap
    driver_df = (data >> mask(X.raceId == race, X.driverId == driver))

    # If the driver didn't finish one lap, there is no position to copy.
    if (driver_df.size == 0):
        return (slowest_time(data, race, driver, lap_index))

    last_lap = max(driver_df.lap.values)
    position = (driver_df >> mask(X.lap == last_lap)).position.values[0]

    # Drivers still running in this lap: drop the 21's (already retired).
    running = (data >> mask(X.raceId == race, X.lap == lap_index, X.position != 21))
    places = running.position.values

    # If our driver's place no longer exists, default to the slowest time.
    if (places.size == 0 or position > max(places)):
        return (slowest_time(data, race, driver, lap_index))

    # Pick the lap time of the driver in our driver's last known position
    position_df = (running >> mask(X.position == position))
    if (position_df.shape[0] == 0):
        return (slowest_time(data, race, driver, lap_index))

    return (position_df.milliseconds.values[0])

def average_time(data, race, driver, lap_index):
    """Return the mean ``milliseconds`` of one lap of a race, rounded to an int.

    ``driver`` is not used: the average is taken over every row of that lap.
    """
    lap_rows = (data >> mask(X.raceId == race, X.lap == lap_index))
    mean_ms = np.mean(lap_rows.milliseconds.values)

    return (int(round(mean_ms)))

This is the generic function for filling in missing laps. This function calls one of the above to find the exact value. At the same time, it populates the database in the same format as the lap_times database, that is, it will add rows with the following columns: raceId, driverId, lap, position, time, milliseconds.

There were 20 drivers in 2018, so we assign the 21st position to a driver after a DNF. If we want to do this for another year, we have to add a line that counts how many drivers participated.

In [21]:
def fill_laps(data, race, driver, method=slowest_time):
    """Add filler rows for the laps a driver did not complete in a race.

    For each missing lap, ``method(data, race, driver, lap)`` supplies the
    lap time in milliseconds. Filler rows copy the driver's last recorded row
    (or, if he has none, a first-lap row of another driver relabelled with
    his driverId) and overwrite lap, position (set to 21), milliseconds and
    time.

    Parameters
    ----------
    data : lap-time data frame (columns raceId, driverId, lap, position,
        time, milliseconds).
    race : raceId of the race to fill.
    driver : driverId of the driver to fill.
    method : function choosing the filler time.

    Returns
    -------
    A new data frame (``data`` itself is not modified) with the filler rows
    added, sorted by raceId, driverId and lap.
    """
    # Length of the race
    race_rows = (data >> mask(X.raceId == race))
    total_laps = max(race_rows.lap)

    # Whatever our driver managed to complete
    own_rows = (data >> mask(X.raceId == race, X.driverId == driver))

    if (own_rows.size != 0):
        # Usual case: repeat from the driver's last recorded lap.
        completed = max(own_rows.lap.values)
        template = (own_rows >> mask(X.lap == completed))
    else:
        # The driver retired before completing a lap, so there is no row of
        # his own to copy. Borrow one first-lap row from another driver.
        completed = 0
        first_lap_rows = (data >> mask(X.raceId == race, X.lap == 1))
        donor = first_lap_rows.driverId.values[0]
        template = (first_lap_rows >> mask(X.driverId == donor)).copy()

        # Relabel the borrowed row as belonging to our driver
        template.driverId = driver
        template.position = 21

    filled = data.copy()
    for lap_index in range(completed + 1, total_laps + 1):
        filler_ms = method(data, race, driver, lap_index)

        extra = template.copy()
        extra.lap = lap_index
        extra.position = 21
        extra.milliseconds = filler_ms
        extra.time = ms_to_minutes(filler_ms)

        filled = pd.concat([filled, extra])

    # Keep a predictable row order so that later analyses are not affected.
    filled = (filled >>
              arrange(X.raceId, X.driverId, X.lap))

    return (filled)

# Driver Rankings
Now that our metric works well on a single race, we will try to get more from it. One thing we can try is to define a drivers ranking. The idea is the following: in a single race, the two drivers that have the largest distance between them are the first and last place. Everyone else is between them. We can detect who is in second place as the driver that is closest to first place; third place is second closest, and so on. The functions below implement this idea.

Finally, sort_matrix takes a square $n \times n$ matrix $M$ and a permutation $\sigma$ of $\{1, \dots, n\}$ and reorders the rows and columns of $M$ according to $\sigma$. For example, if $n=3$ and $\sigma$ is the permutation $(123)$ (ie. $\sigma$ sends 1 to 2, 2 to 3 and 3 to 1), sort_matrix moves the first row and column of $M$ to the second row and column, and so on.

Combining these ideas, we can sort a distance matrix according to our drivers ranking. The fastest driver will be in the first row (and column). The one closest to him will be in the second row, and so on. This is what extractdiam does. extractdiam2 does the opposite: the second to last driver is the one closest to last place, and so on (hopefully these two methods will give the same result, but I'm not 100% certain).

**Note:** To be honest, when we find the drivers that have the greatest distance between them, we can't find out who is faster (distances are symmetric). At this point, we can only guess who should be first place. If we don't get it right, it shouldn't change the end result too much. At worst, the distance matrix will be sorted from slowest to fastest instead of fastest to slowest. We might fix this in the future.

In [None]:
def extractdiam(distmatrix):
    """Rank the columns of a distance matrix by closeness to an extreme driver.

    The reference row ``i`` is the row index of the first occurrence of the
    largest entry of ``distmatrix``. Its distances are sorted in ascending
    order.

    Returns
    -------
    (distancestopt, i, sorteddistancestopt, indices)
        - row ``i`` of ``distmatrix``,
        - the index ``i``,
        - that row sorted from smallest to largest,
        - a list whose k-th item is the ``np.where`` result (a tuple of index
          arrays) locating the k-th sorted value in row ``i``.
    """
    largest = np.amax(distmatrix)
    i = np.where(distmatrix == largest)[0][0]

    reference_row = distmatrix[i, :]
    ascending = np.sort(reference_row)

    n = distmatrix.shape[0]
    indices = [np.where(distmatrix[i, :] == ascending[k]) for k in range(n)]

    return reference_row, i, ascending, indices

def extractdiam2(distmatrix):
    """Rank the columns of a distance matrix by distance from an extreme driver.

    The reference row ``i`` is the *column* index of the first occurrence of
    the largest entry of ``distmatrix``. Its distances are sorted in
    descending order.

    Returns
    -------
    (distancestopt, i, sorteddistancestopt, indices)
        - row ``i`` of ``distmatrix``,
        - the index ``i``,
        - that row sorted from largest to smallest,
        - a list whose k-th item is the ``np.where`` result (a tuple of index
          arrays) locating the k-th sorted value in row ``i``.
    """
    largest = np.amax(distmatrix)
    i = np.where(distmatrix == largest)[1][0]

    reference_row = distmatrix[i, :]
    # Sorting the negated values and negating back gives descending order
    descending = -np.sort(-reference_row)

    n = distmatrix.shape[0]
    indices = [np.where(distmatrix[i, :] == descending[k]) for k in range(n)]

    return reference_row, i, descending, indices

def sort_matrix(matrix, sorted_indices):
    """Return a copy of ``matrix`` with rows and columns reordered.

    Entry (i, j) of the result is
    ``matrix[sorted_indices[i]][sorted_indices[j]]``. ``matrix`` itself is
    only read, never written.
    """
    reordered = matrix.copy()

    for row, source_row in enumerate(sorted_indices):
        for col, source_col in enumerate(sorted_indices):
            reordered[row][col] = matrix[source_row][source_col]

    return (reordered)