# Objective
Build a binary classifier that, given a sequence of lap times, predicts whether a pit stop will happen on the next lap. In other words, I call this project End-of-Stint-or-NOT.

Data Source:
- Ergast Developer API: https://ergast.com/mrd/

## Table of Contents:
* [Data Preparation](#Section1)
    * [Import data](#section_1_1)
    * [Pit Stop Table Transformation](#section_1_2)
    * [Lap Times Table Transformation](#section_1_3)
    * [Left Join New Pit-Stop with New Lap-Times](#section_1_4)
    * [Lap Times Before Pit-Stop Sequence Partitioning](#section_1_5)
    * [TBC](#section_1_6)

## Data Preparation <a class="anchor" id="Section1"></a>

In [None]:
import pandas as pd
import numpy as np

### Import Data <a class="anchor" id="section_1_1"></a>

In [None]:
# Load the Ergast CSV exports from the local data/ directory.

# Per-lap timing and pit-stop records
laps_master = pd.read_csv('data/lap_times.csv')
pits_master = pd.read_csv('data/pit_stops.csv')

# Qualifying and race results
quali_master = pd.read_csv('data/qualifying.csv')
results_master = pd.read_csv('data/results.csv')

# Races, circuits, drivers and constructors
races_master = pd.read_csv('data/races.csv')
circuits_master = pd.read_csv('data/circuits.csv')
drivers_master = pd.read_csv('data/drivers.csv')
constructors_master = pd.read_csv('data/constructors.csv')

In [None]:
# Display the pit-stop table exactly as it was loaded.
pits_master

### Pit Stop Table Transformation <a class="anchor" id="section_1_2"></a>

Create a new data frame that lists, for each driver in each race, the laps on which a pit stop occurred

In [None]:
# One row per (race, driver) holding the list of laps on which that driver pitted.
# Sorting by lap first makes every list ascending regardless of the row order in
# the CSV; the stint partitioning later in this notebook walks the list in order.
pits_df_new = (
    pits_master
    .sort_values(['raceId', 'driverId', 'lap'])
    .groupby(['raceId', 'driverId'])['lap']
    .apply(list)
    .reset_index(name='laps_when_pitstop')
)
pits_df_new

#### Preview the lap times table
Let's take a look at a random race, and random driver, and see how the lap times look .. just to better understand what transformation needs to be done on the data

In [None]:
# Preview all laps of one driver (driverId 17) in one race (raceId 841).
# Both conditions go into a single boolean mask: chaining two [] filters applies
# a full-length mask to an already-filtered frame, which pandas has to reindex
# (and warns about).
laps_master[(laps_master.raceId == 841) & (laps_master.driverId == 17)]

### Lap Times Table Transformation <a class="anchor" id="section_1_3"></a>

Create a new data frame containing a list of all the lap times in one row, for an entire race, for each driver

In [None]:
# One row per (race, driver) holding that driver's lap times for the whole race.
# The rows are sorted by lap before grouping so that position i in each list is
# the time of lap i + 1; the stint slicing later in this notebook indexes the
# list by lap number.
# NOTE(review): assumes lap_times.csv has a 'lap' column, as pit_stops.csv does — confirm.
laps_df_new = (
    laps_master
    .sort_values(['raceId', 'driverId', 'lap'])
    .groupby(['raceId', 'driverId'])['milliseconds']
    .apply(list)
    .reset_index(name='race_lap_times')
)
laps_df_new

### Left Join New Pit-Stop Table with New Lap-Times Table <a class="anchor" id="section_1_4"></a>

In [None]:
# Keep every (race, driver) pair that has pit-stop data and attach its lap-time
# list; with how='left', pairs absent from laps_df_new get NaN in race_lap_times.
merged = pits_df_new.merge(laps_df_new, how='left', on=['raceId', 'driverId'])
merged

### Lap Times Before Pit-Stop Sequence Partitioning <a class="anchor" id="section_1_5"></a>

In [None]:
def partition_lapTime_into_sequences(pitStop_laps, race_lapTimes):
    """Split one driver's race into the stints that ended with a pit stop.

    Parameters
    ----------
    pitStop_laps : list of int
        1-based lap numbers on which the driver pitted (any order).
    race_lapTimes : list
        Lap times of the race; index ``i`` is taken to be lap ``i + 1``.

    Returns
    -------
    list of list, or float
        One lap-time sequence per pit stop.  Each sequence stops on the lap
        before the stop: the lap containing the pit stop is left out.  The
        final stint is not returned because no pit stop follows it.  Stints
        that contain no laps are skipped.  ``np.nan`` is returned when the lap
        times are missing, or when every pit stop happened on lap 1.

    The inputs are never modified.
    """
    # A (race, driver) without lap data arrives as None / float NaN from the
    # left join; there is nothing to slice, so report "no stints".
    if race_lapTimes is None or isinstance(race_lapTimes, float):
        return np.nan

    # Sorted copy: the walk below needs ascending laps and must not touch the
    # caller's list.
    pit_laps = sorted(pitStop_laps)

    # Lap-1 stops are treated as unplanned (the original author attributes them
    # to collisions) and are ignored: drop them by value, drop the first lap
    # time, and shift the remaining pit laps down by one to match.
    if 1 in pit_laps:
        pit_laps = [lap - 1 for lap in pit_laps if lap != 1]
        race_lapTimes = race_lapTimes[1:]

    if not pit_laps:
        return np.nan    # pitted on lap 1 only; no regular stint ended in a stop

    sequences = []
    stint_start = 0      # 0-based index of the first lap of the current stint
    for pit_lap in pit_laps:
        # pit_lap - 1 is the 0-based index of the pit lap itself, so the slice
        # ends on the lap before the stop.
        stint = race_lapTimes[stint_start:pit_lap - 1]
        # Repeated or back-to-back pit laps give an empty slice; skip those
        # instead of emitting a sample with no lap times.
        if len(stint) > 0:
            sequences.append(stint)
        stint_start = pit_lap   # next stint starts on the lap after the stop
    return sequences

### Sequencing Function Test cases

In [None]:
# Run the partitioning function on one row of the merged table and show the result.
sample_row = merged.iloc[13]
sample_input_pits = sample_row.laps_when_pitstop
sample_input_lapTimes = sample_row.race_lap_times

print("input pits: ", sample_input_pits)
print("input laps: ", sample_input_lapTimes)

sample_output = partition_lapTime_into_sequences(sample_input_pits, sample_input_lapTimes)
print("output: ", sample_output)

TODO: write test cases

### Get Lap Times of Final Stint (as a non-pit-stint)

In [None]:
def get_last_stint_lap_times(pitStop_laps, race_lapTimes):
    """Return the lap times driven after the driver's last pit stop.

    This final stint is not followed by a pit stop, so it serves as a
    non-pit-stop stint.

    Parameters
    ----------
    pitStop_laps : list of int
        1-based lap numbers on which the driver pitted (any order).
    race_lapTimes : list
        Lap times of the race; index ``i`` is taken to be lap ``i + 1``.

    Returns
    -------
    list, or float
        The lap times from the lap after the latest pit stop to the end of the
        race.  With no pit stops at all the whole race is returned.  ``np.nan``
        is returned when the lap times are missing.
    """
    # A (race, driver) without lap data arrives as None / float NaN from the
    # left join; there is nothing to slice.
    if race_lapTimes is None or isinstance(race_lapTimes, float):
        return np.nan

    # max() instead of [-1] so the result does not depend on the list order;
    # default=0 makes "never pitted" mean "the whole race is the last stint".
    last_pit = max(pitStop_laps, default=0)
    # Index last_pit is the lap right after the stop (laps are 1-based).
    return race_lapTimes[last_pit:]

### Test get_last_stint_lap_times function 

In [None]:
# Run the last-stint helper on row 13 of the merged table and show the result.
sample_row = merged.iloc[13]
sample_input_pits = sample_row.laps_when_pitstop
sample_input_lapTimes = sample_row.race_lap_times

print("input pits: ", sample_input_pits)
print("input laps: ", sample_input_lapTimes)

sample_output = get_last_stint_lap_times(sample_input_pits, sample_input_lapTimes)
print("output: ", sample_output)

In [None]:
# Same check on a second row (row 1) of the merged table.
sample_row = merged.iloc[1]
sample_input_pits = sample_row.laps_when_pitstop
sample_input_lapTimes = sample_row.race_lap_times

print("input pits: ", sample_input_pits)
print("input laps: ", sample_input_lapTimes)

sample_output = get_last_stint_lap_times(sample_input_pits, sample_input_lapTimes)
print("output: ", sample_output)

### Apply the sequence partitioning function to the merged data set

In [None]:
# Row by row, derive the stints that ended in a pit stop and the final stint.
merged['stints'] = merged.apply(
    lambda row: partition_lapTime_into_sequences(row['laps_when_pitstop'], row['race_lap_times']),
    axis=1,
)
merged['last_stint'] = merged.apply(
    lambda row: get_last_stint_lap_times(row['laps_when_pitstop'], row['race_lap_times']),
    axis=1,
)
merged

Check if there are any missing stints

In [None]:
# Number of missing values in each column.
merged.isna().sum()

There are some missing values based on the sequence partitioning transformation that we have just applied. Let's see where they are.

In [None]:
# Rows that have a missing value in at least one column.
merged.loc[merged.isna().any(axis='columns')]

As I expected, all of these cases are races in which there was only one pit stop, on lap 1. For the scope of this end-of-stint classifier we can safely remove these cases, as they do not affect the task at hand.

In [None]:
# Discard every row that has a missing value in any column.
merged = merged.dropna(axis='index', how='any')
merged

In [None]:
# Stints that ended with a pit stop: one list of stints per (race, driver) row.
end_of_stint_sequences = merged['stints']
# Positional access: after dropna() the index keeps its original labels, so the
# label 0 is not guaranteed to exist any more.
end_of_stint_sequences.iloc[0]

In [None]:
# Final stint of each (race, driver) row; these are not followed by a pit stop.
last_stint_sequences = merged['last_stint']
# Positional access: after dropna() the index keeps its original labels, so the
# label 0 is not guaranteed to exist any more.
last_stint_sequences.iloc[0]

We need to flatten the structure of the data. We need a list of lists, not a Pandas Series of lists of lists

In [None]:
# Leave pandas: one entry per row, each entry a list of stints, each stint a
# list of lap times.
temp = end_of_stint_sequences.tolist()
print("Before:", temp[:3], end="\n\n")

In [None]:
# Remove one nesting level: the per-row lists of stints become one flat list of
# stints (a list of lists of lap times).
end_of_stint_sequences = [stint for row_stints in temp for stint in row_stints]

print("After:", end_of_stint_sequences[:3])
print("Sample Size = ", len(end_of_stint_sequences))

### Generate not end of stint sequences --- this method did not work

My logic here is the following: Don't generate random laptimes, nor stints with random length.

What I propose is: remove the last n laps from a real stint, and label it as a 'not-end-of-stint' kind of a sequence.

The parameter n needs to be experimented with: we need to figure out what kind of experiment setup works best for our binary classifier. 

   - Initially what I am thinking is that I will remove the last lap, and create some fake samples... then remove the last 2 laps, and the last 4 laps, and create samples out of those too.
   - What I want to make sure is to not create a very unbalanced data set. What I am aiming for is 20-25% end-of-stint data, with 75-80% not-end-of stint data to comprise my data set which I will use to train my binary classifier.


In [None]:
def remove_lastN_elements(arr, N):
    """Return a copy of ``arr`` without its last ``N`` elements.

    ``N = 0`` returns the whole sequence: a plain ``arr[:-0]`` is ``arr[:0]``
    and would return an empty result instead.  When ``N`` is at least
    ``len(arr)`` the result is empty.
    """
    return arr[:-N] if N else arr[:]

In [None]:
# Show the first stint, then the same stint with its last two laps removed.
first_stint = end_of_stint_sequences[0]
print(first_stint, end="\n\n")
print(remove_lastN_elements(first_stint, 2))

In [None]:
NOT_end_of_stint_sequences = []

# N needs to be experimented with. Initially I chose N=1, N=2, and N=4.
# Every real stint contributes three truncated copies, in that order: without
# its last lap, without its last 2 laps, and without its last 4 laps.
for lst in end_of_stint_sequences:
    NOT_end_of_stint_sequences.extend(
        remove_lastN_elements(lst, N=laps_removed) for laps_removed in (1, 2, 4)
    )

In [None]:
# Print the number of end-of-stint samples; the matching count for the
# not-end-of-stint samples is left commented out.
#print(len(NOT_end_of_stint_sequences))
print(len(end_of_stint_sequences))

RESULT = 3:1 ratio between not-end-of-stint and end-of-stint data

Let's create the labels:

In [None]:
# Label 1 = the sequence is followed by a pit stop, label 0 = it is not.
end_of_stint_labels = [1 for _ in end_of_stint_sequences]
NOT_end_of_stint_labels = [0 for _ in NOT_end_of_stint_sequences]

### Get NOT-end-of-stint sequences & Create final data set

In [None]:
# The negative class is now the real final stints (never followed by a pit
# stop); this overwrites the truncated-stint samples built earlier.
NOT_end_of_stint_sequences = last_stint_sequences.tolist()

# Label 0 = not followed by a pit stop, label 1 = followed by a pit stop.
NOT_end_of_stint_labels = [0 for _ in NOT_end_of_stint_sequences]
end_of_stint_labels = [1 for _ in end_of_stint_sequences]

print("Labels:")
for group in (NOT_end_of_stint_labels, end_of_stint_labels):
    print(len(group))

print("\nSequences:")
for group in (NOT_end_of_stint_sequences, end_of_stint_sequences):
    print(len(group))

In [None]:
# Full data set: positives first, negatives second, with labels in the same order.
stint_sequences = [*end_of_stint_sequences, *NOT_end_of_stint_sequences]
stint_labels = [*end_of_stint_labels, *NOT_end_of_stint_labels]

print(len(stint_sequences))
print(len(stint_labels))

## Binary Classifier

I view this task as a Sequence Classification task, where deep learning approaches have been widely used in practice for similar tasks such as: 

 - DNA Sequence Classification: Given a DNA sequence of ACGT values, predict whether the sequence codes for a coding or non-coding region.
 - Anomaly Detection: Given a sequence of observations, predict whether the sequence is anomalous or not.
 - Sentiment Analysis: Given a sequence of text such as a review or a tweet, predict whether sentiment of the text is positive or negative.
 
Reference: https://machinelearningmastery.com/sequence-prediction/
 
I have done some research on the problem, and the most common approach seems to be LSTM (Long Short-Term Memory) recurrent neural networks. In the upcoming subsections I will test various LSTM architectures, and maybe even non-LSTM recurrent neural networks, to see some results. I will then evaluate whether we need some other binary classifier, more or better data, or techniques for working with imbalanced data (undersampling, oversampling).

### Split data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. stratify keeps the proportion of
# 0/1 labels the same in both parts, so a skewed class ratio cannot end up
# even more skewed in the smaller test set.
X_train, X_test, y_train, y_test = train_test_split(
    stint_sequences,
    stint_labels,
    test_size=0.20,
    random_state=7,
    stratify=stint_labels,
)

print("Train set:", len(X_train))
print("Test set:", len(X_test))
print("Train labels:", len(y_train))
print("Test labels:", len(y_test))

### Pad Input Sequences 

In [None]:
# Sequence length that every stint is padded or truncated to further down.
# NOTE(review): the value 30 is hard-coded; the commented-out lines compute the
# longest end-of-stint sequence instead — confirm which one is intended.
# find out what's the longest stint in our data set
#max_stint_length = max(map(len, end_of_stint_sequences))
#print("Max stint-length =", max_stint_length)

max_stint_length = 30 

In [None]:
from keras.preprocessing import sequence

# Bring every stint to exactly max_stint_length values: shorter stints are
# padded at the front, longer ones lose their earliest laps.
pad_options = {"maxlen": max_stint_length, "padding": "pre", "truncating": 'pre'}

X_train = sequence.pad_sequences(X_train, **pad_options)
X_test = sequence.pad_sequences(X_test, **pad_options)

Wrap every list into numpy arrays, so Keras can process the input

In [None]:
# Features get a trailing axis of size 1, i.e. (samples, timesteps, 1), which
# matches the input_shape=(max_stint_length, 1) used by the models below.
# Assumes the padded inputs are 2-D (samples, timesteps).
X_train = np.expand_dims(np.array(X_train), axis=-1)
X_test = np.expand_dims(np.array(X_test), axis=-1)

# Labels become column vectors of shape (samples, 1).
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

testing the ratio of 0 to 1 in the train and test set

In [None]:
# Class balance of the test labels: the distinct label values and how often
# each one occurs.
unique, frequency = np.unique(y_test, return_counts=True)

print("Unique Values:", unique)
print("Frequency Values:", frequency)

In [None]:
# Class balance of the training labels: the distinct label values and how
# often each one occurs.
unique, frequency = np.unique(y_train, return_counts=True)

print("Unique Values:", unique)
print("Frequency Values:", frequency)

### Approach 1: Simple LSTM for Sequence Classification

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# fix random seed for reproducibility
# NOTE(review): this seeds NumPy's global generator only; whether the Keras
# backend's weight initialisation is covered depends on the backend — confirm.
np.random.seed(7)

In [None]:
# Baseline: a single LSTM layer reads the padded stint and one sigmoid unit
# outputs the probability that the stint ends with a pit stop.
model = Sequential()
model.add(LSTM(100, input_shape=(max_stint_length, 1)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)

In [None]:
# Two stacked LSTM layers: the first hands its full output sequence to the
# second, which reduces it to one vector for the sigmoid output unit.
model = Sequential()
model.add(LSTM(64, input_shape=(max_stint_length, 1), return_sequences=True))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)

### Approach 2: Time Distributed LSTM

In [None]:
from keras.models import Sequential
from keras.layers import Dense, TimeDistributed, LSTM, Dropout

# fix random seed for reproducibility
# NOTE(review): this seeds NumPy's global generator only; whether the Keras
# backend's weight initialisation is covered depends on the backend — confirm.
np.random.seed(7)

In [None]:
# Four stacked LSTM layers that all return their full output sequence,
# followed by a sigmoid unit applied at every timestep.
# NOTE(review): with return_sequences=True on the last LSTM and a
# TimeDistributed output layer, the model emits one value per timestep, while
# y_train / y_test were reshaped to one label per sample earlier in this
# notebook — confirm the intended target shape before relying on these results.
model = Sequential()
model.add(LSTM(512, input_shape=(max_stint_length, 1), return_sequences=True))
#model.add(Dropout(0.3))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(TimeDistributed(Dense(1, activation='sigmoid')))


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

### Approach 3: Bidirectional LSTMs

In [None]:
from keras.layers import Bidirectional

# define LSTM model
model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(max_stint_length, 1)))
#model.add(Dropout(0.2))
# The last recurrent layer returns only its final output (no
# return_sequences), so the dense head produces one prediction per stint.
# With return_sequences=True here the model would emit one prediction per
# timestep, which does not match the one-label-per-sample targets.
model.add(Bidirectional(LSTM(64)))
#model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

### Approach 4: LSTM and CNNs combined

In [None]:
# Imported from keras.layers: the keras.layers.convolutional submodule is not
# importable in newer Keras releases, while this path works in old and new ones.
from keras.layers import Conv1D, MaxPooling1D

# Two Conv1D + max-pooling stages extract local patterns and shorten the
# sequence; a bidirectional LSTM then summarises it for the dense head.
model = Sequential()
model.add(Conv1D(filters=256, kernel_size=3, padding='same', activation='relu', input_shape=(max_stint_length, 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
# No return_sequences on the recurrent layer: it returns only its final
# output, so the model produces one prediction per stint, matching the
# one-label-per-sample targets.
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)