In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

## Importing the final files from the data engineering side.

In [51]:
data = pd.read_csv('./data/Final_2.csv')

In [52]:
data.drop(columns=['Unnamed: 0'],inplace=True)

In [59]:
data.shape

(472504, 12)

## Creating a new column which would have a different time then the lap time shown for rows where cars had pitstops. Without this column being updated, any modeling would just choose rows with a large amound of time to complete a lap as laps which are pit stops. The "new" variable creates a list of new lap times for the laps that have a pit stop. It takes the previous lap and adds a second to it and uses that as the new value for that lap. 

In [117]:
new=[data[(data.raceId == data.loc[i].raceId) & 
     (data.driverId == data.loc[i].driverId) & 
     (data.lt_lap == (data.loc[i].lt_lap-1))].milliseconds.values[0]+1000
      if ((data.loc[i].target) and (data.loc[i].lt_lap != 1)) else data.loc[i].milliseconds for i in range(len(data))]

## Creates the new column

In [118]:
data['up_milli'] = new

## To extract only the year for that column

In [129]:
data.date = [x[:4] for x in data.date]

## Exporting the data to the final csv

In [6]:
data.to_csv("./data/For_models.csv")

In [2]:
data = pd.read_csv("./data/For_models.csv")

In [107]:
# Number of negative class observations
n_false = data[data['target'] == False].shape[0]
n_false

465068

In [109]:
data_false = data[data['target'] == False]
data_false.head()

Unnamed: 0.1,Unnamed: 0,raceId,driverId,lt_lap,position,milliseconds,circuitId,date,ps_lap,pit_time,target,up_milli
360,360,841,20,1,1,98109,1,2011,,,False,98109
361,361,841,20,2,1,93006,1,2011,,,False,93006
362,362,841,20,3,1,92713,1,2011,,,False,92713
363,363,841,20,4,1,92803,1,2011,,,False,92803
364,364,841,20,5,1,92342,1,2011,,,False,92342


In [103]:
# Number of positive class observations
data[data['target'] == True].shape[0]

7436

In [112]:
data_true = data[data['target'] == True]
data_true.head()

Unnamed: 0.1,Unnamed: 0,raceId,driverId,lt_lap,position,milliseconds,circuitId,date,ps_lap,pit_time,target,up_milli
0,0,841,153,1,21,133879,1,2011,1.0,26898.0,True,133879
1,1,841,30,1,22,161428,1,2011,1.0,25021.0,True,161428
2,2,841,17,11,7,114995,1,2011,11.0,23426.0,True,95248
3,3,841,4,12,12,112624,1,2011,12.0,23251.0,True,95857
4,4,841,13,13,11,114060,1,2011,13.0,23842.0,True,95662


In [113]:
# Bootstrap data_true so that we have a 50/50 baseline
data_true = data_true.sample(n_false, replace = True)
data_true

Unnamed: 0.1,Unnamed: 0,raceId,driverId,lt_lap,position,milliseconds,circuitId,date,ps_lap,pit_time,target,up_milli
38725,38725,874,5,41,16,102321,22,2012,41.0,22678.0,True,101356
54023,54023,884,8,45,2,92851,4,2013,45.0,20184.0,True,90901
439633,439633,1007,844,13,7,88335,32,2018,13.0,22588.0,True,86636
470347,470347,1030,840,5,15,108982,24,2019,5.0,28842.0,True,107776
23453,23453,864,817,39,15,96177,4,2012,39.0,20059.0,True,92966
...,...,...,...,...,...,...,...,...,...,...,...,...
408760,408760,976,822,11,18,133418,73,2017,11.0,20407.0,True,109350
447321,447321,1013,20,11,5,114804,73,2019,11.0,19981.0,True,110490
92638,92638,929,154,31,7,105040,3,2015,31.0,25259.0,True,103275
84854,84854,911,820,29,18,123046,13,2014,29.0,24873.0,True,119526


In [117]:
split_=pd.concat([data_true, data_false])

In [125]:
split_.columns

Index(['Unnamed: 0', 'raceId', 'driverId', 'lt_lap', 'position',
       'milliseconds', 'circuitId', 'date', 'ps_lap', 'pit_time', 'target',
       'up_milli'],
      dtype='object')

In [129]:
split_ = split_.sort_values(['circuitId', 'raceId', 'lt_lap'])
split_.reset_index(drop = True, inplace = True)
split_

Unnamed: 0.1,Unnamed: 0,raceId,driverId,lt_lap,position,milliseconds,circuitId,date,ps_lap,pit_time,target,up_milli
0,342881,1,1,1,13,109088,1,2009,,,False,109088
1,342939,1,13,1,3,100201,1,2009,,,False,100201
2,342984,1,8,1,5,101712,1,2009,,,False,101712
3,343039,1,9,1,4,101250,1,2009,,,False,101250
4,343094,1,2,1,17,157754,1,2009,,,False,157754
...,...,...,...,...,...,...,...,...,...,...,...,...
930131,452366,1013,830,51,4,106921,73,2019,,,False,106921
930132,452415,1013,846,51,8,107801,73,2019,,,False,107801
930133,452497,1013,832,51,7,107246,73,2019,,,False,107246
930134,452576,1013,844,51,5,106503,73,2019,,,False,106503


In [7]:
features = ['lt_lap', 'position', 'milliseconds', 'circuitId', 'date']

In [10]:
X= data[features]
y= data['target']

## The data set is extemely unbalanced but not much can be changed as every race has an average of 2 pits out of 60-70 laps there will never be a balanced amount of data for it. 

In [225]:
y.value_counts(normalize=True)

False    0.984263
True     0.015737
Name: target, dtype: float64

In [223]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

## Base model logistic regression

In [13]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
logreg.score(X_test, y_test)

0.984120774464974

## Features that include the updated time to have one second added.

In [3]:
features1 = ['lt_lap', 'position', 'up_milli', 'circuitId', 'date']
X= data[features1]
y= data['target']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

In [192]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [193]:
logreg.score(X_test, y_test)

0.9841400142374316

R**2

In [194]:
pred = logreg.predict(X_test)

In [195]:
senistivity_specificity(y_test,pred)

Sensitivity: 0.0
Specificity: 1.0


In [4]:
dt1 = DecisionTreeClassifier(random_state = 42)

In [14]:
dt1.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [15]:
dt1.score(X_test, y_test)

0.978021766595907

In [35]:
out = dt1.predict(X_test)

In [38]:
out[:10]

array([False, False, False, False, False, False, False,  True, False,
       False])

In [40]:
out_=pd.Series(list(out))

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
8    False
9    False
dtype: bool

In [47]:
out_.shape

(155927,)

In [48]:
X_test.shape

(155927, 5)

In [55]:
X_test.reset_index(drop=True, inplace=True)
out_.reset_index(drop=True, inplace=True)

df = pd.concat( [X_test, out_], axis=1)

In [58]:
df.to_csv("./data/pred__.csv")


In [200]:
senistivity_specificity(y_test,out)

Sensitivity: 0.3085
Specificity: 0.9888


In [206]:
sum(list(out))

2480

In [None]:
[]

In [207]:
grid = GridSearchCV(estimator = DecisionTreeClassifier(),
                    param_grid = {'max_depth': [10,15,20,25],
                                  'min_samples_split': [20,25,30,35],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1)

In [208]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:  4.9min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [10, 15, 20, 25],
                      

In [209]:
grid.best_score_

0.9846261728549388

In [210]:
dt = grid.best_estimator_

In [87]:
def senistivity_specificity(y_t, predI):
    tn, fp, fn, tp = confusion_matrix(y_t,
                                  predI).ravel()
    sens = tp / (tp + fn)
    print(f'Sensitivity: {round(sens, 4)}')
    spec = tn / (tn + fp)
    print(f'Specificity: {round(spec, 4)}')

In [213]:
preds = dt.predict(X_test)

In [227]:
rf = RandomForestClassifier()

In [228]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [229]:
preds = rf.predict(X_test)

In [230]:
senistivity_specificity(y_test,preds)

Sensitivity: 0.1084
Specificity: 0.9995


In [232]:
et = ExtraTreesClassifier()

In [233]:
et.fit(X_train,y_train)
preds=rf.predict(X_test)

In [234]:
senistivity_specificity(y_test,preds)

Sensitivity: 0.1084
Specificity: 0.9995


In [5]:
from tensorflow.keras.metrics import SpecificityAtSensitivity
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GRU
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler

In [130]:
features1 = ['lt_lap', 'position', 'up_milli', 'circuitId', 'date']
X= split_[features1]
y= split_['target'].astype('int')

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

In [120]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [9]:
1-y.mean()

0.9842625670893791

In [121]:
model = Sequential()
model.add(Dense(5, input_dim=5, activation='relu'))
model.add(Dense(3, activation='relu'))
model.add(Dense(2, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [122]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [123]:
model.fit(
    X_train_sc,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=2000,
    verbose=2
)

Train on 623191 samples, validate on 306945 samples
Epoch 1/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.5001 - val_loss: 0.6931 - val_acc: 0.4994
Epoch 2/100
623191/623191 - 1s - loss: 0.6931 - acc: 0.4995 - val_loss: 0.6931 - val_acc: 0.5006
Epoch 3/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.4998 - val_loss: 0.6931 - val_acc: 0.5006
Epoch 4/100
623191/623191 - 1s - loss: 0.6931 - acc: 0.4993 - val_loss: 0.6931 - val_acc: 0.5006
Epoch 5/100
623191/623191 - 1s - loss: 0.6931 - acc: 0.5005 - val_loss: 0.6932 - val_acc: 0.4994
Epoch 6/100
623191/623191 - 1s - loss: 0.6931 - acc: 0.5005 - val_loss: 0.6932 - val_acc: 0.4994
Epoch 7/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.4995 - val_loss: 0.6932 - val_acc: 0.4994
Epoch 8/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.5001 - val_loss: 0.6931 - val_acc: 0.5006
Epoch 9/100
623191/623191 - 1s - loss: 0.6931 - acc: 0.5004 - val_loss: 0.6932 - val_acc: 0.4994
Epoch 10/100
623191/623191 - 1s - loss: 0.6931 - acc: 0.5002 - val_loss: 0.

Epoch 85/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.4990 - val_loss: 0.6932 - val_acc: 0.4994
Epoch 86/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.4999 - val_loss: 0.6932 - val_acc: 0.4994
Epoch 87/100
623191/623191 - 1s - loss: 0.6931 - acc: 0.5004 - val_loss: 0.6931 - val_acc: 0.5006
Epoch 88/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.4998 - val_loss: 0.6931 - val_acc: 0.5006
Epoch 89/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.5005 - val_loss: 0.6931 - val_acc: 0.5006
Epoch 90/100
623191/623191 - 1s - loss: 0.6931 - acc: 0.4999 - val_loss: 0.6931 - val_acc: 0.4994
Epoch 91/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.4985 - val_loss: 0.6931 - val_acc: 0.4994
Epoch 92/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.4994 - val_loss: 0.6931 - val_acc: 0.5006
Epoch 93/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.5001 - val_loss: 0.6931 - val_acc: 0.5006
Epoch 94/100
623191/623191 - 1s - loss: 0.6932 - acc: 0.5003 - val_loss: 0.6931 - val_acc: 0.4994
Epoch 95/100
623191/

<tensorflow.python.keras.callbacks.History at 0x7f60c56a6ad0>

In [93]:
pred_1=model.predict(X_test_sc)

In [94]:
pred_1

array([[7.5022685e-03],
       [4.4092657e-36],
       [1.1617148e-02],
       ...,
       [7.5749308e-02],
       [0.0000000e+00],
       [2.4362797e-23]], dtype=float32)

In [95]:
pred_m = [pred_1[i][0] for i in range(len(pred_1))]

In [96]:
final_pred = [False if i < .0599 else True for i in pred_m]

In [97]:
senistivity_specificity(y_test,final_pred)

Sensitivity: 0.2952
Specificity: 0.9347


In [86]:
sum(final_pred)

2393

In [34]:
output = model.predict(X_test_sc)

In [184]:
pred_ = [pred2[i][0] for i in range(len(pred2))]

## Only one value was being returned which meant that the model could not seem to predict an outcome of true. 

In [185]:
set(pred_)

{0.5}

In [106]:
sum(pred1)

105394

In [107]:
sum(y_test)

2473

In [64]:
senistivity_specificity(y_test,pred1_)

Sensitivity: 1.0
Specificity: 0.3293


## RNN

In [162]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [171]:
features1 = ['lt_lap', 'position', 'up_milli', 'circuitId', 'date']
X= split_[features1]
y= split_['target'].astype('int')

In [172]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = False)

In [173]:
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [174]:
train_sequences = TimeseriesGenerator(Z_train, y_train, length =  10, batch_size = 512)

In [175]:
train_sequences[0][0].shape

(512, 10, 5)

In [176]:
test_sequences = TimeseriesGenerator(Z_test, y_test, length = 10, batch_size=512)

In [177]:
# creating the topology

rnn = Sequential()

rnn.add(GRU(train_sequences[0][0].shape[2],
          input_shape = (train_sequences[0][0].shape[1], train_sequences[0][0].shape[2]),
         return_sequences = True))

rnn.add(GRU(train_sequences[0][0].shape[2], return_sequences = False))

rnn.add(Dense(128, activation = 'relu'))

rnn.add(Dense(2, activation = 'sigmoid'))

rnn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [178]:
# Not working
hist = rnn.fit_generator(train_sequences, epochs = 3, validation_data=test_sequences)

  ...
    to  
  ['...']


KeyError: 10