In [4]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

## Importing the final files from the data engineering side.

In [51]:
data = pd.read_csv('./data/Final_2.csv')

In [52]:
data.drop(columns=['Unnamed: 0'],inplace=True)

In [53]:
data.head()

Unnamed: 0,raceId,driverId,lt_lap,position,milliseconds,circuitId,date,ps_lap,pit_time,target
0,841,153,1,21,133879,1,2011-03-27,1.0,26898.0,True
1,841,30,1,22,161428,1,2011-03-27,1.0,25021.0,True
2,841,17,11,7,114995,1,2011-03-27,11.0,23426.0,True
3,841,4,12,12,112624,1,2011-03-27,12.0,23251.0,True
4,841,13,13,11,114060,1,2011-03-27,13.0,23842.0,True


## Creating a new column which would have a different time then the lap time shown for rows where cars had pitstops. Without this column being updated, any modeling would just choose rows with a large amound of time to complete a lap as laps which are pit stops. The "new" variable creates a list of new lap times for the laps that have a pit stop. It takes the previous lap and adds a second to it and uses that as the new value for that lap. 

In [117]:
new=[data[(data.raceId == data.loc[i].raceId) & 
     (data.driverId == data.loc[i].driverId) & 
     (data.lt_lap == (data.loc[i].lt_lap-1))].milliseconds.values[0]+1000
      if ((data.loc[i].target) and (data.loc[i].lt_lap != 1)) else data.loc[i].milliseconds for i in range(len(data))]

## Creates the new column

In [118]:
data['up_milli'] = new

## To extract only the year for that column

In [129]:
data.date = [x[:4] for x in data.date]

## Exporting the data to the final csv

In [2]:
data.to_csv("./data/For_models.csv")

In [5]:
data = pd.read_csv("./data/For_models.csv")

In [200]:
features = ['lt_lap', 'position', 'milliseconds', 'circuitId', 'date']

In [222]:
X= data[features1]
y= data['target']

## The data set is extemely unbalanced but not much can be changed as every race has an average of 2 pits out of 60-70 laps there will never be a balanced amount of data for it. 

In [225]:
y.value_counts(normalize=True)

False    0.984263
True     0.015737
Name: target, dtype: float64

In [223]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

## Base model logistic regression

In [13]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
logreg.score(X_test, y_test)

0.984120774464974

## Features that include the updated time to have one second added.

In [190]:
features1 = ['lt_lap', 'position', 'up_milli', 'circuitId', 'date']
X= data[features1]
y= data['target']

In [191]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

In [192]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [193]:
logreg.score(X_test, y_test)

0.9841400142374316

R**2

In [194]:
pred = logreg.predict(X_test)

In [195]:
senistivity_specificity(y_test,pred)

Sensitivity: 0.0
Specificity: 1.0


In [196]:
dt1 = DecisionTreeClassifier(random_state = 42)

In [197]:
dt1.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [198]:
dt1.score(X_test, y_test)

0.978021766595907

In [199]:
out = dt1.predict(X_test)

In [200]:
senistivity_specificity(y_test,out)

Sensitivity: 0.3085
Specificity: 0.9888


In [206]:
sum(list(out))

2480

In [None]:
[]

In [207]:
grid = GridSearchCV(estimator = DecisionTreeClassifier(),
                    param_grid = {'max_depth': [10,15,20,25],
                                  'min_samples_split': [20,25,30,35],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1)

In [208]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:  4.9min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [10, 15, 20, 25],
                      

In [209]:
grid.best_score_

0.9846261728549388

In [210]:
dt = grid.best_estimator_

In [57]:
def senistivity_specificity(y_t, predI):
    tn, fp, fn, tp = confusion_matrix(y_t,
                                  predI).ravel()
    sens = tp / (tp + fn)
    print(f'Sensitivity: {round(sens, 4)}')
    spec = tn / (tn + fp)
    print(f'Specificity: {round(spec, 4)}')

In [213]:
preds = dt.predict(X_test)

In [227]:
rf = RandomForestClassifier()

In [228]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [229]:
preds = rf.predict(X_test)

In [230]:
senistivity_specificity(y_test,preds)

Sensitivity: 0.1084
Specificity: 0.9995


In [232]:
et = ExtraTreesClassifier()

In [233]:
et.fit(X_train,y_train)
preds=rf.predict(X_test)

In [234]:
senistivity_specificity(y_test,preds)

Sensitivity: 0.1084
Specificity: 0.9995


In [115]:
from tensorflow.keras.metrics import SpecificityAtSensitivity
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GRU
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler

In [186]:
features1 = ['lt_lap', 'position', 'up_milli', 'circuitId', 'date']
X= data[features1]
y= data['target'].astype('int')

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

In [188]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [171]:
1-y.mean()

0.9842625670893791

In [172]:
model = Sequential()
model.add(Dense(5, input_dim=5, activation='relu'))
model.add(Dense(3, input_dim=5, activation='relu'))
model.add(Dense(2, input_dim=5, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [173]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [174]:
history.model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=1500,
    verbose=2
)

Train on 316577 samples, validate on 155927 samples
Epoch 1/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 2/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 3/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 4/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 5/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 6/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 7/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 8/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 9/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 10/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.

Epoch 85/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 86/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 87/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 88/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 89/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 90/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 91/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 92/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 93/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 94/100
316577/316577 - 1s - loss: 0.0807 - acc: 0.9843 - val_loss: 0.0815 - val_acc: 0.9841
Epoch 95/100
316577/

<tensorflow.python.keras.callbacks.History at 0x7f1f2f494d50>

In [175]:
pred2=model.predict(X_test)

In [181]:
set(pred_)

{False}

In [34]:
output = model.predict(X_test_sc)

In [62]:
pred = history.model.predict(X_test_sc)

In [184]:
pred_ = [pred2[i][0] for i in range(len(pred2))]

## Only one value was being returned which meant that the model could not seem to predict an outcome of true. 

In [185]:
set(pred_)

{0.5}

In [106]:
sum(pred1)

105394

In [107]:
sum(y_test)

2473

In [64]:
senistivity_specificity(y_test,pred1)

Sensitivity: 1.0
Specificity: 0.3293
