# Create Lap Feature DataFrame and Model

In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import scripts.f1_scripts  as f1

import statsmodels.api as sms
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor




In [2]:
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

def assign_color(tire):
    colors = {'Ultra': 'm', 'Super': 'r', 'Soft': 'y', 'Medium': 'w', 'Hard': 'k', 'Intermediate': 'g'}
    return colors[tire]

def assign_ordinal(tire):
    vals = {'Ultra': 1, 'Super': 2, 'Soft': 3, 'Medium': 4, 'Hard': 5, 'Intermediate': 6}
    return vals[tire]



In [3]:
# Load driver list as GLOBAL variable
DRIVER_LIST = pd.read_csv('data/drivers.csv')

In [4]:
def create_race_features(filename):
    year, race_num, track = filename.split('_')
    # Load lap times for all drivers
    lap_data = pd.read_csv('data/lap_history/{filename}_lap_history.csv'.format(filename=filename), header=None)
    lap_times = f1.assign_lap(lap_data)
    lap_times['TIME'] = f1.convert_time(lap_times['TIME'])
    lap_times.sort_values(by=['NO', 'LAP'], inplace=True)
    lap_times['LAP'] = lap_times['LAP'].astype(int)

    # Load Tire strategy data
    tire_data = pd.read_csv('data/tire_strategy/{filename}.csv'.format(filename=filename))
    tire_strat = f1.get_tires(tire_data)

    # Join Driver, Name, No. to tire data and sort by No.
    tire_strat = pd.merge(DRIVER_LIST, tire_strat, on='NAME')
    tire_strat.drop(['NAME', 'DRIVER'], axis=1, inplace=True)

    # Append tire data to lap data
    mask = tire_strat.iloc[:,1:].notnull().values
    lap_times['TIRE'] = tire_strat[tire_strat.columns[1:]].values[mask].flatten()
    lap_times['TRACK'] = track
    lap_times['YEAR'] = year
    lap_times['RACE'] = race_num
    lap_times['GAP'] = lap_times['GAP'].apply(lambda x: x.strip())

    return lap_times

def assign_stint_lap(df):
    df['STINT_LAP'] = 1
    idx = df[df['GAP'] == 'PIT'].index
    start = 0
    for val in idx:
        df['STINT_LAP'].ix[start:val] = df.ix[start:val]['STINT_LAP'].cumsum()
        start = val + 1
    end = df.index[-1]
    df['STINT_LAP'].ix[start:end] = df.ix[start:end]['STINT_LAP'].cumsum()
    return df

In [5]:
# Create a list of the available races to date that we can use for training
races = os.listdir('data/fia')
races = races[1:]
races;

In [6]:
# Create DataFrame of all laps for all drivers in all races
list_of_times = []
for race in races:
    lap_times = create_race_features(race)
    list_of_times.append(lap_times)
    print '{} complete.'.format(race)
all_lap_times = pd.concat(list_of_times)
all_lap_times['YEAR'] = all_lap_times['YEAR'].astype(int)
all_lap_times.reset_index(inplace=True, drop=True)



2015_10_hungary complete.
2015_11_belgium complete.
2015_12_italy complete.
2015_13_singapore complete.
2015_14_japan complete.
2015_15_russia complete.
2015_16_usa complete.
2015_17_mexico complete.
2015_18_brazil complete.
2015_19_abudhabi complete.
2015_1_australia complete.
2015_2_malaysia complete.
2015_3_china complete.
2015_4_bahrain complete.
2015_5_spain complete.
2015_6_monaco complete.
2015_7_canada complete.
2015_8_austria complete.
2015_9_britain complete.
2016_1_australia complete.
2016_2_bahrain complete.
2016_3_china complete.


In [7]:
all_lap_times[all_lap_times['LAP'] == 1].shape

(423, 8)

In [8]:
all_lap_times['SAFETY'] = all_lap_times.apply(lambda x: f1.assign_safety(x['TRACK'], x['YEAR'], x['LAP']), axis=1)


In [9]:
all_lap_times[all_lap_times['SAFETY'] == True].shape

(1362, 9)

## Add laps since tire change

In [10]:
all_lap_times.head()

Unnamed: 0,NO,GAP,TIME,LAP,TIRE,TRACK,YEAR,RACE,SAFETY
0,3,6.325,99.193,1,Soft,hungary,2015,10,0
1,3,8.887,91.382,2,Soft,hungary,2015,10,0
2,3,10.556,90.333,3,Soft,hungary,2015,10,0
3,3,12.451,89.962,4,Soft,hungary,2015,10,0
4,3,13.664,89.525,5,Soft,hungary,2015,10,0


In [11]:
all_lap_times = assign_stint_lap(all_lap_times)

        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [12]:
all_lap_times

Unnamed: 0,NO,GAP,TIME,LAP,TIRE,TRACK,YEAR,RACE,SAFETY,STINT_LAP
0,3,6.325,99.193,1,Soft,hungary,2015,10,0,1
1,3,8.887,91.382,2,Soft,hungary,2015,10,0,2
2,3,10.556,90.333,3,Soft,hungary,2015,10,0,3
3,3,12.451,89.962,4,Soft,hungary,2015,10,0,4
4,3,13.664,89.525,5,Soft,hungary,2015,10,0,5
5,3,15.351,89.663,6,Soft,hungary,2015,10,0,6
6,3,16.816,89.597,7,Soft,hungary,2015,10,0,7
7,3,17.971,89.516,8,Soft,hungary,2015,10,0,8
8,3,19.129,89.429,9,Soft,hungary,2015,10,0,9
9,3,19.620,88.830,10,Soft,hungary,2015,10,0,10


### Commented out cells used to debug loading the data

In [None]:
# # Load driver list for particular season
# driver_list = pd.read_csv('data/drivers.csv')

# # Load lap times for all drivers
# filename = '2016_3_china'
# lap_data = pd.read_csv('data/lap_history/{filename}_lap_history.csv'.format(filename=filename), header=None)
# lap_times = f1.assign_lap(lap_data)
# lap_times['TIME'] = f1.convert_time(lap_times['TIME'])
# lap_times.sort_values(by=['NO', 'LAP'], inplace=True)
# lap_times.shape

In [None]:
# # Load Tire strategy data
# tire_data = pd.read_csv('data/tire_strategy/{filename}.csv'.format(filename=filename))
# tire_strat = f1.get_tires(tire_data)
# tire_strat['NAME'].unique;

In [None]:
# # Join Driver, Name, No. to tire data and sort by No.
# tire_strat = pd.merge(driver_list, tire_strat, on='NAME')
# tire_strat.drop(['NAME', 'DRIVER'], axis=1, inplace=True)
# tire_strat;

In [None]:
# # Append tire data to lap data
# mask = tire_strat.iloc[:,1:].notnull().values
# # lap_times['TIRE'] = 
# tire_strat[tire_strat.columns[1:]].values[mask].flatten().shape
# # lap_times['TRACK'] = 'Australia'
# # lap_times['YEAR'] = 2016
# # lap_times['RACE'] = 1
# # lap_times

In [None]:
# for num in lap_times['NO'].unique():
#     print num
#     print lap_times[lap_times['NO'] == num].shape

In [None]:
# for num in lap_times['NO'].unique():
#     print num
#     print 'Laps: ',tire_strat.ix[tire_strat['NO'] == num].notnull().values.sum() - 1

## Remove Pit and Out laps

In [13]:
def remove_pits(df):
    idx = []
    previous = None
    for row in df.iterrows():
        if row[1]['GAP'] == 'PIT' or previous == 'PIT':
            idx.append(row[0])
        previous = row[1]['GAP']
    return df.drop(idx, axis=0)

In [14]:
no_pits = remove_pits(all_lap_times)
print all_lap_times.shape
print no_pits.shape

(23097, 10)
(21255, 10)


In [15]:
no_pits.drop(no_pits[no_pits['LAP'] == 1].index, axis=0, inplace=True)
no_pits.shape

(20896, 10)

In [16]:
no_safety = no_pits[no_pits['SAFETY'] == 0]
no_safety.shape

(19867, 10)

## Create average lap times by driver, race, tire

In [None]:
group1 = no_safety.drop(['GAP', 'LAP', 'RACE', 'SAFETY'], axis=1).groupby(['NO', 'TIRE', 'TRACK', 'YEAR'], as_index=False)
avg_laps = pd.merge(group1.count(), group1.mean(), how='left', on=['NO', 'TIRE', 'TRACK', 'YEAR'])
avg_laps.head()


In [None]:
group2 = no_safety[['NO', 'TIRE', 'TRACK', 'YEAR', 'TIME']].groupby(['NO', 'TIRE', 'TRACK', 'YEAR'])
std_laps = pd.merge(avg_laps, group2.std().reset_index(), how='left', on=['NO', 'TIRE', 'TRACK', 'YEAR'])
std_laps.columns = ['NO', 'TIRE', 'TRACK', 'YEAR', 'COUNT', 'TIME_AVG', 'TIME_STD']
std_laps.head()


## Add track features to lap features data frame

In [17]:
# Load Track data
tracks = pd.read_csv('data/track_history.csv')
tracks.drop(['LAPS', 'T_TEMP_MIN'], axis=1, inplace=True)
tracks['TRACK'] = tracks['TRACK'].apply(lambda x: x.lower())
tracks.head()

Unnamed: 0,TRACK,YEAR,LENGTH,DOWNFORCE,LATERAL,ASPHALT_ABR,ASPHALT_GRP,TIRE_STRESS,AIR_TEMP_MAX,AIR_TEMP_MIN,T_TEMP_MAX
0,australia,2016,5.3,4,1,3,1,1,24.7,22.2,38.4
1,bahrain,2016,5.41,3,3,5,4,3,22.6,21.2,29.6
2,china,2016,5.45,3,4,3,3,4,21.9,20.3,37.3
3,australia,2015,5.3,4,1,3,1,1,21.0,17.0,38.0
4,malaysia,2015,5.54,3,4,3,3,4,35.4,32.7,61.4


In [None]:
std_laps.info()

In [None]:
tracks.info()

In [None]:
lap_features = pd.merge(std_laps, tracks, how='left', on=['TRACK', 'YEAR'])
lap_features['TIRE_VAL'] = lap_features['TIRE'].apply(assign_ordinal)
lap_features.head()

In [None]:
regression_features = lap_features.drop(['NO', 'TIRE', 'TRACK', 'YEAR', 'TIME_STD'], axis=1)
regression_features.info()

In [None]:
y = regression_features.pop('TIME_AVG')
y;

In [None]:
X = regression_features
X;

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
model = sms.OLS(y_train, X_train).fit()
summary = model.summary()

In [None]:
summary

In [None]:
y_predict_OLS = model.predict(X_test)

In [None]:
# model.score(y_predict, y_test)

## all lap prediction

In [54]:
all_lap_features = pd.merge(no_safety, tracks, how='left', on=['TRACK', 'YEAR'])
all_lap_features.shape

(19867, 19)

In [70]:
all_regression_features = all_lap_features.drop(['NO', 'GAP', 'TRACK', 'YEAR', 'RACE', 'SAFETY'], axis=1)
with_dummies = pd.get_dummies(all_regression_features, columns=['TIRE'])
with_dummies.drop(['TIRE_Intermediate'], axis=1, inplace=True)
with_dummies.head()

Unnamed: 0,TIME,LAP,STINT_LAP,LENGTH,DOWNFORCE,LATERAL,ASPHALT_ABR,ASPHALT_GRP,TIRE_STRESS,AIR_TEMP_MAX,AIR_TEMP_MIN,T_TEMP_MAX,TIRE_Hard,TIRE_Medium,TIRE_Soft,TIRE_Super
0,91.382,2,2,4.38,4,4,3,4,3,24.9,22.0,49.2,0.0,0.0,1.0,0.0
1,90.333,3,3,4.38,4,4,3,4,3,24.9,22.0,49.2,0.0,0.0,1.0,0.0
2,89.962,4,4,4.38,4,4,3,4,3,24.9,22.0,49.2,0.0,0.0,1.0,0.0
3,89.525,5,5,4.38,4,4,3,4,3,24.9,22.0,49.2,0.0,0.0,1.0,0.0
4,89.663,6,6,4.38,4,4,3,4,3,24.9,22.0,49.2,0.0,0.0,1.0,0.0


In [71]:
# sn.pairplot(with_dummies)

In [72]:
y2 = with_dummies.pop('TIME')
y2.head()

0    91.382
1    90.333
2    89.962
3    89.525
4    89.663
Name: TIME, dtype: float64

In [73]:
X2 = with_dummies


In [74]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2)

In [75]:
model2 = sms.OLS(y2_train, sms.add_constant(X2_train)).fit()
summary2 = model2.summary()
model2.rsquared

0.83375477993902658

In [76]:
print summary2

                            OLS Regression Results                            
Dep. Variable:                   TIME   R-squared:                       0.834
Model:                            OLS   Adj. R-squared:                  0.834
Method:                 Least Squares   F-statistic:                     4976.
Date:                Fri, 29 Apr 2016   Prob (F-statistic):               0.00
Time:                        01:45:31   Log-Likelihood:                -45094.
No. Observations:               14900   AIC:                         9.022e+04
Df Residuals:                   14884   BIC:                         9.034e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
const           18.9274      0.738     25.635   

In [79]:
model3 = sms.OLS(y2, sms.add_constant(X2)).fit()
summary3 = model3.summary()
model3.rsquared

0.83463909951588477

In [80]:
print summary3

                            OLS Regression Results                            
Dep. Variable:                   TIME   R-squared:                       0.835
Model:                            OLS   Adj. R-squared:                  0.835
Method:                 Least Squares   F-statistic:                     6680.
Date:                Fri, 29 Apr 2016   Prob (F-statistic):               0.00
Time:                        01:55:30   Log-Likelihood:                -60079.
No. Observations:               19867   AIC:                         1.202e+05
Df Residuals:                   19851   BIC:                         1.203e+05
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
const           18.9320      0.636     29.776   

In [77]:
linear2 = LinearRegression()
linear2.fit(X2_train, y2_train)
linear2.score(X2_train, y2_train)

0.83375477993902658

In [69]:
y2_predict = linear2.predict(X2_test)
linear2.score(X2_test, y2_test)

0.76815973241960467

In [28]:
y2_predict - y2_test

7010      5.996850
115      -1.680710
8885     -1.148660
11153     1.855801
14337    -7.034480
17331     5.583096
9803      1.359103
6024     -3.018967
14361     0.727547
3546      6.803056
2154      2.665684
11415    -0.832232
13919    -0.432684
14761    -0.076369
12105     0.875619
1465     -2.739186
13888    -0.460093
7042      5.415550
14414     2.059480
5021      2.003997
18038     3.671206
5509    -17.081439
358      -2.593503
15248    -3.913450
17360     9.078168
17230     9.005295
6069     -1.151434
2254      3.423573
343      -0.470667
14863    -1.556978
           ...    
5279     11.596025
13781    -2.693382
9722      4.038790
6000     -2.248089
7747      1.168333
6600     -1.603232
14915    -0.626754
19558    -7.596721
4709      4.690387
5370      7.780667
7086      6.993504
481      -2.013645
8092     -6.190788
17610     9.804842
19108    -7.558245
10246    -2.690974
163      -1.180852
4787     -0.396203
644      -1.012873
3817      6.995126
9161     -2.921828
19642    -4.

In [47]:
rfr2 = RandomForestRegressor(n_estimators=20, max_features='sqrt', oob_score=True)

In [48]:
rfr2.fit(X2_train, y2_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [49]:
rfr2.oob_score_

0.96728737326950354

In [50]:
rfr2.predict(X2_test) - y2_test

7010    -0.449700
115      2.025150
8885     2.244600
11153    1.695613
14337   -0.380660
17331   -1.258138
9803    -0.114663
6024     1.589400
14361   -0.166850
3546    -0.228100
2154    -0.843650
11415   -0.684067
13919    1.376017
14761   -0.342892
12105    3.477558
1465    -0.721717
13888    0.992500
7042    -1.166700
14414    0.501600
5021     0.667700
18038    0.542050
5509     0.682200
358     -0.331900
15248   -0.381750
17360   -0.065300
17230    1.016086
6069    -0.030900
2254    -1.421175
343      1.283267
14863    1.726057
           ...   
5279     1.037800
13781    0.215238
9722     4.565050
6000    -0.176800
7747    -2.192000
6600     0.532600
14915    0.397700
19558   -1.755150
4709    -0.818650
5370    -2.725300
7086    -0.365950
481     -1.553549
8092    -1.446850
17610    1.670925
19108   -0.885450
10246    3.618800
163      0.859000
4787    -4.515108
644     -0.390614
3817    -0.197900
9161     1.341000
19642    0.991150
6668     0.020700
10606    1.072183
18601    0

In [51]:
rfr2.score(X2_train, y2_train)

0.99333654541341787

In [52]:
rfr2.score(X2_test, y2_test)

0.97658787659149715

In [53]:
rfr2.feature_importances_

array([ 0.02207809,  0.01222142,  0.33010442,  0.04942634,  0.08501465,
        0.08567886,  0.05268774,  0.05929288,  0.0736983 ,  0.09517426,
        0.11906978,  0.00121302,  0.00377299,  0.00054615,  0.0100211 ])

### Sklearn LinearReg Model, no cross val

In [None]:
linear = LinearRegression()
linear.fit(X_train, y_train)
linear.score(X_train, y_train)

In [None]:
y_predict = linear.predict(X_test)
linear.score(X_test, y_test)

In [None]:
y_predict - y_test

### Sklearn LinearReg Model, with cross val

In [None]:
linear_cv = cross_val_score(linear, X, y, scoring='r2', cv=5)
linear_cv

### RandomForestRegressor

In [None]:
rfr = RandomForestRegressor(n_estimators=15, max_features='auto', oob_score=True)

In [None]:
rfr.fit(X_train, y_train)

In [None]:
rfr.oob_score_

In [None]:
rfr.predict(X_test) - y_test

In [None]:
rfr.score(X_train, y_train)

In [None]:
rfr.score(X_test, y_test)

### AdaBoostRegressor

In [None]:
abr = AdaBoostRegressor(base_estimator=rfr, n_estimators=400, learning_rate=.01, loss='square')

In [None]:
abr.fit(X_train, y_train)

In [None]:
abr.score(X_train, y_train)

In [None]:
abr.score(X_test, y_test)

In [None]:
abr.predict(X_test) - y_test

## Plot All Lap times vs Lap, Labeled by Tire type

In [None]:
def plot_drivers(df, race):
    sn.set_style(style='whitegrid')
    year, race_num, track = race.split('_')
    this_race = df[(df['TRACK'] == track) & (df['YEAR'] == int(year))]
    for num in this_race['NO'].unique():
        driver_idx = this_race['NO'] == num
        plt.figure(figsize=(12,6))
        plt.title('{} {} - Driver No. {}'.format(track.upper(), year, num))
        plt.xlim([0, this_race['LAP'].max() + 1])
        plt.scatter(this_race['LAP'][driver_idx], this_race['TIME'][driver_idx], c=this_race['TIRE'][driver_idx].apply(assign_color), alpha=1)
    plt.show()
    
def plot_race(df, race):
    year, race_num, track = race.split('_')
    this_race = df[(df['TRACK'] == track) & (df['YEAR'] == int(year))]
    plt.figure(figsize=(12,6))
    plt.title('{} {}'.format(track.upper(), year))
    plt.xlim([0, this_race['LAP'].max() + 1])
    plt.scatter(this_race['LAP'], this_race['TIME'], c=this_race['TIRE'].apply(assign_color), alpha=.5)
    plt.show()

In [None]:
plot_drivers(all_lap_times, '2016_1_australia')

In [None]:
plot_drivers(no_safety, '2016_2_bahrain')

In [None]:
for race in ['2016_1_bahrain']:
    plot_race(all_lap_times, race)

In [None]:
for race in races:
    plot_race(no_safety, race)

## TEST CODE BELOW

In [None]:
no_safety[(no_safety['NO'] == 20) & (no_safety['TRACK'] == 'australia') & (no_safety['YEAR'] == 2016)]

In [None]:
all_lap_times.ix[:80]


In [None]:
# df['STINT_LAP'] = 1
# idx = df[df['GAP'] == 'PIT'].index
# start = 0
# for val in idx:
#     df['STINT_LAP'] = df.ix[start:val]['STINT_LAP'].apply(cumsum, axis=0)
#     start = val + 1

    

In [None]:
all_lap_times['STINT_LAP'] = 1
idx = all_lap_times[all_lap_times['GAP'] == 'PIT'].index
start = 0
for val in idx:
    all_lap_times['STINT_LAP'].ix[start:val] = all_lap_times.ix[start:val]['STINT_LAP'].cumsum()
    start = val + 1
end = all_lap_times.index[-1]
all_lap_times['STINT_LAP'].ix[start:end] = all_lap_times.ix[start:end]['STINT_LAP'].cumsum()


In [None]:
all_lap_times



In [None]:
all_lap_times.index[-1]

In [None]:
print all_lap_times.head()
print all_lap_times.ix[0:5]['GAP'].astype(float)
print all_lap_times.ix[0:5]['GAP'].shift(1).fillna(value=0).astype(float)

In [None]:
foo = f1.load_tracks()
bar = tracks[]