In [26]:
import pandas as pd
import numpy as np
import datetime
import os
from scripts.f1_scripts import (assign_lap, 
                        get_tires, 
                        get_sector_times,
                        convert_time
                               )

import statsmodels.api as sms
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

In [13]:
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

def assign_color(tire):
    colors = {'Ultra': 'm', 'Super': 'r', 'Soft': 'y', 'Medium': 'w', 'Hard': 'k', 'Intermediate': 'g'}
    return colors[tire]

def assign_ordinal(tire):
    vals = {'Ultra': 1, 'Super': 2, 'Soft': 3, 'Medium': 4, 'Hard': 5, 'Intermediate': 5}
    return vals[tire]



In [2]:
# Load driver list as GLOBAL variable
DRIVER_LIST = pd.read_csv('data/drivers.csv')

In [3]:
def create_race_features(filename):
    year, race_num, track = filename.split('_')
    # Load lap times for all drivers
    lap_data = pd.read_csv('data/lap_history/{filename}_lap_history.csv'.format(filename=filename), header=None)
    lap_times = assign_lap(lap_data)
    lap_times['TIME'] = convert_time(lap_times['TIME'])
    lap_times.sort_values(by=['NO', 'LAP'], inplace=True)

    # Load Tire strategy data
    tire_data = pd.read_csv('data/tire_strategy/{filename}.csv'.format(filename=filename))
    tire_strat = get_tires(tire_data)

    # Join Driver, Name, No. to tire data and sort by No.
    tire_strat = pd.merge(DRIVER_LIST, tire_strat, on='NAME')
    tire_strat.drop(['NAME', 'DRIVER'], axis=1, inplace=True)

    # Append tire data to lap data
    mask = tire_strat.iloc[:,1:].notnull().values
    lap_times['TIRE'] = tire_strat[tire_strat.columns[1:]].values[mask].flatten()
    lap_times['TRACK'] = track
    lap_times['YEAR'] = year
    lap_times['RACE'] = race_num

    return lap_times


In [4]:
# Create a list of the available races to date that we can use for training
races = os.listdir('data/fia')
races = races[1:]
# Remove bahrain 2015 until data is fixed
races.pop(13)
races;

In [5]:
# Create DataFrame of all laps for all drivers in all races
list_of_times = []
for race in races:
    lap_times = create_race_features(race)
    list_of_times.append(lap_times)
    print '{} complete.'.format(race)
all_lap_times = pd.concat(list_of_times)
all_lap_times['YEAR'] = all_lap_times['YEAR'].astype(int)



2015_10_hungary complete.
2015_11_belgium complete.
2015_12_italy complete.
2015_13_singapore complete.
2015_14_japan complete.
2015_15_russia complete.
2015_16_usa complete.
2015_17_mexico complete.
2015_18_brazil complete.
2015_19_abudhabi complete.
2015_1_australia complete.
2015_2_malaysia complete.
2015_3_china complete.
2015_5_spain complete.
2015_6_monaco complete.
2015_7_canada complete.
2015_8_austria complete.
2015_9_britain complete.
2016_1_australia complete.
2016_2_bahrain complete.
2016_3_china complete.


### Commented out cells used to debug loading the data

In [None]:
# # Load driver list for particular season
# driver_list = pd.read_csv('data/drivers.csv')

# # Load lap times for all drivers
# filename = '2016_3_china'
# lap_data = pd.read_csv('data/lap_history/{filename}_lap_history.csv'.format(filename=filename), header=None)
# lap_times = assign_lap(lap_data)
# lap_times['TIME'] = convert_time(lap_times['TIME'])
# lap_times.sort_values(by=['NO', 'LAP'], inplace=True)
# lap_times.shape

In [None]:
# # Load Tire strategy data
# tire_data = pd.read_csv('data/tire_strategy/{filename}.csv'.format(filename=filename))
# tire_strat = get_tires(tire_data)
# tire_strat['NAME'].unique;

In [None]:
# # Join Driver, Name, No. to tire data and sort by No.
# tire_strat = pd.merge(driver_list, tire_strat, on='NAME')
# tire_strat.drop(['NAME', 'DRIVER'], axis=1, inplace=True)
# tire_strat;

In [None]:
# # Append tire data to lap data
# mask = tire_strat.iloc[:,1:].notnull().values
# # lap_times['TIRE'] = 
# tire_strat[tire_strat.columns[1:]].values[mask].flatten().shape
# # lap_times['TRACK'] = 'Australia'
# # lap_times['YEAR'] = 2016
# # lap_times['RACE'] = 1
# # lap_times

In [None]:
# for num in lap_times['NO'].unique():
#     print num
#     print lap_times[lap_times['NO'] == num].shape

In [None]:
# for num in lap_times['NO'].unique():
#     print num
#     print 'Laps: ',tire_strat.ix[tire_strat['NO'] == num].notnull().values.sum() - 1

### Add track features to lap features data frame

In [6]:
# Load Track data
tracks = pd.read_csv('data/track_profiles.csv')
tracks.drop('LAPS', axis=1, inplace=True)
tracks['TRACK'] = tracks['TRACK'].apply(lambda x: x.lower())
tracks.head()

Unnamed: 0,TRACK,YEAR,LENGTH,DOWNFORCE,LATERAL,ASPHALT_ABR,ASPHALT_GRP,TIRE_STRESS,AIR_TEMP_MAX,AIR_TEMP_MIN,T_TEMP_MAX,T_TEMP_MIN
0,australia,2016,5.3,4,1,3,1,1,24.7,22.2,38.4,26.8
1,bahrain,2016,5.41,3,3,5,4,3,22.6,21.2,29.6,25.9
2,china,2016,5.45,3,4,3,3,4,21.9,20.3,37.3,45.3
3,australia,2015,5.3,4,1,3,1,1,21.0,17.0,38.0,29.0
4,malaysia,2015,5.54,3,4,3,3,4,35.4,32.7,61.4,53.0


In [7]:
all_lap_times.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22078 entries, 6 to 1206
Data columns (total 8 columns):
NO       22078 non-null object
GAP      22078 non-null object
TIME     22078 non-null float64
LAP      22078 non-null object
TIRE     22078 non-null object
TRACK    22078 non-null object
YEAR     22078 non-null int64
RACE     22078 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 1.5+ MB


In [8]:
tracks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22 entries, 0 to 21
Data columns (total 12 columns):
TRACK           22 non-null object
YEAR            22 non-null int64
LENGTH          22 non-null float64
DOWNFORCE       22 non-null int64
LATERAL         22 non-null int64
ASPHALT_ABR     22 non-null int64
ASPHALT_GRP     22 non-null int64
TIRE_STRESS     22 non-null int64
AIR_TEMP_MAX    22 non-null float64
AIR_TEMP_MIN    22 non-null float64
T_TEMP_MAX      22 non-null float64
T_TEMP_MIN      22 non-null float64
dtypes: float64(5), int64(6), object(1)
memory usage: 2.2+ KB


In [18]:
grouped = all_lap_times.drop(['GAP', 'LAP', 'RACE'], axis=1).groupby(['NO', 'TIRE', 'TRACK', 'YEAR'], as_index=False)
avg_laps = pd.merge(grouped.count(), grouped.mean(), how='left', on=['NO', 'TIRE', 'TRACK', 'YEAR'])
avg_laps.columns = ['NO', 'TIRE', 'TRACK', 'YEAR', 'COUNT', 'TIME_AVG']
avg_laps.head()

Unnamed: 0,NO,TIRE,TRACK,YEAR,COUNT,TIME_AVG
0,3,Hard,britain,2015,10,108.7041
1,3,Hard,japan,2015,51,101.457765
2,3,Hard,malaysia,2015,37,108.02027
3,3,Hard,spain,2015,23,92.960391
4,3,Intermediate,usa,2015,19,119.811789


In [45]:
lap_features = pd.merge(avg_laps, tracks, how='left', on=['TRACK', 'YEAR'])
lap_features['TIRE_VAL'] = lap_features['TIRE'].apply(assign_ordinal)
lap_features.head()

Unnamed: 0,NO,TIRE,TRACK,YEAR,COUNT,TIME_AVG,LENGTH,DOWNFORCE,LATERAL,ASPHALT_ABR,ASPHALT_GRP,TIRE_STRESS,AIR_TEMP_MAX,AIR_TEMP_MIN,T_TEMP_MAX,T_TEMP_MIN,TIRE_VAL
0,3,Hard,britain,2015,10,108.7041,5.89,4,5,3,4,5,20.6,17.3,38.9,23.9,5
1,3,Hard,japan,2015,51,101.457765,5.8,3,5,3,4,5,28.6,27.2,42.4,35.3,5
2,3,Hard,malaysia,2015,37,108.02027,5.54,3,4,3,3,4,35.4,32.7,61.4,53.0,5
3,3,Hard,spain,2015,23,92.960391,4.65,4,4,4,4,4,28.3,25.3,51.9,45.6,5
4,3,Intermediate,usa,2015,19,119.811789,5.51,3,4,3,2,3,17.8,17.2,19.4,17.8,5


In [46]:
regression_features = lap_features.drop(['NO', 'TIRE', 'TRACK', 'YEAR', 'T_TEMP_MIN'], axis=1)
regression_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 830 entries, 0 to 829
Data columns (total 12 columns):
COUNT           830 non-null int64
TIME_AVG        830 non-null float64
LENGTH          830 non-null float64
DOWNFORCE       830 non-null int64
LATERAL         830 non-null int64
ASPHALT_ABR     830 non-null int64
ASPHALT_GRP     830 non-null int64
TIRE_STRESS     830 non-null int64
AIR_TEMP_MAX    830 non-null float64
AIR_TEMP_MIN    830 non-null float64
T_TEMP_MAX      830 non-null float64
TIRE_VAL        830 non-null int64
dtypes: float64(5), int64(7)
memory usage: 84.3 KB


In [47]:
y = regression_features.pop('TIME_AVG')
y;

In [48]:
X = regression_features
X;

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [50]:
model = sms.OLS(y_train, X_train).fit()
summary = model.summary()

In [51]:
summary

0,1,2,3
Dep. Variable:,TIME_AVG,R-squared:,0.979
Model:,OLS,Adj. R-squared:,0.978
Method:,Least Squares,F-statistic:,2565.0
Date:,"Mon, 25 Apr 2016",Prob (F-statistic):,0.0
Time:,09:49:54,Log-Likelihood:,-2582.3
No. Observations:,622,AIC:,5187.0
Df Residuals:,611,BIC:,5235.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
COUNT,-0.3510,0.046,-7.663,0.000,-0.441 -0.261
LENGTH,17.8175,0.741,24.050,0.000,16.363 19.272
DOWNFORCE,4.8299,0.768,6.290,0.000,3.322 6.338
LATERAL,5.4716,1.347,4.062,0.000,2.826 8.117
ASPHALT_ABR,3.9741,1.032,3.851,0.000,1.948 6.001
ASPHALT_GRP,-6.3488,1.297,-4.894,0.000,-8.896 -3.801
TIRE_STRESS,-7.4980,0.968,-7.745,0.000,-9.399 -5.597
AIR_TEMP_MAX,-6.0211,1.154,-5.217,0.000,-8.288 -3.755
AIR_TEMP_MIN,6.0040,1.062,5.652,0.000,3.918 8.090

0,1,2,3
Omnibus:,680.54,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51534.273
Skew:,5.037,Prob(JB):,0.0
Kurtosis:,46.439,Cond. No.,208.0


In [53]:
y_predict = model.predict(X_test)

In [55]:
# model.score(y_predict, y_test)

## Plot Tire Stints

In [None]:
avg_laps['TIRE_VAL'] = avg_laps['TIRE'].apply(assign_ordinal)
avg_laps.head()

In [None]:
sn.pairplot(avg_laps[['COUNT', 'TIME', 'TIRE_VAL']])

In [None]:
plt.scatter(avg_laps['COUNT'], avg_laps['TIME'], c=avg_laps['TIRE'].apply(assign_color), alpha=.5)
plt.xlim([0, 50])
plt.ylim([50,200])

## TEST CODE BELOW

In [None]:
pwd