# Chapter 2: Basic Analysis
Slicing, Dicing, and Graphing.  Simple exploration of the data.

In [23]:
import pandas as pd
from os import listdir
from os.path import isfile, join

raw_data_path = '../raw_data/'

# Collect every regular file in the raw-data directory.  listdir() returns
# names in arbitrary order, so sort them to make the run reproducible and
# to concatenate the logs in filename (i.e. timestamp) order.
logfiles = sorted(
    f for f in listdir(raw_data_path) if isfile(join(raw_data_path, f))
)
print(logfiles)

# Map the logger's column headers to short snake_case names.
# Two spellings of the AFR header are mapped to the same name.
col_map = {
    'Time (sec)': 'time',
    'A/F Sens 1 Ratio (AFR)': 'afr',
    'AF Sens 1 Ratio (AFR)': 'afr',
    'Boost (PSI)': 'boost',
    'Calculated Load (g/rev)': 'load',
    'Feedback Knock (°)': 'feedback_knock',
    'Fine Knock Learn (°)': 'fine_knock_learn',
    'Gear Position (Gear)': 'gear_position',
    'Ignition Timing (°)': 'timing',
    'MAF (g/s)': 'maf',
    'Primary Ign. (°)': 'primary_timing',
    'RPM (RPM)': 'rpm',
    'TD Burst (%)': 'td_burst',
    'TD Continuous (%)': 'td_continuous',
    'Target Boost (PSI)': 'boost_target',
    'Throttle Pos. (%)': 'throttle',
    'Wastegate Duty (%)': 'wg_duty',
    'Wastegate Max (%)': 'wg_max',
}
dataframes = []

for log in logfiles:
    # read the logfile with the ISO-8859-1 encoding (the column headers
    # contain non-ASCII characters such as '°')
    log_df = pd.read_csv(join(raw_data_path, log), encoding='iso-8859-1')

    # drop the metadata column (the last one) and rename the rest
    log_df = log_df.drop(log_df.columns[-1], axis=1)
    log_df = log_df.rename(columns=col_map)

    # change delta_t to timestamps: the second dot-separated field of the
    # filename ('<name>.<YYYYmmddHHMM>.csv') is taken as this log's start
    # time, and the 'time' column holds seconds elapsed since then.
    # NOTE: use *this* file's start_time; the previous code referenced an
    # undefined `start_times[0]`, which fails on a fresh kernel.
    start_time = pd.to_datetime(log.split('.')[1], format='%Y%m%d%H%M')
    log_df['time'] = start_time + pd.to_timedelta(log_df['time'], unit='s')

    # and set the timestamps as the index (this also removes the column)
    log_df = log_df.set_index('time')

    dataframes.append(log_df)


# Concatenate all the dataframes together,
# easier because they are in a list already

data = pd.concat(dataframes)
data.describe()

['roxy_log.201607210800.csv', 'roxy_log.201607211200.csv', 'roxy_log.201607251200.csv', 'roxy_log.201607251500.csv', 'roxy_log.201607250900.csv', 'roxy_log.201607220800.csv']


Unnamed: 0,afr,boost,load,feedback_knock,fine_knock_learn,gear_position,timing,maf,primary_timing,rpm,td_burst,td_continuous,boost_target,throttle,wg_duty,wg_max
count,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0
mean,15.721728,-7.742465,0.449947,-0.001478,-0.043026,3.243987,25.018784,18.37508,25.298697,2138.596933,-0.002714,4.257299,-4.325508,7.82078,10.800594,13.802727
std,3.36872,4.432594,0.374578,0.055823,0.254251,1.878305,11.863678,25.772601,12.533931,1058.625519,0.177154,1.496412,3.784796,14.451291,14.18946,14.925013
min,11.14,-12.39,0.03,-2.11,-2.46,1.0,-13.5,1.56,5.31,0.0,-6.0,-5.03,-9.75,0.0,0.0,0.0
25%,14.36,-10.07,0.27,0.0,0.0,1.0,11.5,4.35,11.99,929.0,0.0,4.21,-5.41,0.0,0.0,0.0
50%,14.59,-8.91,0.33,0.0,0.0,3.0,25.0,9.93,26.41,2336.0,0.0,5.0,-5.34,4.0,0.0,5.61
75%,14.81,-7.1,0.52,0.0,0.0,5.0,37.0,22.4,38.36,2920.0,0.0,5.0,-5.34,11.0,26.67,31.0
max,25.73,18.76,2.89,0.0,0.0,6.0,43.5,245.7,42.58,6880.0,5.0,5.0,19.84,100.0,45.49,45.66


In [24]:
# high-level summary statistics for the numeric columns of the combined logs
stats_overview = data.describe()
stats_overview

Unnamed: 0,afr,boost,load,feedback_knock,fine_knock_learn,gear_position,timing,maf,primary_timing,rpm,td_burst,td_continuous,boost_target,throttle,wg_duty,wg_max
count,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0,109932.0
mean,15.721728,-7.742465,0.449947,-0.001478,-0.043026,3.243987,25.018784,18.37508,25.298697,2138.596933,-0.002714,4.257299,-4.325508,7.82078,10.800594,13.802727
std,3.36872,4.432594,0.374578,0.055823,0.254251,1.878305,11.863678,25.772601,12.533931,1058.625519,0.177154,1.496412,3.784796,14.451291,14.18946,14.925013
min,11.14,-12.39,0.03,-2.11,-2.46,1.0,-13.5,1.56,5.31,0.0,-6.0,-5.03,-9.75,0.0,0.0,0.0
25%,14.36,-10.07,0.27,0.0,0.0,1.0,11.5,4.35,11.99,929.0,0.0,4.21,-5.41,0.0,0.0,0.0
50%,14.59,-8.91,0.33,0.0,0.0,3.0,25.0,9.93,26.41,2336.0,0.0,5.0,-5.34,4.0,0.0,5.61
75%,14.81,-7.1,0.52,0.0,0.0,5.0,37.0,22.4,38.36,2920.0,0.0,5.0,-5.34,11.0,26.67,31.0
max,25.73,18.76,2.89,0.0,0.0,6.0,43.5,245.7,42.58,6880.0,5.0,5.0,19.84,100.0,45.49,45.66


In [25]:
# pull the boost column out of the frame as its own Series
boost = data.boost
# show the first few samples, followed by the summary statistics
print(boost.head(), boost.describe(), sep='\n')

time
2016-07-21 08:00:00.000   -8.97
2016-07-21 08:00:00.002   -8.97
2016-07-21 08:00:00.003   -9.03
2016-07-21 08:00:00.004   -9.09
2016-07-21 08:00:00.016   -9.05
Name: boost, dtype: float64
count    109932.000000
mean         -7.742465
std           4.432594
min         -12.390000
25%         -10.070000
50%          -8.910000
75%          -7.100000
max          18.760000
Name: boost, dtype: float64


In [26]:
# count the samples where boost is above 10psi
high_boost = boost.loc[boost.gt(10)]
high_boost.count()

1834

In [27]:
# build a boolean mask for boost above 10psi, then count the matching samples
above_10psi = boost.gt(10)
boost[above_10psi].count()


1834

In [28]:
# learned knock events: samples where fine_knock_learn is negative
knock = data.fine_knock_learn
knock_events = data.loc[knock.lt(0)]
knock_events.describe()

Unnamed: 0,afr,boost,load,feedback_knock,fine_knock_learn,gear_position,timing,maf,primary_timing,rpm,td_burst,td_continuous,boost_target,throttle,wg_duty,wg_max
count,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0,3612.0
mean,13.091603,4.839601,1.519142,-0.019277,-1.309507,3.209856,22.202381,90.769986,20.665291,3227.128184,-0.024983,2.152118,6.394416,45.801495,26.427326,28.178336
std,1.820013,7.447717,0.649686,0.200786,0.55591,1.565069,8.281483,64.310141,10.602877,1099.904999,0.29247,2.146365,7.407231,34.765443,11.120105,10.297728
min,11.14,-4.76,0.85,-2.11,-2.46,1.0,-2.5,17.84,5.66,1547.0,-5.03,-5.03,-8.16,0.0,0.0,0.0
25%,11.25,-1.28,0.99,0.0,-1.76,2.0,14.0,39.77,9.88,2347.0,0.0,0.6,0.33,20.0,18.82,21.665
50%,13.44,0.83,1.165,0.0,-1.05,3.0,22.5,53.03,20.08,2719.5,0.0,1.8,3.27,27.0,25.88,26.895
75%,14.36,11.27,1.98,0.0,-1.05,4.0,30.0,136.51,32.03,4194.0,0.0,4.48,15.41,100.0,34.51,35.64
max,25.73,18.76,2.89,0.0,-0.35,6.0,40.0,245.7,42.58,6769.0,1.23,5.0,19.84,100.0,45.49,45.66


### Filter down to just OverBoost Events

In [29]:
# keep only the samples where measured boost exceeds the boost target
above_target = data.boost.gt(data.boost_target)
overboost = data.loc[above_target]
overboost.describe()

Unnamed: 0,afr,boost,load,feedback_knock,fine_knock_learn,gear_position,timing,maf,primary_timing,rpm,td_burst,td_continuous,boost_target,throttle,wg_duty,wg_max
count,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0,3544.0
mean,13.321318,1.159797,1.174746,-0.007144,-0.26296,2.92579,22.319695,57.736024,24.247046,2641.771445,-0.198183,-0.992861,0.015418,24.251129,14.214328,20.172438
std,1.744837,7.353299,0.638637,0.122589,0.593829,1.854995,9.908775,50.824361,11.407939,1156.803422,0.811334,1.641017,7.435647,26.352034,15.676016,16.932274
min,11.14,-8.97,0.14,-2.11,-2.46,1.0,-13.5,1.65,5.66,0.0,-6.0,-5.03,-9.75,0.0,0.0,0.0
25%,11.25,-4.08,0.74,0.0,0.0,1.0,14.5,21.0475,11.99,1619.75,0.0,-1.63,-5.32,11.0,0.0,0.0
50%,13.78,-2.11,0.88,0.0,0.0,3.0,22.0,32.57,26.76,2759.5,0.0,-0.48,-3.33,14.0,5.88,26.56
75%,14.47,5.47,1.5,0.0,0.0,4.0,30.5,81.4325,33.79,3606.0,0.0,0.0,4.16,26.0,29.02,35.25
max,24.81,18.76,2.89,0.0,0.0,6.0,42.0,238.25,42.58,6769.0,1.55,5.0,17.98,100.0,45.49,45.66


In [30]:
# get just the boost value, where boost > target -- shown two ways
over_target = data.boost > data.boost_target

# Chained indexing: filter the whole frame first, then pick the column.
chained_stats = data[over_target]['boost'].describe()

# Best practice is to use the loc method: rows and column in a single call.
loc_stats = data.loc[over_target, 'boost'].describe()

# Both forms must produce the same statistics; keep the chained result
# instead of discarding it so the equivalence is actually checked.
assert chained_stats.equals(loc_stats)
loc_stats

count    3544.000000
mean        1.159797
std         7.353299
min        -8.970000
25%        -4.080000
50%        -2.110000
75%         5.470000
max        18.760000
Name: boost, dtype: float64

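NOTE: it looks like most of our overboost samples occur where boost is not positive: of the 3544 samples with boost above target, only 1294 have boost > 0, so roughly 63% are at or below 0 psi.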

Probably should adjust the map so that we target more reasonable boost levels for these areas.

Consider setting the minimum boost targets to 0 psi, and adjusting the max WGDC (wastegate duty cycle) so that it is the MINIMUM WGDC needed to hit peak boost.
Or, perhaps, because the WG will only be able to open above ~7 psi ([the WG rod begins to move aggressively at ~7 psi](https://cobbtuning.zendesk.com/hc/en-us/articles/200025164-How-Subaru-s-Factory-Boost-Control-System-Works)), set the minimum targets to that value instead; WGDC will be irrelevant below the 7 psi threshold.

In [31]:
# filter rows on several criteria at once:
# build one boolean mask per condition and combine them with & (and) or | (or)
positive_boost = data.boost > 0
above_target = data.boost > data.boost_target
data[positive_boost & above_target].describe()

Unnamed: 0,afr,boost,load,feedback_knock,fine_knock_learn,gear_position,timing,maf,primary_timing,rpm,td_burst,td_continuous,boost_target,throttle,wg_duty,wg_max
count,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0,1294.0
mean,11.981283,9.349699,1.872048,-0.013045,-0.551662,4.372488,16.365533,112.257859,12.199057,3546.671561,-0.12755,-1.101298,8.083006,45.703246,28.367156,36.120139
std,1.211161,6.108758,0.559538,0.165456,0.741325,1.30691,4.862701,46.270884,6.104407,659.135396,0.582461,1.657802,6.59298,31.503913,12.32641,5.990177
min,11.14,0.02,0.48,-2.11,-2.46,1.0,4.5,19.12,5.66,795.0,-5.03,-5.03,-8.16,0.0,0.0,1.34
25%,11.25,4.27,1.4225,0.0,-1.05,4.0,12.0,75.83,7.77,3120.5,0.0,-1.63,2.59,24.0,19.3175,32.165
50%,11.25,8.88,1.78,0.0,0.0,4.0,15.5,106.6,10.59,3571.0,0.0,-0.38,7.52,31.0,29.41,35.43
75%,12.4,16.695,2.5,0.0,0.0,6.0,19.0,146.285,14.45,3983.0,0.0,0.0,13.56,60.0,36.47,37.49
max,21.36,18.76,2.89,0.0,0.0,6.0,40.0,238.25,42.58,5756.0,0.97,5.0,17.98,100.0,45.49,45.66
