In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import os
import re

from functools import reduce

src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
sys.path.append(src_dir)

# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [2]:
routine_clean = pd.read_csv('../../data/02_intermediate/routine_clean.csv')
astm = pd.read_csv('../../data/01_raw/ASTM_fuel.csv')
astm.columns = ['Date', 'TN_retailers_seasons', 'TN_distributor_seasons',
       'vapor_liquid_minC_retail', 'distillation_50_minC _retail',
       'distillation_50_maxC_retail', 'vapor_pressure_maxC_retail',
       'vapor_liquid_minC_dist', 'distillation_50_minC_dist',
       'distillation_50_maxC_dist', 'vapor_pressure_maxC_dist']

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
gasoline = routine_clean.loc[routine_clean['Prod']=='Gasoline']

In [4]:
gasoline_compliance = gasoline.loc[(gasoline['Compliance']=='Y')
                                   |(gasoline['Compliance']=='N')]

In [5]:
print('Number of unique Samples in the gasoline_compliance Dataset: ', len(gasoline_compliance.Sample.unique()))

Number of unique Samples in the gasoline_compliance Dataset:  20670


**Gasoline Tests w. Y/N Compliance** (only focusing on the bold tests moving forward) 
1.	Motor Octane Number
1.	Distillation Residue
1.	Distillation 90%
1.	Distillation E.P.
1.	Driveability Index
1.	Workmanship
1.	Distillation 10%
1.	**<font color='red'>Distillation 50%</font>**
1.	**<font color='red'>Vapor Pressure</font>**
1.	**<font color='red'>Vapor-Liquid Ratio</font>**
1.	Antiknock Index
1.	Phase Separation
1.	Ethanol
1.	Total Oxygen
1.	Distillation 60%
1.	Distillation 5%
1.	Distillation Loss
1.	Research Octane Number
1.	Distillation 80%
1.	Relative Density
1.	Distillation 95%
1.	Distillation 20%
1.	Methanol
1.	Distillation 40%
1.	Distillation Recovery
1.	API Gravity

In [6]:
gasoline_compliance['DateSampled'] = pd.to_datetime(gasoline_compliance['DateSampled'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [7]:
gasoline_compliance.Test.unique()

array(['Antiknock Index', 'Distillation 10%', 'Distillation 50%',
       'Distillation 90%', 'Distillation E.P.', 'Distillation Residue',
       'Driveability Index', 'Motor Octane Number', 'Phase Separation',
       'Vapor Pressure', 'Vapor-Liquid Ratio', 'Workmanship', 'Ethanol',
       'Distillation Loss', 'Total Oxygen', 'Relative Density',
       'Distillation 5%', 'Research Octane Number', 'Distillation 60%',
       'Distillation 20%', 'Distillation 95%', 'Distillation 80%',
       'Distillation 40%', 'API Gravity', 'Methanol', nan,
       'Distillation IBP', 'Distillation Recovery'], dtype=object)

Let's reduce the number of tests to the three that we are interested in testing (Distillation 50%, Vapor Pressure, and Vapor-Liquid Ratio)

In [8]:
gasoline_compliance = gasoline_compliance.loc[(gasoline_compliance['Test']=='Distillation 50%') 
                        | (gasoline_compliance['Test']=='Vapor Pressure') 
                        | (gasoline_compliance['Test']=='Vapor-Liquid Ratio')]

In [9]:
gasoline_compliance.head(3)

Unnamed: 0,Sample,Prod,DateSampled,Grade,Supplier,FacilityName,SiteAddress,Test,Units,Method,Result,MinResult,MaxResult,Compliance
8,61916134,Gasoline,2015-11-23,Mid Grade Unleaded,Marathon Petroleum Lp,Circle K #2723609,"198 Haywood Ln \r\nnashville, Tn 37211",Distillation 50%,Deg. C,D86,73.3,,,Y
36,61916134,Gasoline,2015-11-23,Mid Grade Unleaded,Marathon Petroleum Lp,Circle K #2723609,"198 Haywood Ln \r\nnashville, Tn 37211",Vapor Pressure,kPa,D5191,88.9,,,Y
37,61916134,Gasoline,2015-11-23,Mid Grade Unleaded,Marathon Petroleum Lp,Circle K #2723609,"198 Haywood Ln \r\nnashville, Tn 37211",Vapor-Liquid Ratio,Deg. C,D5188,44.6,,,Y


Number of unique samples in the dataset

In [10]:
len(gasoline_compliance.Sample.unique())

20658

### There are 144 duplicated rows in this dataset. Let's keep the first occurance. 

In [11]:
gasoline_compliance.duplicated(subset=None, keep='first').sum()

144

In [12]:
gasoline_compliance.drop_duplicates(inplace=True)

In [13]:
gasoline_compliance.reset_index(drop=True, inplace=True)

### Let's unstack this dataset

At the moment, every sample takes up three rows (each test has it's own dataset). Let's make sure that each sample has it's own row. 

In [14]:
# create multilevel index
gasoline_compliance.set_index(['Sample', 'Test'], inplace=True)

In [15]:
# unstack on the inner undex (test)
gasoline_compliance = gasoline_compliance.unstack(level=1)

now let's save each sub-dataframe into it's own dataframe so that we can re-name the columns

In [16]:
prod = gasoline_compliance['Prod']

In [17]:
prod.drop(['Distillation 50%', 'Vapor Pressure'], inplace=True, axis=1)
prod.reset_index(inplace=True)
prod.rename(columns={'Vapor-Liquid Ratio':'prod'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [18]:
datesampled = gasoline_compliance['DateSampled']
datesampled.drop(['Distillation 50%', 'Vapor Pressure'], inplace=True, axis=1)
datesampled.reset_index(inplace=True)
datesampled.rename(columns={'Vapor-Liquid Ratio':'datesampled'}, inplace=True)

In [19]:
grade = gasoline_compliance['Grade']

In [20]:
grade.drop(['Distillation 50%', 'Vapor Pressure'], inplace=True, axis=1)
grade.reset_index(inplace=True)
grade.rename(columns={'Vapor-Liquid Ratio':'grade'}, inplace=True)

In [21]:
supplier = gasoline_compliance['Supplier']

In [22]:
supplier.drop(['Distillation 50%', 'Vapor Pressure'], inplace=True, axis=1)
supplier.reset_index(inplace=True)
supplier.rename(columns={'Vapor-Liquid Ratio':'supplier'}, inplace=True)

In [23]:
facilityname = gasoline_compliance['FacilityName']

In [24]:
facilityname.drop(['Distillation 50%', 'Vapor Pressure'], inplace=True, axis=1)
facilityname.reset_index(inplace=True)
facilityname.rename(columns={'Vapor-Liquid Ratio':'facilityname'}, inplace=True)

In [25]:
siteaddress = gasoline_compliance['SiteAddress']

In [26]:
siteaddress.drop(['Distillation 50%', 'Vapor Pressure'], inplace=True, axis=1)
siteaddress.reset_index(inplace=True)
siteaddress.rename(columns={'Vapor-Liquid Ratio':'siteaddress'}, inplace=True)

In [27]:
units = gasoline_compliance['Units']
units.reset_index(inplace=True)
units.rename(
    columns={'Distillation 50%':'units_dist_50', 
             'Vapor Pressure':'units_vap_pressure', 
             'Vapor-Liquid Ratio':'units_vap_liq_pressure'}, inplace=True)


In [28]:
method = gasoline_compliance['Method']

In [29]:
method.reset_index(inplace=True)
method.rename(
    columns={'Distillation 50%':'method_dist_50', 
             'Vapor Pressure':'method_vap_pressure', 
             'Vapor-Liquid Ratio':'method_vap_liq_pressure'}, inplace=True)

In [30]:
result = gasoline_compliance['Result']

In [31]:
result.reset_index(inplace=True)
result.rename(
    columns={'Distillation 50%':'result_dist_50', 
             'Vapor Pressure':'result_vap_pressure', 
             'Vapor-Liquid Ratio':'result_vap_liq_pressure'}, inplace=True)

In [32]:
minresults = gasoline_compliance['MinResult']

In [33]:
minresults.reset_index(inplace=True)
minresults.rename(
    columns={'Distillation 50%':'minresults_dist_50', 
             'Vapor Pressure':'minresults_vap_pressure', 
             'Vapor-Liquid Ratio':'minresults_vap_liq_pressure'}, inplace=True)

In [34]:
maxresults = gasoline_compliance['MaxResult']

In [35]:
maxresults.reset_index(inplace=True)
maxresults.rename(
    columns={'Distillation 50%':'maxresults_dist_50', 
             'Vapor Pressure':'maxresults_vap_pressure', 
             'Vapor-Liquid Ratio':'maxresults_vap_liq_pressure'}, inplace=True)

In [36]:
compliance = gasoline_compliance['Compliance']

In [37]:
compliance.reset_index(inplace=True)
compliance.rename(
    columns={'Distillation 50%':'compliance_dist_50', 
             'Vapor Pressure':'compliance_vap_pressure', 
             'Vapor-Liquid Ratio':'compliance_vap_liq_pressure'}, inplace=True)

In [38]:
df = [prod, datesampled, grade, supplier, facilityname, 
               siteaddress, units, method, result, minresults, maxresults, compliance]

In [39]:
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['Sample'],
                                            how='outer'), df)

### Let's try to make a dummy variable for location

In [40]:
print('Number of unique addresses in the dataset: ', len(df_merged['siteaddress'].unique()))

Number of unique addresses in the dataset:  8051


In [41]:
print('Percent of samples with missing address: ', df_merged.siteaddress.isna().sum()/len(df_merged)*100)

Percent of samples with missing address:  4.569658243779649


#### let's make a zipcode column and then geocode the siteaddress column

In [42]:
df_merged['str_split'] = df_merged.siteaddress.str.split('Tn')

In [43]:
df_merged['zipcode'] = df_merged.str_split.str.get(1)

In [44]:
print('Percentage of zipcode column that is empty: ', df_merged.zipcode.isna().sum()/len(df_merged)*100)

Percentage of zipcode column that is empty:  64.2317746151612


In [45]:
df_merged.drop(columns=['str_split'], inplace=True)
df_merged['city'] = ', Tn'
df_merged['siteaddress_city'] = df_merged['siteaddress'] + df_merged['city']
df_merged.drop(columns=['city'], inplace=True)

### Let's check out the target variable

In [46]:
print('Compliance Outcomes dist 50: ', '\n', df_merged.compliance_dist_50.value_counts())

Compliance Outcomes dist 50:  
 Y    20452
N        3
Name: compliance_dist_50, dtype: int64


In [47]:
print('dist 50 nan count: ', df_merged.compliance_dist_50.isna().sum())

dist 50 nan count:  203


In [48]:
print('Compliance outcome vap liq press: ', '\n', df_merged.compliance_vap_liq_pressure.value_counts())

Compliance outcome vap liq press:  
 Y    19789
N       74
Name: compliance_vap_liq_pressure, dtype: int64


In [49]:
print('Vap liq pressure nan count: ', df_merged.compliance_vap_liq_pressure.isna().sum())

Vap liq pressure nan count:  795


In [50]:
print('Compliance outcome vap press: ','\n', df_merged.compliance_vap_pressure.value_counts())

Compliance outcome vap press:  
 Y    20169
N      106
Name: compliance_vap_pressure, dtype: int64


In [51]:
print('Vap pressure nan count: ',df_merged.compliance_vap_pressure.isna().sum())

Vap pressure nan count:  383


From our expert interview, we have determined that the nan results in the compliance rows are test results that were inconclusive. Below I have two blocks of code. 

Block 1: this converts the nan values to None (later if we want to better understand this catagory then we can use block 1 to encude the variable). 

Block 2: this drops the nan values for the target variable. This allows for analysis in the model building phase. 

Block 1: 
```python
df_merged['compliance_dist_50'] = df_merged['compliance_dist_50'].replace(np.nan, 'None')
df_merged['compliance_vap_liq_pressure'] = df_merged['compliance_vap_liq_pressure'].replace(np.nan, 'None')
df_merged['compliance_vap_pressure'] = df_merged['compliance_vap_pressure'].replace(np.nan, 'None')
```

In [52]:
# Block 2: 
# df_merged.dropna(subset=['compliance_dist_50'], inplace=True)
# df_merged.dropna(subset=['compliance_vap_pressure'], inplace=True)
# df_merged.dropna(subset=['compliance_vap_liq_pressure'], inplace=True)

How many grades of gasoline does each supplier have

In [53]:
df_merged.groupby(['supplier', 'grade']).count()

Unnamed: 0_level_0,Test,Sample,prod,datesampled,facilityname,siteaddress,units_dist_50,units_vap_pressure,units_vap_liq_pressure,method_dist_50,method_vap_pressure,...,minresults_vap_pressure,minresults_vap_liq_pressure,maxresults_dist_50,maxresults_vap_pressure,maxresults_vap_liq_pressure,compliance_dist_50,compliance_vap_pressure,compliance_vap_liq_pressure,zipcode,siteaddress_city
supplier,grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,Premium Unleaded,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,0,1
GPM Southeast,Regular Unleaded,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,0,1,1,1,0,1
Murphy Oil Est # 36783,Premium Unleaded,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,0,1
9NFRY-AKVRCL,Regular Unleaded,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,0,1
A & M Oil Company,Mid Grade Unleaded,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
A & M Oil Company,Regular Unleaded,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
A. L. Johnson,Mid Grade Unleaded,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
A. L. Johnson,Premium Unleaded,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
A. L. Johnson,Regular Unleaded,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
A9GLNQ,Regular Unleaded,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,0,1,1,1,0,1


In [54]:
supplier_grade = df_merged.groupby('supplier').grade.nunique().to_frame()
supplier_grade.reset_index(inplace=True)
print('Half of Suppliers only supply one grade of Gasoline: ', len(supplier_grade.loc[supplier_grade['grade']>1])/len(supplier_grade))

Half of Suppliers only supply one grade of Gasoline:  0.5206243032329989


In [55]:
len(supplier_grade.loc[supplier_grade['grade']>1])/len(supplier_grade)

0.5206243032329989

In [56]:
df_merged

Test,Sample,prod,datesampled,grade,supplier,facilityname,siteaddress,units_dist_50,units_vap_pressure,units_vap_liq_pressure,...,minresults_vap_pressure,minresults_vap_liq_pressure,maxresults_dist_50,maxresults_vap_pressure,maxresults_vap_liq_pressure,compliance_dist_50,compliance_vap_pressure,compliance_vap_liq_pressure,zipcode,siteaddress_city
0,61916134,Gasoline,2015-11-23,Mid Grade Unleaded,Marathon Petroleum Lp,Circle K #2723609,"198 Haywood Ln \r\nnashville, Tn 37211",Deg. C,kPa,Deg. C,...,,,,,,Y,Y,Y,37211,"198 Haywood Ln \r\nnashville, Tn 37211, Tn"
1,61916135,Gasoline,2015-11-24,Regular Unleaded,,Tiger Market #214,"2001 8th Ave S \r\nnashville, Tn 37204",Deg. C,kPa,Deg. C,...,,,,,,Y,Y,Y,37204,"2001 8th Ave S \r\nnashville, Tn 37204, Tn"
2,61916136,Gasoline,2015-11-24,Mid Grade Unleaded,Tri-star Energy,Twice Daily #8085,"648 Thompson Ln \r\nnashville, Tn 37204",Deg. C,kPa,Deg. C,...,,,,,,Y,Y,Y,37204,"648 Thompson Ln \r\nnashville, Tn 37204, Tn"
3,61916138,Gasoline,2015-11-24,Regular Unleaded,"Mapco Express, Inc.",Mapco Express #3195,"4677 Trousdale Dr Nashville, Tn 37204",Deg. C,kPa,Deg. C,...,,,,,,Y,Y,Y,37204,"4677 Trousdale Dr Nashville, Tn 37204, Tn"
4,61916139,Gasoline,2015-12-02,Premium Unleaded,Tri-star Energy,Top It Off Holding Inc,"13016 Old Hickory Blvd \r\nantioch, Tn 37013",Deg. C,kPa,Deg. C,...,,,,,,Y,Y,Y,37013,"13016 Old Hickory Blvd \r\nantioch, Tn 37013, Tn"
5,61916140,Gasoline,2015-12-02,Regular Unleaded,"Mapco Express, Inc.",Mapco Express #1030,"2616 Franklin Road Nashville, Tn 37204",Deg. C,kPa,Deg. C,...,,,,,,Y,Y,Y,37204,"2616 Franklin Road Nashville, Tn 37204, Tn"
6,61916142,Gasoline,2015-12-02,Mid Grade Unleaded,"Mapco Express, Inc.",Mapco Mart #3410,"4314 Harding Rd \r\nnashville, Tn 37205",Deg. C,kPa,Deg. C,...,,,,,,Y,Y,Y,37205,"4314 Harding Rd \r\nnashville, Tn 37205, Tn"
7,61916148,Gasoline,2015-12-07,Mid Grade Unleaded,Exxonmobil,Kroger #550,"8175 Highway 100 \r\nnashville, Tn 37221",Deg. C,kPa,Deg. C,...,,,,,,Y,Y,Y,37221,"8175 Highway 100 \r\nnashville, Tn 37221, Tn"
8,61916149,Gasoline,2015-12-07,Premium Unleaded,"Mapco Express, Inc.",Mapco Express #3414,"7670 Hwy. 70s Nashville, Tn 37221",Deg. C,kPa,Deg. C,...,,,,,,Y,Y,Y,37221,"7670 Hwy. 70s Nashville, Tn 37221, Tn"
9,61916150,Gasoline,2015-12-07,Regular Unleaded,Tri-star Energy,Dailys #6645,"7691 Highway 70 South Nashville, Tn 37221",Deg. C,kPa,Deg. C,...,,,,,,Y,Y,Y,37221,"7691 Highway 70 South Nashville, Tn 37221, Tn"


In [57]:
df_merged.to_csv('../../data/03_processed/gasoline_processed.csv', index=False)