# Drawing Conclusions
Use the space below to address questions on datasets `clean_08.csv` and `clean_18.csv`. You should've created these data files in the previous section: *Fixing Data Types Pt 3*.

In [190]:
# Make every top-level expression in a cell display its value (not only the
# last one); later cells rely on this to show e.g. both .shape and .head().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

%config IPCompleter.greedy = True

In [191]:
import pandas as pd
import numpy as np
%matplotlib inline

In [192]:
# load 2008 dataset
# The CSV must be read before .shape is inspected; with the original order this
# cell raised NameError on a fresh kernel because df_08 did not exist yet.
df_08 = pd.read_csv('clean_08.csv')
df_08.shape
df_08.head(n = 2)

(987, 13)

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
0,ACURA MDX,3.7,6.0,Auto-S5,4WD,Gasoline,SUV,7.0,15.0,20.0,17.0,4.0,no
1,ACURA RDX,2.3,4.0,Auto-S5,4WD,Gasoline,SUV,7.0,17.0,22.0,19.0,5.0,no


In [193]:
# 2018 dataset: read the cleaned CSV, then show its dimensions and first two rows
df_18 = pd.read_csv('clean_18.csv')
df_18.shape
df_18.head(2)

(832, 13)

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
0,ACURA RDX,3.5,6.0,SemiAuto-6,2WD,Gasoline,small SUV,3.0,20.0,28.0,23.0,5.0,No
1,ACURA RDX,3.5,6.0,SemiAuto-6,4WD,Gasoline,small SUV,3.0,19.0,27.0,22.0,4.0,No


### Q1: Are more unique models using alternative sources of fuel? By how much?

In [194]:
# Number of 2008 rows per fuel type
df_08['fuel'].value_counts()

Gasoline    984
ethanol       1
CNG           1
gas           1
Name: fuel, dtype: int64

In [195]:
# Share (in %) of 2018 rows whose fuel is NOT the most common one.
# value_counts() sorts by count in descending order, so its first entry is the
# most common fuel.
unique_fuel_source_count = df_18.fuel.value_counts()
total_fuel_source_count = unique_fuel_source_count.sum()

# .iloc[0] makes the positional lookup explicit; a bare [0] on this
# string-labelled Series depends on pandas' deprecated integer fallback.
differ = 100 - (unique_fuel_source_count.iloc[0]/total_fuel_source_count)*100
differ

9.975961538461547

> In 2018, about 9.98% of the vehicles in the dataset use a fuel source other than the most common one.


### Q2: How much have vehicle classes improved in fuel economy?  

In [196]:
# List the 2008 column names to pick the ones needed for the fuel-economy comparison
df_08.columns

Index(['model', 'displ', 'cyl', 'trans', 'drive', 'fuel', 'veh_class',
       'air_pollution_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg',
       'greenhouse_gas_score', 'smartway'],
      dtype='object')

##### Mean of cmb_mpg based on various veh_class in 2008 dataset

In [197]:
# Group the 2008 rows by vehicle class, then average cmb_mpg within each class
veh_class_grp_08_df = df_08.groupby(by='veh_class')
veh_class_grp_08_cmb_mpg_mean_series = veh_class_grp_08_df.cmb_mpg.mean()

##### Mean of cmb_mpg based on various veh_class in 2018 dataset

In [198]:
# Group the 2018 rows by vehicle class, then average cmb_mpg within each class
veh_class_grp_18_df = df_18.groupby(by='veh_class')
veh_class_grp_18_cmb_mpg_mean_series = veh_class_grp_18_df.cmb_mpg.mean()

In [199]:
# Increment in cmb_mpg: 2018 mean minus 2008 mean, per vehicle class

In [200]:
# 2018 mean minus 2008 mean per vehicle class. The subtraction aligns on the
# veh_class index, so a label present in only one of the two series yields NaN.
difference = veh_class_grp_18_cmb_mpg_mean_series.sub(veh_class_grp_08_cmb_mpg_mean_series)
difference

veh_class
SUV                     NaN
large car          4.900000
midsize car        6.282609
minivan            1.682353
pickup             2.312635
small SUV               NaN
small car          4.315948
special purpose         NaN
standard SUV            NaN
station wagon      5.162745
van                     NaN
Name: cmb_mpg, dtype: float64

### Q3: What are the characteristics of SmartWay vehicles? Have they changed over time?

In [201]:
# Column names of the 2008 dataset again (same listing as under Q2), for reference
df_08.columns

Index(['model', 'displ', 'cyl', 'trans', 'drive', 'fuel', 'veh_class',
       'air_pollution_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg',
       'greenhouse_gas_score', 'smartway'],
      dtype='object')

In [202]:
# Distinct smartway labels present in the 2008 dataset
smartway_unique_values_df_08 = df_08['smartway'].unique()
smartway_unique_values_df_08

array(['no', 'yes'], dtype=object)

In [203]:
# Filter the 2008 dataset to rows where smartway is 'yes' and get summary statistics for each numeric column.
# The label is matched directly instead of via smartway_unique_values_df_08[1]:
# unique() returns values in order of first appearance, so a positional index
# would silently select a different label if the row order changed.
df_08[df_08['smartway'] == 'yes'].describe()

Unnamed: 0,displ,cyl,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score
count,380.0,380.0,380.0,380.0,380.0,380.0,380.0
mean,2.602895,4.826316,7.365789,20.984211,28.413158,23.736842,6.868421
std,0.623436,1.002025,1.148195,3.442672,3.075194,3.060379,0.827338
min,1.3,4.0,6.0,17.0,22.0,20.0,6.0
25%,2.275,4.0,7.0,19.0,26.0,22.0,6.0
50%,2.4,4.0,7.0,20.0,28.0,23.0,7.0
75%,3.0,6.0,7.0,22.0,30.0,25.0,7.0
max,5.0,8.0,9.5,48.0,45.0,46.0,10.0


In [204]:
# Distinct smartway labels present in the 2018 dataset
smartway_unique_values_df_18 = df_18['smartway'].unique()
smartway_unique_values_df_18

array(['No', 'Yes', 'Elite'], dtype=object)

In [205]:
# Filter the 2018 dataset to rows where smartway is 'Yes' and get summary statistics for each numeric column.
# The label is matched directly instead of via smartway_unique_values_df_18[1]:
# unique() returns values in order of first appearance, so a positional index
# would silently select a different label if the row order changed.
# NOTE(review): rows labelled 'Elite' are excluded here, as before — confirm
# whether they should also count as SmartWay vehicles for this question.
df_18[df_18['smartway'] == 'Yes'].describe()

Unnamed: 0,displ,cyl,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score
count,91.0,91.0,91.0,91.0,91.0,91.0,91.0
mean,1.753846,3.923077,4.879121,31.483516,39.296703,34.43956,7.538462
std,0.420399,0.452911,1.769087,11.23918,9.41098,10.214596,0.860332
min,1.2,3.0,3.0,25.0,27.0,27.0,7.0
25%,1.5,4.0,3.0,28.0,36.0,31.0,7.0
50%,1.6,4.0,5.0,28.0,37.0,32.0,7.0
75%,2.0,4.0,7.0,30.0,40.0,34.0,8.0
max,3.5,6.0,7.0,113.0,99.0,106.0,10.0


### Q4: What features are associated with better fuel economy?

In [206]:
# Average combined mpg across all 2008 rows
cmb_mpg_08_mean = df_08['cmb_mpg'].mean()
cmb_mpg_08_mean

# Keep only the rows whose combined mpg is strictly above that average
cmb_mpg_above_mean_08_df = df_08[df_08['cmb_mpg'] > cmb_mpg_08_mean]
cmb_mpg_above_mean_08_df

19.78824721377913