## Fix `city_mpg`, `hwy_mpg`, `cmb_mpg` datatypes
    2008 and 2018: convert string to float

Load datasets `data_08_v4.csv` and `data_18_v4.csv`. You should've created these data files in the previous section: *Fixing Data Types Pt 2*.

In [13]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last
# one; the multi-line type() checks further down rely on this to show all values.
InteractiveShell.ast_node_interactivity = 'all'

# Turn on IPython's "greedy" tab-completion option for this session.
%config IPCompleter.greedy = True

In [14]:
import pandas as pd
import numpy as np

In [15]:
# Read the 2008 data written by the previous section and preview
# its first two rows.
df_08 = pd.read_csv('data_08_v4.csv')
df_08.head(2)

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
0,ACURA MDX,3.7,6,Auto-S5,4WD,Gasoline,SUV,7.0,15,20,17,4,no
1,ACURA RDX,2.3,4,Auto-S5,4WD,Gasoline,SUV,7.0,17,22,19,5,no


In [16]:
# Read the 2018 data written by the previous section and preview
# its first two rows.
df_18 = pd.read_csv('data_18_v4.csv')
df_18.head(2)

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway
0,ACURA RDX,3.5,6.0,SemiAuto-6,2WD,Gasoline,small SUV,3.0,20,28,23,5,No
1,ACURA RDX,3.5,6.0,SemiAuto-6,4WD,Gasoline,small SUV,3.0,19,27,22,4,No


In [17]:
# Types of 'city_mpg', 'hwy_mpg', 'cmb_mpg' and 'greenhouse_gas_score' in the 2018
# dataset BEFORE converting them to float. Only the value in row 0 is inspected,
# as a quick spot check of each column.
type(df_18.loc[0, 'city_mpg']) # type of the row-0 value in 'city_mpg'
type(df_18.loc[0, 'hwy_mpg']) # type of the row-0 value in 'hwy_mpg'
type(df_18.loc[0, 'cmb_mpg']) # type of the row-0 value in 'cmb_mpg'
type(df_18.loc[0, 'greenhouse_gas_score']) # type of the row-0 value in 'greenhouse_gas_score'

numpy.int64

numpy.int64

numpy.int64

numpy.int64

In [18]:
# Cast the three mpg columns and the greenhouse gas score to float64
# in both the 2018 and the 2008 dataset.
mpg_columns = ['city_mpg', 'hwy_mpg', 'cmb_mpg', 'greenhouse_gas_score']
for frame in (df_18, df_08):
    for column in mpg_columns:
        frame[column] = frame[column].astype('float64')

In [19]:
# Types of 'city_mpg', 'hwy_mpg', 'cmb_mpg' and 'greenhouse_gas_score' in the 2018
# dataset AFTER converting them to float. Only the value in row 0 is inspected,
# as a quick spot check of each column.
type(df_18.loc[0, 'city_mpg']) # type of the row-0 value in 'city_mpg'
type(df_18.loc[0, 'hwy_mpg']) # type of the row-0 value in 'hwy_mpg'
type(df_18.loc[0, 'cmb_mpg']) # type of the row-0 value in 'cmb_mpg'
type(df_18.loc[0, 'greenhouse_gas_score']) # type of the row-0 value in 'greenhouse_gas_score'

numpy.float64

numpy.float64

numpy.float64

numpy.float64

## Fix `cyl` datatype
    2008: convert from int to float

In [20]:
# convert the 2008 'cyl' column to float64 (int -> float, as stated in the section heading)
df_08['cyl'] = df_08['cyl'].astype('float64')

## All the datatypes are now fixed! Do one last check to confirm all the changes.

In [21]:
# Show the dtype of every column in the 2008 dataset.
df_08.dtypes

model                    object
displ                   float64
cyl                     float64
trans                    object
drive                    object
fuel                     object
veh_class                object
air_pollution_score     float64
city_mpg                float64
hwy_mpg                 float64
cmb_mpg                 float64
greenhouse_gas_score    float64
smartway                 object
dtype: object

In [22]:
# Show the dtype of every column in the 2018 dataset.
df_18.dtypes

model                    object
displ                   float64
cyl                     float64
trans                    object
drive                    object
fuel                     object
veh_class                object
air_pollution_score     float64
city_mpg                float64
hwy_mpg                 float64
cmb_mpg                 float64
greenhouse_gas_score    float64
smartway                 object
dtype: object

In [23]:
# Column-by-column comparison of the dtypes of the two datasets; the resulting
# boolean Series is displayed as this cell's output.
df_08.dtypes == df_18.dtypes
# Turn the visual check into an enforced one: stop a "Run All" here if any
# column's dtype differs, so mismatched frames never reach the save step.
assert (df_08.dtypes == df_18.dtypes).all(), 'df_08 and df_18 column dtypes differ'

model                   True
displ                   True
cyl                     True
trans                   True
drive                   True
fuel                    True
veh_class               True
air_pollution_score     True
city_mpg                True
hwy_mpg                 True
cmb_mpg                 True
greenhouse_gas_score    True
smartway                True
dtype: bool

In [24]:
# Write both cleaned datasets to new CSV files, leaving the row index out.
for frame, filename in ((df_08, 'clean_08.csv'), (df_18, 'clean_18.csv')):
    frame.to_csv(filename, index=False)