# <font color="blue">Lesson 6 - Feature Engineering and Selection</font>
# One Hot Encoding
For this lesson, we'll import the Automobile Dataset from the UCI Machine Learning repository.  

## Import Data

In [1]:
import pandas as pd
import numpy as np

# add headers to the data
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values='?' )
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


## Pre-Process Data
Before we can beging working with this dataset, let's explore the dataset and do some parsing. 

In [2]:
# check what datatypes the dataframe contains and make sure they are correct
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [3]:
# check for nulls
df[df.isnull().any(axis=1)].head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
5,2,,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,15250.0
7,1,,audi,gas,std,four,wagon,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,18920.0


In [4]:
# use pandas fillna function to remove NaN

df = df.fillna(0)
df.head()


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,0.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,0.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,0.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


## Consider this
What did the pandas fill the NaN values with? Why? What could be a better fill?

See <a href="https://pandas.pydata.org/pandas-docs/version/0.23/generated/pandas.DataFrame.fillna.html">pandas fillna documentation</a>

### Convert categorical strings to numeric
There are two columns in this dataset that use strings for numerical values instead of just using the number; num_doors and num_cylinders. 


In [5]:
df['num_doors'].head()

0     two
1     two
2     two
3    four
4    four
Name: num_doors, dtype: object

In [6]:
df['num_cylinders'].head()

0    four
1    four
2     six
3    four
4    five
Name: num_cylinders, dtype: object

In Pandas we can use dictionaries to create a map between old values and new values, and this will work great for converting these strings into actual numbers. 

In [7]:
# here's how it works on the new doors column
new_door_counts = {"four":4, "two":2}
df.replace(new_door_counts, inplace=True)
df["num_doors"].head()

0    2
1    2
2    2
3    4
4    4
Name: num_doors, dtype: int64

In [8]:
type(new_door_counts)

dict

## Your Turn

In [9]:
# here are the values you will need to map for the num_cylinders column
df["num_cylinders"].unique()

array([4, 'six', 'five', 'three', 'twelve', 2, 'eight'], dtype=object)

Following the code we used to map num_doors, create a mapping dictionary and replace the values for the num_cylinders column. 

In [16]:
# string to number dictionary 
new_num_cyls = {"six":6, "five":5, "twelve":12, "eight":8, "4":4, "2":2}

# pandas replace on df
df.replace(new_num_cyls,inplace=True)

In [17]:
df["num_cylinders"].head()

0    4
1    4
2    6
3    4
4    5
Name: num_cylinders, dtype: object

## One Hot Encoding with Pandas
Now that we've parsed our dataset, we can use pandas get_dummies function to one-hot-encode the categorical variables into integers. 

First we'll use pandas select_dtypes() function to pull out columns that are categorical; in pandas categorical data is stored in an object dtype. 

In [18]:
# store object column names in a list
obj_cols = df.select_dtypes(include=["object"]).columns
obj_cols

Index(['make', 'fuel_type', 'aspiration', 'body_style', 'drive_wheels',
       'engine_location', 'engine_type', 'num_cylinders', 'fuel_system'],
      dtype='object')

Now we can use pandas get_dummies function to one hot encode. The syntax is: 

`pd.get_dummies(dataframe, columns=[list of object cols])`

In [20]:
# encode your dataframe
pd.get_dummies(df, columns= obj_cols)

Unnamed: 0,symboling,normalized_losses,num_doors,wheel_base,length,width,height,curb_weight,engine_size,bore,...,num_cylinders_12,num_cylinders_three,fuel_system_1bbl,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,3,0.0,2,88.6,168.8,64.1,48.8,2548,130,3.47,...,0,0,0,0,0,0,0,1,0,0
1,3,0.0,2,88.6,168.8,64.1,48.8,2548,130,3.47,...,0,0,0,0,0,0,0,1,0,0
2,1,0.0,2,94.5,171.2,65.5,52.4,2823,152,2.68,...,0,0,0,0,0,0,0,1,0,0
3,2,164.0,4,99.8,176.6,66.2,54.3,2337,109,3.19,...,0,0,0,0,0,0,0,1,0,0
4,2,164.0,4,99.4,176.6,66.4,54.3,2824,136,3.19,...,0,0,0,0,0,0,0,1,0,0
5,2,0.0,2,99.8,177.3,66.3,53.1,2507,136,3.19,...,0,0,0,0,0,0,0,1,0,0
6,1,158.0,4,105.8,192.7,71.4,55.7,2844,136,3.19,...,0,0,0,0,0,0,0,1,0,0
7,1,0.0,4,105.8,192.7,71.4,55.7,2954,136,3.19,...,0,0,0,0,0,0,0,1,0,0
8,1,158.0,4,105.8,192.7,71.4,55.9,3086,131,3.13,...,0,0,0,0,0,0,0,1,0,0
9,0,0.0,2,99.5,178.2,67.9,52.0,3053,131,3.13,...,0,0,0,0,0,0,0,1,0,0


We can see from the dataframe above that there are now a lot of redundant columns in our dataframe. Add the argument "drop_first=True" to the get_dummies function you used above and see what happens. 

In [None]:
# encode your data frame again, but add the following argument
drop_first=True

## One Hot Encoding with Sklearn
Sklearn also provides a method for one hot encoding. Some people prefer pandas because it's more straightforward, but you can decide on your own. 

In [None]:
from sklearn.preprocessing import LabelEncoder

# instantiate encoder
le = LabelEncoder()

# fit and transform the object columns
# use df.apply() to apply le.fit_transform to object columns
df2 = df.apply(le.fit_transform, ignore_failures=True)

In [None]:
df2.head()

### Congratulations
So which method do you prefer for one hot encoding?