# Data source: UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/index.php)

In [7]:
import numpy as np
import pandas as pd

# Column names to apply to the file, which is read with header=None below.
headers = [
    'symboling', 'normalized_losses', 'make', 'fuel-type', 'aspiration',
    'num_doors', 'body_style', 'drive_wheels', 'engine_location',
    'wheel_base', 'length', 'width', 'height', 'curb_weight',
    'engine_type', 'num_cylinders', 'engine_size', 'fuel_system',
    'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm',
    'city_mpg', 'highway_mpg', 'price',
]

# Entries equal to '?' are parsed as missing values (NaN).
# NOTE(review): this URL is a plain-HTTP mirror rather than the archive site
# referenced at the top of the notebook -- confirm it is still reachable.
df = pd.read_csv(
    'http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data',
    header=None,
    names=headers,
    na_values='?',
)

In [8]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,symboling,normalized_losses,make,fuel-type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [9]:
# Show the dtype pandas assigned to each column.
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel-type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [16]:
# Take an independent copy of only the object-dtype columns, so the encoding
# steps applied to df_obj do not modify df.
df_obj = df.select_dtypes(include='object').copy()
df_obj.head(n=5)

Unnamed: 0,make,fuel-type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [22]:
# Distinct values currently stored in num_doors.
# NOTE(review): the saved output below (numeric 2./4.) was produced after the
# replace/fillna cells that follow; on a fresh top-to-bottom run this cell
# would show the raw labels instead.
df_obj.num_doors.unique()

array([2., 4.])

In [19]:
# Swap the spelled-out door counts for their numeric equivalents.
df_obj['num_doors'] = df_obj['num_doors'].replace({'two': 2, 'four': 4})

In [21]:
# Fill any missing num_doors entries with 4.
df_obj['num_doors'] = df_obj['num_doors'].fillna(4)

In [46]:
# Preview the first five rows of the working copy.
# NOTE(review): the saved output below already contains columns that are only
# created in later cells, i.e. it comes from an out-of-order run.
df_obj.head(n=5)

Unnamed: 0,make,fuel-type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,boday_style_code,engine_type_code
0,alfa-romero,gas,std,2.0,convertible,rwd,front,dohc,4,mpfi,0,1
1,alfa-romero,gas,std,2.0,convertible,rwd,front,dohc,4,mpfi,0,1
2,alfa-romero,gas,std,2.0,hatchback,rwd,front,ohcv,6,mpfi,2,1
3,audi,gas,std,4.0,sedan,fwd,front,ohc,4,mpfi,3,1
4,audi,gas,std,4.0,sedan,4wd,front,ohc,5,mpfi,3,1


In [27]:
# Convert the spelled-out cylinder counts to integers.
df_obj['num_cylinders'] = df_obj['num_cylinders'].replace({
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'eight': 8,
    'twelve': 12,
})

In [28]:
# Check which cylinder counts are present after the conversion.
df_obj.num_cylinders.unique()

array([ 4,  6,  5,  3, 12,  2,  8], dtype=int64)

In [45]:
# Show the dtype of each column in the working copy.
# NOTE(review): the saved output below lists columns created in cells that
# appear further down, so it reflects an out-of-order run.
df_obj.dtypes

make                  object
fuel-type             object
aspiration            object
num_doors            float64
body_style          category
drive_wheels          object
engine_location       object
engine_type           object
num_cylinders          int64
fuel_system           object
boday_style_code        int8
engine_type_code       int32
dtype: object

In [31]:
# Store body_style as a pandas categorical so its integer codes can be read off.
df_obj['body_style'] = df_obj.body_style.astype('category')

In [35]:
# Label-encode body_style using the integer codes of its categorical dtype.
# NOTE(review): the column name is spelled 'boday_style_code'; it is kept
# unchanged here because the name is part of the frame's schema.
df_obj['boday_style_code'] = df_obj['body_style'].cat.codes

In [39]:
# One-hot encode body_style and preview the result; df_obj itself is not modified.
pd.get_dummies(data=df_obj, columns=['body_style']).head(n=5)

Unnamed: 0,make,fuel-type,aspiration,num_doors,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,boday_style_code,body_style_convertible,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon
0,alfa-romero,gas,std,2.0,rwd,front,dohc,4,mpfi,0,1,0,0,0,0
1,alfa-romero,gas,std,2.0,rwd,front,dohc,4,mpfi,0,1,0,0,0,0
2,alfa-romero,gas,std,2.0,rwd,front,ohcv,6,mpfi,2,0,0,1,0,0
3,audi,gas,std,4.0,fwd,front,ohc,4,mpfi,3,0,0,0,1,0
4,audi,gas,std,4.0,4wd,front,ohc,5,mpfi,3,0,0,0,1,0


In [44]:
# Binary flag: 1 when the engine_type label contains 'ohc', otherwise 0.
# na=False makes a missing engine_type count as "no match". Without it,
# str.contains yields NaN for such rows, and np.where treats NaN as truthy,
# so a missing value would be silently encoded as 1.
df_obj['engine_type_code'] = np.where(df_obj['engine_type'].str.contains('ohc', na=False), 1, 0)

In [47]:
from sklearn.preprocessing import LabelEncoder

In [49]:
# Number of distinct values in make (dropna=False keeps a NaN entry in the
# count, matching len(unique())).
df_obj['make'].nunique(dropna=False)

22

In [50]:
# Create the label encoder; it is fitted on the make column in the next cell.
make_encoder = LabelEncoder()

In [51]:
# Fit the encoder on the make labels and store the resulting codes in a new column.
df_obj['make_code'] = make_encoder.fit_transform(df_obj['make'])
df_obj.head(n=5)

Unnamed: 0,make,fuel-type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,boday_style_code,engine_type_code,make_code
0,alfa-romero,gas,std,2.0,convertible,rwd,front,dohc,4,mpfi,0,1,0
1,alfa-romero,gas,std,2.0,convertible,rwd,front,dohc,4,mpfi,0,1,0
2,alfa-romero,gas,std,2.0,hatchback,rwd,front,ohcv,6,mpfi,2,1,0
3,audi,gas,std,4.0,sedan,fwd,front,ohc,4,mpfi,3,1,1
4,audi,gas,std,4.0,sedan,4wd,front,ohc,5,mpfi,3,1,1
