# Feature Engineering of Automobile data

### Importing required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### Reading the dataset

In [2]:
# Load the raw automobile table; na_values='?' makes pandas parse '?' cells as NaN.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
csv_path = '/Users/manis/Downloads/Automobile_data.csv'
data = pd.read_csv(csv_path, na_values='?')
data.head(10)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
5,2,,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,15250.0
6,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0
7,1,,audi,gas,std,four,wagon,fwd,front,105.8,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,18920.0
8,1,158.0,audi,gas,turbo,four,sedan,fwd,front,105.8,...,131,mpfi,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0
9,0,,audi,gas,turbo,two,hatchback,4wd,front,99.5,...,131,mpfi,3.13,3.4,7.0,160.0,5500.0,16,22,


### Let's find the features with NaN values

In [3]:
# Per-column count of missing (NaN) entries.
data.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [4]:
# Collect every column that has at least one missing value.
# The threshold is "> 0" (not "> 1") so a column with exactly one NaN is
# not silently left out of the imputation steps that use this list.
nan_counts = data.isnull().sum()  # computed once instead of once per column
feature_with_nan = [feature for feature in data.columns if nan_counts[feature] > 0]
for feature in feature_with_nan:
    print(feature, ' has {} missing values'.format(nan_counts[feature]))

normalized-losses  has 41 missing values
num-of-doors  has 2 missing values
bore  has 4 missing values
stroke  has 4 missing values
horsepower  has 2 missing values
peak-rpm  has 2 missing values
price  has 4 missing values


In [5]:
# Preview the first 10 rows of the columns collected in feature_with_nan.
data[feature_with_nan].head(10)

Unnamed: 0,normalized-losses,num-of-doors,bore,stroke,horsepower,peak-rpm,price
0,,two,3.47,2.68,111.0,5000.0,13495.0
1,,two,3.47,2.68,111.0,5000.0,16500.0
2,,two,2.68,3.47,154.0,5000.0,16500.0
3,164.0,four,3.19,3.4,102.0,5500.0,13950.0
4,164.0,four,3.19,3.4,115.0,5500.0,17450.0
5,,two,3.19,3.4,110.0,5500.0,15250.0
6,158.0,four,3.19,3.4,110.0,5500.0,17710.0
7,,four,3.19,3.4,110.0,5500.0,18920.0
8,158.0,four,3.13,3.4,140.0,5500.0,23875.0
9,,two,3.13,3.4,160.0,5500.0,


### Let's separate the features with NaN values into numerical and categorical

In [6]:
# Split the NaN-containing columns into numerical and categorical ones.
# is_numeric_dtype is used instead of comparing the dtype with 'O' so that
# non-object text/categorical dtypes are never treated as numeric (which
# would make the median imputation fail on them).
num_nan = [feature for feature in feature_with_nan
           if pd.api.types.is_numeric_dtype(data[feature])]
cat_nan = [feature for feature in feature_with_nan
           if not pd.api.types.is_numeric_dtype(data[feature])]
print('Categorical feature with NaN : ', cat_nan)
print('\nNumerical feature with NaN : ', num_nan)

Categorical feature with NaN :  ['num-of-doors']

Numerical feature with NaN :  ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']


### We replace missing numerical values with the median and missing categorical values with the mode, i.e. the most frequently occurring value

In [7]:
def median_imputation(data, nan_features):
    """Return a copy of ``data`` with NaNs in the given columns replaced by the column median.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; it is copied and never modified.
    nan_features : iterable of column labels
        Columns whose missing values should be filled. Each column's own
        median (computed with NaNs ignored) is used as the fill value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns imputed; other columns are unchanged.
    """
    df = data.copy()
    for feature in nan_features:
        median = df[feature].median()
        # Assign the filled column back instead of calling
        # df[feature].fillna(..., inplace=True): that form operates on a
        # temporary Series, is deprecated, and does not update df under
        # pandas Copy-on-Write.
        df[feature] = df[feature].fillna(median)
    return df

In [8]:
# Fill the numerical NaN columns (num_nan) with their medians and rebind data to the result.
data = median_imputation(data, num_nan)

In [9]:
def mode_imputation(data, nan_features):
    """Return a copy of ``data`` with NaNs in the given columns replaced by the column mode.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; it is copied and never modified.
    nan_features : iterable of column labels
        Columns whose missing values should be filled with their most
        frequent value. When several values tie, the first one returned by
        ``Series.mode()`` is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns imputed. A column that contains
        no non-missing value has no mode and is left as it is.
    """
    df = data.copy()
    for feature in nan_features:
        modes = df[feature].mode()
        if modes.empty:
            # Entirely-NaN column: there is nothing to impute with.
            continue
        # Assign back instead of fillna(..., inplace=True) on df[feature],
        # which works on a temporary Series and does not update df under
        # pandas Copy-on-Write.
        df[feature] = df[feature].fillna(modes.iloc[0])
    return df

In [10]:
# Fill the categorical NaN columns (cat_nan) with their modes and rebind data to the result.
data = mode_imputation(data, cat_nan)

In [11]:
# Show the same columns again after imputation to check the filled values.
data[feature_with_nan].head(10)

Unnamed: 0,normalized-losses,num-of-doors,bore,stroke,horsepower,peak-rpm,price
0,115.0,two,3.47,2.68,111.0,5000.0,13495.0
1,115.0,two,3.47,2.68,111.0,5000.0,16500.0
2,115.0,two,2.68,3.47,154.0,5000.0,16500.0
3,164.0,four,3.19,3.4,102.0,5500.0,13950.0
4,164.0,four,3.19,3.4,115.0,5500.0,17450.0
5,115.0,two,3.19,3.4,110.0,5500.0,15250.0
6,158.0,four,3.19,3.4,110.0,5500.0,17710.0
7,115.0,four,3.19,3.4,110.0,5500.0,18920.0
8,158.0,four,3.13,3.4,140.0,5500.0,23875.0
9,115.0,two,3.13,3.4,160.0,5500.0,10295.0


In [12]:
# Recount missing entries per column after imputation.
data.isna().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

### We can see that there are no NaN values left: all missing values have been filled

### There are many categorical features, each with many labels. To train a model, categorical features need to be converted into numerical ones. Encoding them would add many columns, and the model may then suffer from the curse of dimensionality, so we do not use the categorical features for training

In [13]:
# Keep only the numeric columns for training.
# is_numeric_dtype is used instead of "dtype != 'O'" so that text or
# categorical columns stored with a non-object dtype are not picked up as
# numeric and later passed to np.log.
num_feature = [feature for feature in data.columns
               if pd.api.types.is_numeric_dtype(data[feature])]
data[num_feature].head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,115.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,115.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,115.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [14]:
# Log-transform every numeric column whose values are all strictly positive.
# Columns containing a zero OR a negative value are left untouched: the
# logarithm is not defined there and np.log would yield -inf / NaN.
for feature in num_feature:
    if (data[feature] <= 0).any():
        continue
    data[feature] = np.log(data[feature])

In [15]:
# Select the numeric columns (num_feature) as the table used for training.
data_for_training = data[num_feature]

In [16]:
# Preview the first rows of the training table.
data_for_training.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,4.744932,4.484132,5.128715,4.160444,3.88773,7.843064,4.867534,1.244155,0.985817,2.197225,4.70953,8.517193,3.044522,3.295837,9.510075
1,3,4.744932,4.484132,5.128715,4.160444,3.88773,7.843064,4.867534,1.244155,0.985817,2.197225,4.70953,8.517193,3.044522,3.295837,9.711116
2,1,4.744932,4.5486,5.142832,4.18205,3.958907,7.945555,5.023881,0.985817,1.244155,2.197225,5.036953,8.517193,2.944439,3.258097,9.711116
3,2,5.099866,4.603168,5.173887,4.19268,3.994524,7.756623,4.691348,1.160021,1.223775,2.302585,4.624973,8.612503,3.178054,3.401197,9.543235
4,2,5.099866,4.599152,5.173887,4.195697,3.994524,7.94591,4.912655,1.160021,1.223775,2.079442,4.744932,8.612503,2.890372,3.091042,9.767095


### Now our data is ready for training, so let's save it

In [17]:
# Write the training table to 'auto_data_cleaned.csv' (relative path), omitting the index column.
data_for_training.to_csv('auto_data_cleaned.csv', index=False)