# Formatting and Normalization

## Formatting

In [1]:
import pandas as pd
import numpy as np

# Read the car-price table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='datasets/CarPrice.csv', index_col=0)
# Preview the first five rows.
df.head(5)

Unnamed: 0_level_0,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [2]:
# Row count before and after removing rows that contain missing values.
# dropna() returns a new frame instead of modifying df, so the result has to
# be assigned back; without the assignment both counts are always equal.
print(len(df))
df = df.dropna()
print(len(df))

205
205


In [3]:
# Show the index dtype first, then the dtype of every column, one per line.
print(df.index.dtype, df.dtypes, sep='\n')

int64
symboling             int64
CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object


In [4]:
# Names of all columns currently stored with the generic object dtype.
cols = df.select_dtypes(include=np.object_).columns.tolist()
cols

['CarName',
 'fueltype',
 'aspiration',
 'doornumber',
 'carbody',
 'drivewheel',
 'enginelocation',
 'enginetype',
 'cylindernumber',
 'fuelsystem']

In [5]:
# Convert every column listed in `cols` to pandas' dedicated string dtype,
# then display the resulting dtypes.
df[cols] = df[cols].astype(dict.fromkeys(cols, 'string'))
df.dtypes

symboling                    int64
CarName             string[python]
fueltype            string[python]
aspiration          string[python]
doornumber          string[python]
carbody             string[python]
drivewheel          string[python]
enginelocation      string[python]
wheelbase                  float64
carlength                  float64
carwidth                   float64
carheight                  float64
curbweight                   int64
enginetype          string[python]
cylindernumber      string[python]
enginesize                   int64
fuelsystem          string[python]
boreratio                  float64
stroke                     float64
compressionratio           float64
horsepower                   int64
peakrpm                      int64
citympg                      int64
highwaympg                   int64
price                      float64
dtype: object

In [6]:
# Display the full car-name column (all rows, single column).
df.loc[:, 'CarName']

car_ID
1            alfa-romero giulia
2           alfa-romero stelvio
3      alfa-romero Quadrifoglio
4                   audi 100 ls
5                    audi 100ls
                 ...           
201             volvo 145e (sw)
202                 volvo 144ea
203                 volvo 244dl
204                   volvo 246
205                 volvo 264gl
Name: CarName, Length: 205, dtype: string

In [7]:
# The manufacturer is the first whitespace-separated token of CarName.
# The vectorised .str accessor is used instead of a Python-level
# x.split()[0] so that a missing or empty name yields a missing value
# rather than raising an exception.
df['CompanyName'] = df['CarName'].str.split().str[0]
# List the distinct manufacturer spellings to spot typos and case variants.
df['CompanyName'].unique()

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'maxda', 'mazda', 'buick', 'mercury',
       'mitsubishi', 'Nissan', 'nissan', 'peugeot', 'plymouth', 'porsche',
       'porcshce', 'renault', 'saab', 'subaru', 'toyota', 'toyouta',
       'vokswagen', 'volkswagen', 'vw', 'volvo'], dtype=object)

In [8]:
# Map misspelled or differently-cased manufacturer names to their canonical
# spelling. Note: this rebinds `cols`, which previously held the list of
# object-dtype column names.
cols = {
    'maxda': 'mazda',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
    'Nissan': 'nissan',
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form acts on an intermediate object,
# triggers a FutureWarning on recent pandas and does not update df under
# Copy-on-Write.
df['CompanyName'] = df['CompanyName'].replace(cols)
df['CompanyName'].unique()

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'mazda', 'buick', 'mercury', 'mitsubishi',
       'nissan', 'peugeot', 'plymouth', 'porsche', 'renault', 'saab',
       'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object)

In [9]:
# Peek at the first five peak-RPM values before scaling them.
df.loc[:, 'peakrpm'].head(5)

car_ID
1    5000
2    5000
3    5000
4    5500
5    5500
Name: peakrpm, dtype: int64

## Standardization

In [12]:
# Single feature scaling: divide each value by the column maximum,
# so the largest value maps to 1.
df['SF_peakrpm'] = df['peakrpm'].pipe(lambda rpm: rpm / rpm.max())
df['SF_peakrpm'].head(5)

car_ID
1    0.757576
2    0.757576
3    0.757576
4    0.833333
5    0.833333
Name: SF_peakrpm, dtype: float64

In [13]:
# Min-Max scaling: shift by the column minimum and divide by the range,
# so the smallest value maps to 0 and the largest to 1.
df['MM_peakrpm'] = df['peakrpm'].pipe(
    lambda rpm: (rpm - rpm.min()) / (rpm.max() - rpm.min())
)
df['MM_peakrpm'].head(5)

car_ID
1    0.346939
2    0.346939
3    0.346939
4    0.551020
5    0.551020
Name: MM_peakrpm, dtype: float64