In [8]:
import pandas as pd
import numpy as np

## Convert continuous variables into categorical discrete variables

In [9]:
df= pd.read_csv('./data/auto-mpg.csv')
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [10]:
df['horsepower'].value_counts(dropna=False) 

150    22
90     20
88     19
110    18
100    17
       ..
61      1
93      1
148     1
152     1
82      1
Name: horsepower, Length: 94, dtype: int64

In [11]:
df[df['horsepower']=='?']

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
32,25.0,4,98.0,?,2046,19.0,71,1,ford pinto
126,21.0,6,200.0,?,2875,17.0,74,1,ford maverick
330,40.9,4,85.0,?,1835,17.3,80,2,renault lecar deluxe
336,23.6,4,140.0,?,2905,14.3,80,1,ford mustang cobra
354,34.5,4,100.0,?,2320,15.8,81,2,renault 18i
374,23.0,4,151.0,?,3035,20.5,82,1,amc concord dl


In [12]:
df['horsepower'].replace('?',np.nan,inplace=True)
df.dropna(subset=['horsepower'],axis=0,inplace=True)
df['horsepower']=df['horsepower'].astype('float')

In [13]:
count, bin_dividers = np.histogram(df['horsepower'], bins=3)
print(bin_dividers)
print(count)

[ 46.         107.33333333 168.66666667 230.        ]
[257 103  32]


In [15]:
bin_names = ['Low Output','Normal Output','High Output']
df['hp_bin'] =pd.cut(x=df['horsepower'],
                     bins=bin_dividers,
                     labels = bin_names,
                     include_lowest=True)
df 


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,hp_bin
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,Normal Output
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,Normal Output
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,Normal Output
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,Normal Output
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,Normal Output
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,1,ford mustang gl,Low Output
394,44.0,4,97.0,52.0,2130,24.6,82,2,vw pickup,Low Output
395,32.0,4,135.0,84.0,2295,11.6,82,1,dodge rampage,Low Output
396,28.0,4,120.0,79.0,2625,18.6,82,1,ford ranger,Low Output


# dummies

In [16]:
horse_power_dummies = pd.get_dummies(df['hp_bin'])
horse_power_dummies 

Unnamed: 0,Low Output,Normal Output,High Output
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
393,1,0,0
394,1,0,0
395,1,0,0
396,1,0,0


In [17]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
horse_power_ohe=enc.fit_transform(df[['hp_bin']])
horse_power_ohe

<392x3 sparse matrix of type '<class 'numpy.float64'>'
	with 392 stored elements in Compressed Sparse Row format>

In [19]:
print(horse_power_ohe.toarray())


[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


In [20]:
print(enc.categories_)


[array(['High Output', 'Low Output', 'Normal Output'], dtype=object)]


In [21]:
horse_power_ohe.shape

(392, 3)

In [23]:
transformed_df = pd.DataFrame(
    horse_power_ohe.toarray(), 
    columns=enc.get_feature_names()
)
transformed_df

Unnamed: 0,x0_High Output,x0_Low Output,x0_Normal Output
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
...,...,...,...
387,0.0,1.0,0.0
388,0.0,1.0,0.0
389,0.0,1.0,0.0
390,0.0,1.0,0.0


# Normalization 

### MinMax

[MinMax](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)

In [24]:
df['horsepower'].describe()

count    392.000000
mean     104.469388
std       38.491160
min       46.000000
25%       75.000000
50%       93.500000
75%      126.000000
max      230.000000
Name: horsepower, dtype: float64

In [25]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,hp_bin
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,Normal Output
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,Normal Output
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,Normal Output
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,Normal Output
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,Normal Output
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,1,ford mustang gl,Low Output
394,44.0,4,97.0,52.0,2130,24.6,82,2,vw pickup,Low Output
395,32.0,4,135.0,84.0,2295,11.6,82,1,dodge rampage,Low Output
396,28.0,4,120.0,79.0,2625,18.6,82,1,ford ranger,Low Output


In [26]:
# check min
print(min(df['horsepower']))
print(max(df['horsepower']))

46.0
230.0


In [27]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df['horsepower_scaled']=scaler.fit_transform(df[['horsepower']])

In [28]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,hp_bin,horsepower_scaled
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,Normal Output,0.456522
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,Normal Output,0.646739
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,Normal Output,0.565217
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,Normal Output,0.565217
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,Normal Output,0.510870
...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,1,ford mustang gl,Low Output,0.217391
394,44.0,4,97.0,52.0,2130,24.6,82,2,vw pickup,Low Output,0.032609
395,32.0,4,135.0,84.0,2295,11.6,82,1,dodge rampage,Low Output,0.206522
396,28.0,4,120.0,79.0,2625,18.6,82,1,ford ranger,Low Output,0.179348


In [29]:
df[['horsepower','horsepower_scaled']].describe()

Unnamed: 0,horsepower,horsepower_scaled
count,392.0,392.0
mean,104.469388,0.317768
std,38.49116,0.209191
min,46.0,0.0
25%,75.0,0.157609
50%,93.5,0.258152
75%,126.0,0.434783
max,230.0,1.0
