### Perform - 
1. Label Encoder
2. One hot Encoder
3. Ordinal Encoder

In [24]:
import pandas as pd 
import warnings
warnings.filterwarnings('ignore') 

In [25]:
df = pd.read_csv('data/insurance.csv') 

In [26]:
df.head() 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [28]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [29]:
y = df['charges']
x =  df.drop('charges', axis=1)

In [30]:
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [31]:
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

#### Label Encoder

In [32]:
from sklearn.preprocessing import LabelEncoder

In [33]:
le = LabelEncoder()

In [34]:
df1= df.copy()

In [35]:
df1.sex = le.fit_transform(df[['sex']])

In [36]:
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [37]:
le.classes_

array(['female', 'male'], dtype=object)

In [38]:
df1.region = le.fit_transform(df[['region']])

In [39]:
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,3,16884.924
1,18,1,33.77,1,no,2,1725.5523
2,28,1,33.0,3,no,2,4449.462
3,33,1,22.705,0,no,1,21984.47061
4,32,1,28.88,0,no,1,3866.8552


In [40]:
le.classes_

array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)

##### Decode the encoded values back to original classes

In [41]:
decoded_sex = le.inverse_transform(df1.sex)
decoded_sex

array(['northeast', 'northwest', 'northwest', ..., 'northeast',
       'northeast', 'northeast'], dtype=object)

In [42]:
decoded_region = le.inverse_transform(df1.region)
decoded_region

array(['southwest', 'southeast', 'southeast', ..., 'southeast',
       'southwest', 'northwest'], dtype=object)

In [43]:
df1.sex = decoded_sex
df1.region = decoded_region
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,northeast,27.9,0,yes,southwest,16884.924
1,18,northwest,33.77,1,no,southeast,1725.5523
2,28,northwest,33.0,3,no,southeast,4449.462
3,33,northwest,22.705,0,no,northwest,21984.47061
4,32,northwest,28.88,0,no,northwest,3866.8552


In [44]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


#### One hot Encoder

In [52]:
df2 = pd.read_csv('data/insurance.csv') 

In [53]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [54]:
region = pd.get_dummies(df2['region'], drop_first=True, prefix='region', prefix_sep='_' )
region.head()

Unnamed: 0,region_northwest,region_southeast,region_southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0


In [55]:
df2.drop('region', axis=1, inplace=True)

In [56]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [57]:
new_df2 = pd.concat([df2, region], axis=1)

In [58]:
new_df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,female,27.9,0,yes,16884.924,0,0,1
1,18,male,33.77,1,no,1725.5523,0,1,0
2,28,male,33.0,3,no,4449.462,0,1,0
3,33,male,22.705,0,no,21984.47061,1,0,0
4,32,male,28.88,0,no,3866.8552,1,0,0


#### Ordinal Encoder

In [60]:
df3 = df.copy()

In [61]:
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [66]:
region = df3.region.unique()
region

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [64]:
from sklearn.preprocessing import OrdinalEncoder

In [70]:
ordinal = OrdinalEncoder(categories = [['southwest', 'southeast', 'northwest', 'northeast']])

In [71]:
ordinal

In [72]:
df3.region = ordinal.fit_transform(df3[['region']])

In [73]:
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,0.0,16884.924
1,18,male,33.77,1,no,1.0,1725.5523
2,28,male,33.0,3,no,1.0,4449.462
3,33,male,22.705,0,no,2.0,21984.47061
4,32,male,28.88,0,no,2.0,3866.8552
