In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the insurance dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv('insurance.csv')

In [3]:
# Preview the first rows to see which columns are numeric and which hold text.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Count the missing values in each column.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
# Four independent copies of the raw frame, so each encoding demo below can
# modify its own copy without touching `df` or the other copies.
df1, df2, df3, df4 = (df.copy() for _ in range(4))


# Label Encoder

In [6]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance; every fit/fit_transform call re-fits it from scratch.
le = LabelEncoder()

In [7]:
# Text-valued columns that have to be converted to numbers.
categorical_columns = [
    'sex',
    'smoker',
    'region',
]

In [8]:
# Label-encode every categorical column of df1 in place.
# `le` is re-fitted for each column, so after the loop it only remembers the
# classes of the last column. `label_classes` keeps the classes of every
# column so an integer code can still be mapped back to its original label:
# code k in column c corresponds to label_classes[c][k].
label_classes = {}
for col in categorical_columns:
    df1[col] = le.fit_transform(df1[col])
    label_classes[col] = le.classes_

df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [9]:
# Alternative way: pick the columns to encode by dtype (every non-numeric column) instead of listing them by hand.

In [10]:
# Use the public pandas API; `pandas.core.*` is internal and may move between releases.
from pandas.api.types import is_numeric_dtype

In [11]:
# List the column names of df1 that the dtype-based loop will iterate over.
df1.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [12]:
# Label-encode every non-numeric column of df1, detecting them by dtype.
for col in df1.columns:
    if is_numeric_dtype(df1[col]):
        continue  # already numeric, nothing to encode
    # fit_transform returns a new array and does not modify df1, so the
    # result must be assigned back to the column.
    df1[col] = le.fit_transform(df1[col])
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# One-Hot Encoding

In [13]:
# df2 is a separate copy of the raw data, so its categorical columns still hold text.
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [14]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' leaves out the indicator column of the first category of each
# feature; handle_unknown='ignore' makes transform tolerate categories that
# were not seen during fit instead of raising.
ohe = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
)

In [15]:
# A single selected column is one-dimensional.
df2['sex'].shape

(1338,)

In [16]:
# reshape(-1, 1) turns the column into a 2-D array with one feature column.
df2['sex'].values.reshape(-1,1).shape

(1338, 1)

In [18]:
# Single-feature demo: fit on the reshaped 'sex' column and convert the result
# to a dense array for display. Because of drop='first', a two-category
# feature yields a single indicator column.
ohe.fit_transform(df2['sex'].values.reshape(-1,1)).toarray()

array([[0.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [19]:
# Categories learned by the most recent fit (here: the 'sex' column).
ohe.categories_

[array(['female', 'male'], dtype=object)]

In [20]:
# Same demo for 'smoker'; fit_transform re-fits `ohe`, replacing what it
# learned from the previous column.
ohe.fit_transform(df2['smoker'].values.reshape(-1,1)).toarray()

array([[1.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [21]:
# Categories learned by the most recent fit (here: the 'smoker' column).
ohe.categories_

[array(['no', 'yes'], dtype=object)]

In [23]:
# Encode all three categorical columns in one call and convert the result to
# a dense array. With drop='first' the first category of each feature gets no
# column of its own.
ohe_array = ohe.fit_transform(df2[['sex','smoker','region']]).toarray()
ohe_array


array([[0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.]])

In [24]:
# One category array per encoded feature, in the order the columns were passed to fit.
ohe.categories_

[array(['female', 'male'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]

In [25]:
# Keep the per-feature category arrays; they are used to build column names
# for the encoded array.
features = ohe.categories_
features

[array(['female', 'male'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]

In [27]:
# Flatten the per-feature category arrays into one list of column names,
# skipping the first category of each feature (it has no column because the
# encoder was built with drop='first').
features_lst = [
    category
    for feature_categories in features
    for category in feature_categories[1:]
]
features_lst

['male', 'yes', 'northwest', 'southeast', 'southwest']

In [29]:
# Wrap the dense one-hot array in a DataFrame with readable column names.
# Reusing df2's index keeps the rows aligned with df2 when the two frames are
# joined column-wise (pd.concat aligns on index labels, not on row position).
ohe_features = pd.DataFrame(ohe_array, columns=features_lst, index=df2.index)
ohe_features.head()

Unnamed: 0,male,yes,northwest,southeast,southwest
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0


In [30]:
# Remove the original text columns now that their one-hot version exists.
# Re-binding df2 (rather than inplace=True) together with errors='ignore'
# makes the cell safe to re-run: a second execution no longer raises KeyError
# for columns that are already gone.
df2 = df2.drop(columns=['sex','smoker','region'], errors='ignore')
df2.head()

Unnamed: 0,age,bmi,children,charges
0,19,27.9,0,16884.924
1,18,33.77,1,1725.5523
2,28,33.0,3,4449.462
3,33,22.705,0,21984.47061
4,32,28.88,0,3866.8552


In [31]:
# Join the numeric columns with the one-hot columns side by side and keep the
# result in a variable so it can be used afterwards instead of being only displayed.
df2_encoded = pd.concat([df2, ohe_features], axis=1)
df2_encoded

Unnamed: 0,age,bmi,children,charges,male,yes,northwest,southeast,southwest
0,19,27.900,0,16884.92400,0.0,1.0,0.0,0.0,1.0
1,18,33.770,1,1725.55230,1.0,0.0,0.0,1.0,0.0
2,28,33.000,3,4449.46200,1.0,0.0,0.0,1.0,0.0
3,33,22.705,0,21984.47061,1.0,0.0,1.0,0.0,0.0
4,32,28.880,0,3866.85520,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1.0,0.0,1.0,0.0,0.0
1334,18,31.920,0,2205.98080,0.0,0.0,0.0,0.0,0.0
1335,18,36.850,0,1629.83350,0.0,0.0,0.0,1.0,0.0
1336,21,25.800,0,2007.94500,0.0,0.0,0.0,0.0,1.0


# Ordinal Encoding

In [32]:
from sklearn.preprocessing import OrdinalEncoder

In [33]:
# df3 is a separate copy of the raw data, so 'region' still holds text here.
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [35]:
# Distinct values of the 'region' column, in order of first appearance.
city = df3['region'].unique()
city

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [40]:
# The position of a value in `city` becomes its integer code.
# NOTE(review): `city` is in order of first appearance, not a meaningful
# ranking of the regions - confirm an ordinal code is really wanted here.
ore = OrdinalEncoder(categories=[city])

In [42]:
# Encode 'region'; the double brackets select a one-column DataFrame (2-D
# input), and the result is a 2-D array of codes with one column.
ore_area = ore.fit_transform(df3[['region']])
ore_area

array([[0.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [2.]])

In [43]:
# Check the category order the encoder uses; it matches the list passed in.
ore.categories_

[array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)]

In [44]:
# Overwrite the text 'region' column of df3 with its ordinal codes (floats).
df3[['region']] = ore_area
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,0.0,16884.924
1,18,male,33.77,1,no,1.0,1725.5523
2,28,male,33.0,3,no,1.0,4449.462
3,33,male,22.705,0,no,2.0,21984.47061
4,32,male,28.88,0,no,2.0,3866.8552
