In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv('insurance.csv') # load the dataset

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df1 = df.copy()
df2 = df.copy()
df3 = df.copy()
df4 = df.copy()
df5 = df.copy()


In [5]:
df.isnull().sum() # looking for null value

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
df.shape

(1338, 7)

# Label Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le = LabelEncoder()

In [9]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [10]:
df['sex'] = le.fit_transform(df['sex']) # manually one column encoding using LabelEncoder

In [11]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [12]:
from pandas.core.dtypes.common import is_numeric_dtype

In [13]:
# using for loop to encode all the necessary column
for x in df.columns:
    if is_numeric_dtype(df[x]):
        continue
    else:
        df[x] = le.fit_transform(df[x])
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# One Hot Encoding

In [14]:
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [15]:
dummy = pd.get_dummies(df1['sex'], drop_first=True) # manually encoding one column 

In [16]:
dummy.head()

Unnamed: 0,male
0,0
1,1
2,1
3,1
4,1


In [17]:
df1 = df1.drop('sex',axis=1)

In [18]:
df1 = pd.concat([df1,dummy],axis=1)

In [19]:
df1.head()

Unnamed: 0,age,bmi,children,smoker,region,charges,male
0,19,27.9,0,yes,southwest,16884.924,0
1,18,33.77,1,no,southeast,1725.5523,1
2,28,33.0,3,no,southeast,4449.462,1
3,33,22.705,0,no,northwest,21984.47061,1
4,32,28.88,0,no,northwest,3866.8552,1


In [20]:
# using for loop to encode all the necessary columns
for x in df1.columns:
    if is_numeric_dtype(df1[x]):
        continue
    else:
        one_encode = pd.get_dummies(df1[x], drop_first=True, prefix = x)
        df1.drop(df1[[x]],axis=1, inplace = True)
        df1 = pd.concat([df1,one_encode],axis=1)

df1.head()

Unnamed: 0,age,bmi,children,charges,male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


# Ordinal Encoder

In [21]:
from sklearn.preprocessing import OrdinalEncoder

In [22]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [23]:
df2.nunique()

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [24]:
df2.sex.unique()

array(['female', 'male'], dtype=object)

In [25]:
df2.smoker.unique()

array(['yes', 'no'], dtype=object)

In [26]:
df2.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [27]:
sex = ['female', 'male']

In [28]:
ordinal_encoder = OrdinalEncoder(categories= [sex])

In [29]:
encode = ordinal_encoder.fit_transform(df2[['sex']]) # manually encode 1 column

In [30]:
encode

array([[0.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [31]:
new_encode = pd.DataFrame(encode, columns=['sex']) # converting array into Pandas DataFrame

In [32]:
new_encode.head()

Unnamed: 0,sex
0,0.0
1,1.0
2,1.0
3,1.0
4,1.0


In [33]:
df2.sex = new_encode

In [34]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,yes,southwest,16884.924
1,18,1.0,33.77,1,no,southeast,1725.5523
2,28,1.0,33.0,3,no,southeast,4449.462
3,33,1.0,22.705,0,no,northwest,21984.47061
4,32,1.0,28.88,0,no,northwest,3866.8552


In [35]:
dtest = df2.smoker.unique().tolist()

In [36]:
dtest

['yes', 'no']

In [37]:
# using for to encode all the necessary clumns
for x in df2.columns:
    if is_numeric_dtype(df2[x]):
        continue
    else:
        un = df2[x].unique().tolist()
        oe = OrdinalEncoder(categories=[un])
        df2[x] = oe.fit_transform(df2[[x]])
df2.head()
        
        

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,0.0,0.0,16884.924
1,18,1.0,33.77,1,1.0,1.0,1725.5523
2,28,1.0,33.0,3,1.0,1.0,4449.462
3,33,1.0,22.705,0,1.0,2.0,21984.47061
4,32,1.0,28.88,0,1.0,2.0,3866.8552


# Traditional Encoding (Replace)

In [38]:
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [39]:
uniq = df3.sex.unique()

In [40]:
uniq

array(['female', 'male'], dtype=object)

In [41]:
df3.sex = df.sex.replace(['female', 'male'],[1,2]) # manually encode single column
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [42]:
for x in df4.columns:
    if is_numeric_dtype(df4[x]):
        continue
    else:
        uni = df4[x].unique().tolist()
        if df4[x].nunique() == 2:
             df4[x] = df4[x].replace(uni,[1,2])
        elif df4[x].nunique() == 3:
            df4[x] = df4[x].replace(uni,[1,2,3])
        elif df4[x].nunique() == 4:
            df4[x] = df4[x].replace(uni,[1,2,3,4])
df4.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,1,16884.924
1,18,2,33.77,1,2,2,1725.5523
2,28,2,33.0,3,2,2,4449.462
3,33,2,22.705,0,2,3,21984.47061
4,32,2,28.88,0,2,3,3866.8552
