In [1]:
import pandas as pd

# load data from CSV
df = pd.read_csv('insurance.csv')

In [2]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Measures of Central Tendency

In [3]:
# mean value of a column
df.bmi.mean()

30.663396860986538

In [4]:
# median value of a column
df.bmi.median()

30.4

In [5]:
# mode value of a column
df.bmi.mode()

0    32.3
dtype: float64

In [6]:
# data correlation
df.corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [7]:
# counts of unique values
df['sex'].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [8]:
# descriptive statistics
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


### Handling NaN values (If have)

In [9]:
# checking null values
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [10]:
# filling NaN data with mean value
df.bmi = df.bmi.fillna(df.bmi.mean())

# Encoding

## Label Encoder

In [11]:
df2 = df.copy()
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [12]:
# unique values
df2.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df2.sex = le.fit_transform(df2['sex'])
df2.smoker = le.fit_transform(df2['smoker'])
df2.region = le.fit_transform(df2['region'])

In [14]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


### using loop

In [15]:
df3 = df.copy()
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [16]:
#dataframe columns
columns = df3.columns
columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [17]:
#all column in a single shot using numpy
#cautious: all unnecessary numerical data like 'age' will be transformed also
import numpy as np

for column in columns:
    if df3[column].dtype == np.number:
        continue
    df3[column] = le.fit_transform(df3[column])

In [18]:
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,1,0,27.9,0,1,3,16884.924
1,0,1,33.77,1,0,2,1725.5523
2,10,1,33.0,3,0,2,4449.462
3,15,1,22.705,0,0,1,21984.47061
4,14,1,28.88,0,0,1,3866.8552


In [19]:
#Bypass warning messages

#import warnings
#warnings.filterwarnings('ignore')

In [20]:
df4 = df.copy()
df4.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [21]:
#all column in a single shot using pandas
from pandas.core.dtypes.common import is_numeric_dtype
for column in columns:
    if is_numeric_dtype(df4[column]):
        continue
    df4[column] = le.fit_transform(df4[column])

In [22]:
df4.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


### One hot

In [23]:
df5 = df.copy()
df5.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [24]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop = 'first')

sex = pd.DataFrame(ohe.fit_transform(df5[['sex']]).toarray(),columns=['male'])
smoker = pd.DataFrame(ohe.fit_transform(df5[['smoker']]).toarray(),columns=['smoker'])
region = pd.DataFrame(ohe.fit_transform(df5[['region']]).toarray(),columns=['northwest','southeast','southwest'])

df5.drop(['sex','smoker','region'],axis=1, inplace=True)

df5.join([sex,smoker,region]).head()

Unnamed: 0,age,bmi,children,charges,male,smoker,northwest,southeast,southwest
0,19,27.9,0,16884.924,0.0,1.0,0.0,0.0,1.0
1,18,33.77,1,1725.5523,1.0,0.0,0.0,1.0,0.0
2,28,33.0,3,4449.462,1.0,0.0,0.0,1.0,0.0
3,33,22.705,0,21984.47061,1.0,0.0,1.0,0.0,0.0
4,32,28.88,0,3866.8552,1.0,0.0,1.0,0.0,0.0


### using dummy table

In [45]:
df6 = df.copy()
df6.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [48]:
pd.get_dummies(df6['sex'])

Unnamed: 0,female,male
0,1,0
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
1333,0,1
1334,1,0
1335,1,0
1336,1,0


In [49]:
#dummy tables for columns
dummy = pd.get_dummies(df6,columns=['sex','smoker','region'],drop_first=True)

#dropping columns
df6 = df6.drop(columns=['sex','smoker','region'],axis=1)

#concating the dummy tables with rest dataframe
df6 = pd.concat([df6,dummy],axis=1)

In [50]:
df6.head()

Unnamed: 0,age,bmi,children,charges,age.1,bmi.1,children.1,charges.1,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,32,28.88,0,3866.8552,1,0,1,0,0


### using loop

In [53]:
df7 = df.copy()
df7.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [54]:
df7.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [55]:
from pandas.core.dtypes.common import is_string_dtype

for column in columns:
#     df7[column].dtype.kind in 'biufc'
    if is_string_dtype(df7[column]):
        dummy = pd.get_dummies(df7[column],drop_first=True)
        df7 = df7.drop(column,axis=1)
        df7 = pd.concat([df7,dummy],axis=1)

In [56]:
df7.head()

Unnamed: 0,age,bmi,children,charges,male,yes,northwest,southeast,southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [57]:
x = df7.drop(columns=['charges'],axis=1)
x.head()

Unnamed: 0,age,bmi,children,male,yes,northwest,southeast,southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.88,0,1,0,1,0,0


In [35]:
y = df7.charges
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

### Replace 

In [36]:
df8 = df.copy()
df8.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [37]:
#replace data with corresponding identical value
df8['sex'] = df8.sex.replace(['female','male'],[0,1])
df8['smoker'] = df8.smoker.replace(['no','yes'],[0,1])
df8['region'] = df8.region.replace(['southwest','southeast','northwest','northeast'],[3,2,1,4])

In [38]:
df8.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


### using loop

In [39]:
df9 = df.copy()
df9.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [40]:
for column in columns:
    if is_string_dtype(df9[column]):
        unique = df9[column].unique()
        df9[column] = df9[column].replace(unique,list(range(len(unique))))

In [41]:
df9.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,0,0,16884.924
1,18,1,33.77,1,1,1,1725.5523
2,28,1,33.0,3,1,1,4449.462
3,33,1,22.705,0,1,2,21984.47061
4,32,1,28.88,0,1,2,3866.8552


### Ordinal

In [42]:
df10 = df.copy()
df10.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [43]:
from sklearn.preprocessing import OrdinalEncoder

#custom ordered
#OrdinalEncoder(categories=[['male','female'],['yes','no'],['northeast', 'northwest', 'southeast', 'southwest']])
oe = OrdinalEncoder()

df10[['sex','smoker','region']] = oe.fit_transform(df10[['sex','smoker','region']])
df10.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,1.0,3.0,16884.924
1,18,1.0,33.77,1,0.0,2.0,1725.5523
2,28,1.0,33.0,3,0.0,2.0,4449.462
3,33,1.0,22.705,0,0.0,1.0,21984.47061
4,32,1.0,28.88,0,0.0,1.0,3866.8552


In [44]:
# categorical data type
oe.categories_

[array(['female', 'male'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]