In [1]:
import pandas as pd
df = pd.read_csv('insurance.csv')

In [2]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df.corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [4]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [5]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Handle Nan values

In [6]:
df.isnull().head

<bound method NDFrame.head of         age    sex    bmi  children  smoker  region  charges
0     False  False  False     False   False   False    False
1     False  False  False     False   False   False    False
2     False  False  False     False   False   False    False
3     False  False  False     False   False   False    False
4     False  False  False     False   False   False    False
...     ...    ...    ...       ...     ...     ...      ...
1333  False  False  False     False   False   False    False
1334  False  False  False     False   False   False    False
1335  False  False  False     False   False   False    False
1336  False  False  False     False   False   False    False
1337  False  False  False     False   False   False    False

[1338 rows x 7 columns]>

In [7]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Encoding in ML

In [8]:
x = df.columns

# Label Encoder

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
le = LabelEncoder()

In [11]:
df.region = le.fit_transform(df['region'])

In [12]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,3,16884.924
1,18,male,33.77,1,no,2,1725.5523
2,28,male,33.0,3,no,2,4449.462
3,33,male,22.705,0,no,1,21984.47061
4,32,male,28.88,0,no,1,3866.8552


# Label Encoder using Loop

In [13]:
import pandas as pd
df = pd.read_csv('insurance.csv')

In [14]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [15]:
from pandas.core.dtypes.common import is_numeric_dtype

In [16]:
for column in df.columns:
    if is_numeric_dtype(df[column]):
        continue
    else:
        df[column] = le.fit_transform(df[column])

In [17]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# One Hot Encoding

In [18]:
import pandas as pd
df = pd.read_csv('insurance.csv')

In [19]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [20]:
dummy = pd.get_dummies(df['region'])

In [21]:
dummy.head()

Unnamed: 0,northeast,northwest,southeast,southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0


In [22]:
dummy = pd.get_dummies(df['region'], drop_first=True)

In [23]:
dummy.head()

Unnamed: 0,northwest,southeast,southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0


In [24]:
df = df.drop('region', axis=1)

In [25]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [26]:
df2 = pd.concat([df, dummy], axis=1)

In [27]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,female,27.9,0,yes,16884.924,0,0,1
1,18,male,33.77,1,no,1725.5523,0,1,0
2,28,male,33.0,3,no,4449.462,0,1,0
3,33,male,22.705,0,no,21984.47061,1,0,0
4,32,male,28.88,0,no,3866.8552,1,0,0


In [28]:
x = df2.drop('charges', axis=1)

In [29]:
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,northwest,southeast,southwest
0,19,female,27.9,0,yes,0,0,1
1,18,male,33.77,1,no,0,1,0
2,28,male,33.0,3,no,0,1,0
3,33,male,22.705,0,no,1,0,0
4,32,male,28.88,0,no,1,0,0


In [30]:
y = df2['charges']

In [31]:
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [32]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [33]:
columns = ['region']

for col in columns:
    one_hot = pd.get_dummies(df[col])
    df = pd.concat((df, one_hot), axis=1)

In [34]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast,southwest
0,19,female,27.9,0,yes,southwest,16884.924,0,0,0,1
1,18,male,33.77,1,no,southeast,1725.5523,0,0,1,0
2,28,male,33.0,3,no,southeast,4449.462,0,0,1,0
3,33,male,22.705,0,no,northwest,21984.47061,0,1,0,0
4,32,male,28.88,0,no,northwest,3866.8552,0,1,0,0


In [35]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [36]:
columns = ['region']

for col in columns:
    one_hot = pd.get_dummies(df[col], drop_first=True)
    df = pd.concat((df, one_hot), axis=1).drop(col, axis=1)

In [37]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,female,27.9,0,yes,16884.924,0,0,1
1,18,male,33.77,1,no,1725.5523,0,1,0
2,28,male,33.0,3,no,4449.462,0,1,0
3,33,male,22.705,0,no,21984.47061,1,0,0
4,32,male,28.88,0,no,3866.8552,1,0,0


# Replace Function

In [38]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [39]:
df.sex = df.sex.replace(['male','female'], [3,5])

In [40]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,5,27.9,0,yes,southwest,16884.924
1,18,3,33.77,1,no,southeast,1725.5523
2,28,3,33.0,3,no,southeast,4449.462
3,33,3,22.705,0,no,northwest,21984.47061
4,32,3,28.88,0,no,northwest,3866.8552


In [41]:
df.smoker = df.smoker.replace(['yes','no'], [0,1])

In [42]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,5,27.9,0,0,southwest,16884.924
1,18,3,33.77,1,1,southeast,1725.5523
2,28,3,33.0,3,1,southeast,4449.462
3,33,3,22.705,0,1,northwest,21984.47061
4,32,3,28.88,0,1,northwest,3866.8552


In [43]:
df.region = df.region.replace(['southwest','southeast', 'northwest'], [2,5, 6])

In [44]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,5,27.9,0,0,2,16884.924
1,18,3,33.77,1,1,5,1725.5523
2,28,3,33.0,3,1,5,4449.462
3,33,3,22.705,0,1,6,21984.47061
4,32,3,28.88,0,1,6,3866.8552


# Ordinal Encoder

In [45]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [46]:
df.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [47]:
city = ['southwest', 'southeast', 'northwest', 'northeast']

In [48]:
from sklearn.preprocessing import OrdinalEncoder

In [49]:
ordinal = OrdinalEncoder(categories=[city])

In [50]:
encoded = ordinal.fit_transform(df[['region']])

In [51]:
#encoded
newdata_frame = pd.DataFrame(encoded, columns=['Newregion'])

In [52]:
newdata_frame.head()

Unnamed: 0,Newregion
0,0.0
1,1.0
2,1.0
3,2.0
4,2.0


In [53]:
df.region = newdata_frame.Newregion

In [54]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,0.0,16884.924
1,18,male,33.77,1,no,1.0,1725.5523
2,28,male,33.0,3,no,1.0,4449.462
3,33,male,22.705,0,no,2.0,21984.47061
4,32,male,28.88,0,no,2.0,3866.8552


# Ordinal using loop

In [55]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [56]:
cols=['region']

for col in cols:
    unq = df[col].unique()
    df[col] =  OrdinalEncoder(categories=[unq]).fit_transform(df[[col]])

In [57]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,0.0,16884.924
1,18,male,33.77,1,no,1.0,1725.5523
2,28,male,33.0,3,no,1.0,4449.462
3,33,male,22.705,0,no,2.0,21984.47061
4,32,male,28.88,0,no,2.0,3866.8552


# BinaryEncoder

In [58]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [59]:
from sklearn.category_encoders import BinaryEncoder
encoder = BinaryEncoder(cols =['region'])
newdata = encoder.fit_transform(df['region'])
df = pd.concat([df, newdata], axis = 1)
df = df.drop(['region'], axis = 1)
df.head() #Please solve this problem

ModuleNotFoundError: No module named 'sklearn.category_encoders'

# Hashing Encoder

In [None]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

In [60]:
from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features = 3, input_type ='string')
hashed_Feature = h.fit_transform(df['region'])
hashed_Feature = hashed_Feature.toarray()
df = pd.concat([df, pd.DataFrame(hashed_Feature)], axis = 1)
df.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,0,1,2
0,19,female,27.9,0,yes,southwest,16884.924,-4.0,2.0,-1.0
1,18,male,33.77,1,no,southeast,1725.5523,-4.0,3.0,0.0
2,28,male,33.0,3,no,southeast,4449.462,-4.0,3.0,0.0
3,33,male,22.705,0,no,northwest,21984.47061,-4.0,2.0,-1.0
4,32,male,28.88,0,no,northwest,3866.8552,-4.0,2.0,-1.0


# Mean target encoding

In [None]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

In [61]:
df.insert(5, "Target", [0, 1, 1, 0, 0,], True)
# importing TargetEncoder
from category_encoders import TargetEncoder
Targetenc = TargetEncoder()
# transforming the column after fitting
values = Targetenc.fit_transform(X = df.sex, y = df.Target)
# concating values with dataframe
df = pd.concat([df, values], axis = 1)
df.head()


ValueError: Length of values (5) does not match length of index (1338)