In [180]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [181]:
# Build the path to the insurance CSV under ./data of the current working directory.
DATA_DIR = os.path.join(os.getcwd(), 'data')
DATA_PATH = os.path.join(DATA_DIR, 'insurance.csv')

# Read the dataset and count the missing entries in every column.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.isnull().sum()

age         0
sex         3
bmi         1
children    0
smoker      2
region      4
charges     0
dtype: int64

### Fill Null Values

In [182]:
from sklearn.impute import SimpleImputer

In [183]:
# Two imputers that both treat NaN as "missing": one fills with the column
# mean, the other with the most frequent value.
mean_imputer, mode_imputer = (
    SimpleImputer(missing_values=np.nan, strategy=fill_strategy)
    for fill_strategy in ('mean', 'most_frequent')
)

In [184]:
# Preview the first five rows of the frame as loaded (gaps not yet filled).
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [185]:
# Show the column labels of the loaded frame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [186]:
# Fill missing 'bmi' entries using the mean-strategy imputer.
bmi_as_column = df['bmi'].values.reshape(-1, 1)  # single-column 2-D array
df['bmi'] = mean_imputer.fit_transform(bmi_as_column)

In [187]:
# Fill missing 'sex' entries with the most frequent value of that column.
sex_as_column = df.loc[:, 'sex'].values.reshape(-1, 1)  # single-column 2-D array
df.loc[:, 'sex'] = mode_imputer.fit_transform(sex_as_column)

In [188]:
# Fill missing 'smoker' entries with the most frequent value of that column.
smoker_as_column = df.loc[:, 'smoker'].values.reshape(-1, 1)  # single-column 2-D array
df.loc[:, 'smoker'] = mode_imputer.fit_transform(smoker_as_column)

In [189]:
# Fill missing 'region' entries with the most frequent value of that column.
region_as_column = df.loc[:, 'region'].values.reshape(-1, 1)  # single-column 2-D array
df.loc[:, 'region'] = mode_imputer.fit_transform(region_as_column)

In [190]:
# Re-count missing values per column to confirm the imputation left none.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [191]:
# Preview the first five rows after the missing values have been filled.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,southeast,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Encode Data

In [192]:
from sklearn.preprocessing import LabelEncoder

In [193]:
# Label encoder for turning categorical string columns into integer codes.
lenc = LabelEncoder()

In [194]:
# Show the column labels before choosing which columns to encode.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [195]:
# Integer-encode the 'sex' and 'smoker' columns.
# A separate encoder is fitted for each column and stored in `label_encoders`,
# so every column's label-to-code mapping remains available afterwards
# (e.g. for inverse_transform). Reusing a single encoder would overwrite the
# mapping learned for 'sex' as soon as it is refitted on 'smoker'.
label_encoders = {}
for column in ('sex', 'smoker'):
    lenc = LabelEncoder()
    df[column] = lenc.fit_transform(df[column])
    label_encoders[column] = lenc
# `lenc` is left fitted on 'smoker', the last column processed.

In [196]:
# Preview the first five rows after 'sex' and 'smoker' were label-encoded.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,southeast,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [197]:
# One-hot encode whatever categorical columns remain, dropping the first
# level of each and storing the indicators as integers.
df = pd.get_dummies(data=df, dtype='i', drop_first=True)

In [198]:
# Preview the first five rows of the fully numeric, one-hot encoded frame.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,1
1,18,1,33.77,1,0,1725.5523,0,1,0
2,28,1,33.0,3,0,4449.462,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0
4,32,1,28.88,0,0,3866.8552,1,0,0


## Scale Data

In [199]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [224]:
# Min-max scaler; the (0, 1) output range is the library default, spelled out here.
scl = MinMaxScaler(feature_range=(0, 1))

In [210]:
# Separate the feature matrix from the regression target ('charges').
x = df.drop(columns=['charges'])
y_true = df['charges']

# Display the feature matrix.
x

Unnamed: 0,age,sex,bmi,children,smoker,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,0,0,1
1,18,1,33.770,1,0,0,1,0
2,28,1,33.000,3,0,0,1,0
3,33,1,22.705,0,0,0,1,0
4,32,1,28.880,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,0,0
1334,18,0,31.920,0,0,0,0,0
1335,18,0,36.850,0,0,0,1,0
1336,21,0,25.800,0,0,0,0,1


In [221]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [212]:
# Linear regression model with default settings; it is refitted in each of the experiments below.
lr = LinearRegression()

In [222]:
# Baseline: fit the linear model on the unscaled features.
# NOTE: the R^2 below is computed on the same rows the model was fitted on.
y_predict = lr.fit(x, y_true).predict(x)
baseline_r2 = r2_score(y_true, y_predict)
print('CC , r2 score', baseline_r2)

CC , r2 score 0.7509113510328609


In [225]:
# Rescale the features with `scl`, then refit and rescore the same model.
# In the recorded run the score is identical to the unscaled fit.
x_scale = scl.fit_transform(x)

y_predict = lr.fit(x_scale, y_true).predict(x_scale)
scaled_r2 = r2_score(y_true, y_predict)
print('CC , r2 score', scaled_r2)

CC , r2 score 0.7509113510328609


## Using Polynomial Features to Enhance Results

In [226]:
from sklearn.preprocessing import PolynomialFeatures

In [227]:
# Polynomial feature expander; degree 2 is the library default, spelled out here.
poly = PolynomialFeatures(degree=2)

In [230]:
# Expand the scaled features with `poly`, then refit and rescore the model
# on the expanded matrix (still scored on the rows used for fitting).
x_poly = poly.fit_transform(x_scale)

y_predict = lr.fit(x_poly, y_true).predict(x_poly)
poly_r2 = r2_score(y_true, y_predict)
print('CC , r2 score', poly_r2)

CC , r2 score 0.8481281567670187


### Scale x values again

In [231]:
# Rebuild the polynomial features from the scaled inputs, then rescale the
# expanded matrix before fitting.
# Note: this refits `scl` on the polynomial features, replacing the fit it
# previously had on `x`.
x_poly = poly.fit_transform(x_scale)
x_scal = scl.fit_transform(x_poly)

y_predict = lr.fit(x_scal, y_true).predict(x_scal)
rescaled_poly_r2 = r2_score(y_true, y_predict)
print('CC , r2 score', rescaled_poly_r2)

CC , r2 score 0.8446189962474462
