In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, so deprecation or convergence warnings raised by later cells
# (if any) will not be shown. Consider narrowing the filter - TODO confirm.
warnings.filterwarnings('ignore')

In [4]:
# Load the insurance workbook into a DataFrame and preview its first five rows.
df = pd.read_excel(io='insurance.xlsx')
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [5]:
# Clean and encode the training frame in one non-mutating chain.
#
# Why not `df['col'].replace(..., inplace=True)`: that is chained assignment
# through an inplace method. With pandas Copy-on-Write the replace runs on a
# temporary object and `df` keeps its string columns, which then breaks the
# model fit. Building a new frame and re-binding `df` avoids that.
df = (
    df.drop_duplicates()
      # errors='ignore' keeps the cell re-runnable once 'region' is already gone.
      .drop(columns='region', errors='ignore')
      # Nested dict form: {column: {old_value: new_value}}. Values that are
      # already 0/1 (cell re-run) are left untouched.
      .replace({
          'sex': {'female': 0, 'male': 1},
          'smoker': {'no': 0, 'yes': 1},
      })
      # Make the encoded columns explicitly integer; this also fails loudly if
      # a category outside the mappings above is still present.
      .astype({'sex': 'int64', 'smoker': 'int64'})
)

# Feature matrix (everything except the target) and target vector.
X = df.drop(columns='expenses')
y = df['expenses']

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

# Exhaustive search over the penalty strength (alpha = 1..99) and the L1/L2
# mixing ratio, scored by negative MSE with 5-fold cross-validation.
estimator = ElasticNet()
param_grid = {
    'alpha': list(range(1, 100)),
    'l1_ratio': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}

model_hp = GridSearchCV(
    estimator,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
model_hp.fit(X_train, y_train)

# Displayed as the cell output: the best-scoring parameter combination.
model_hp.best_params_

{'alpha': 45, 'l1_ratio': 1}

In [10]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score

# Build the final model from the parameters selected by the grid search
# (`model_hp`) instead of re-typing them by hand, so this cell cannot drift
# out of sync with the search when the data or the grid changes.
enr_best = ElasticNet(**model_hp.best_params_)
enr_best.fit(X_train, y_train)

print('Intercept :', enr_best.intercept_)
print('Coefficent :', enr_best.coef_)

# Predictions are kept under their own names for use after this cell.
ypred_train = enr_best.predict(X_train)
ypred_test = enr_best.predict(X_test)

# Report RMSE and R2 for each split with a single code path.
for split_name, y_true, y_hat in (
    ('Train', y_train, ypred_train),
    ('Test', y_test, ypred_test),
):
    print(f'{split_name} RMSE :', np.sqrt(mean_squared_error(y_true, y_hat)))
    print(f'{split_name} R2 :', r2_score(y_true, y_hat))

# Mean cross-validated score on the training split, using cross_val_score's
# default folds and the estimator's default scorer.
print('CV :', cross_val_score(enr_best, X_train, y_train).mean())

Intercept : -12084.243492696774
Coefficent : [  264.42706339    -0.           317.29607054   382.62392865
 23710.44414936]
Train RMSE : 6045.7371630459675
Train R2 : 0.7592721687983452
Test RMSE : 6118.0062176954325
Test R2 : 0.7008373661584879
CV : 0.753720520261007


In [11]:
# One raw, un-encoded customer record used to demo a single prediction.
input_data = dict(
    age=31,
    sex='female',
    bmi=25.74,
    children=0,
    smoker='no',
    region='northeast',
)

In [12]:
# Turn the raw record into the same encoded layout the model was trained on.
# 'region' was not used as a training feature, so it is dropped here as well.
df_test = pd.DataFrame(input_data, index=[0]).drop(columns=['region'])

# The category codes must match the training encoding exactly. The key is
# 'male' (lower case): a 'Male' key would never match a lower-case input and
# would let the raw string through to the model.
df_test = df_test.assign(
    sex=df_test['sex'].map({'female': 0, 'male': 1}),
    smoker=df_test['smoker'].map({'no': 0, 'yes': 1}),
)

# `.map` yields NaN for anything outside the mapping; fail here with a clear
# message instead of a conversion error deep inside `predict`.
unmapped = [col for col in ('sex', 'smoker') if df_test[col].isna().any()]
if unmapped:
    raise ValueError(
        f"Unrecognised category in column(s) {unmapped}; expected "
        "sex in {'female', 'male'} and smoker in {'no', 'yes'}"
    )

df_test = df_test.astype({'sex': 'int64', 'smoker': 'int64'})

transformed_data = df_test

In [13]:
# Predict the expense for the single encoded record; as the cell's last
# expression, the resulting one-element array is displayed as the output.
enr_best.predict(transformed_data)

array([4280.19632825])