In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [89]:
data = pd.read_csv("insurance.csv")

In [90]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [91]:
data.children.value_counts()

0    574
1    324
2    240
3    157
4     25
5     18
Name: children, dtype: int64

In [92]:
data.smoker.value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [93]:
# data.age.value_counts()

In [94]:
le=preprocessing.LabelEncoder() #to convert smoker col to int

In [95]:
a=le.fit(data['smoker'])
a

LabelEncoder()

In [96]:
data['smoker']=le.transform(data['smoker'])

In [97]:
d=pd.get_dummies(data)
d.head(10)

Unnamed: 0,age,bmi,children,smoker,charges,sex_female,sex_male,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,1,16884.924,1,0,0,0,0,1
1,18,33.77,1,0,1725.5523,0,1,0,0,1,0
2,28,33.0,3,0,4449.462,0,1,0,0,1,0
3,33,22.705,0,0,21984.47061,0,1,0,1,0,0
4,32,28.88,0,0,3866.8552,0,1,0,1,0,0
5,31,25.74,0,0,3756.6216,1,0,0,0,1,0
6,46,33.44,1,0,8240.5896,1,0,0,0,1,0
7,37,27.74,3,0,7281.5056,1,0,0,1,0,0
8,37,29.83,2,0,6406.4107,0,1,1,0,0,0
9,60,25.84,0,0,28923.13692,1,0,0,1,0,0


In [100]:
correlation=d.corr('pearson')
abs(correlation['charges']).sort_values()

region_northeast    0.006349
region_northwest    0.039905
region_southwest    0.043210
sex_male            0.057292
sex_female          0.057292
children            0.067998
region_southeast    0.073982
bmi                 0.198341
age                 0.299008
smoker              0.787251
charges             1.000000
Name: charges, dtype: float64

In [101]:
d=(d-d.min())/(d.max()-d.min())

In [105]:
#Using 4 features--> bmi,children,age,smoker,charges
x1= np.array(d['bmi'])
x2=np.array(d['children'])
x3=np.array(d['age'])
x4=np.array(d['smoker'])
y= np.array(d['charges'])

In [106]:
x1train,x1test,x2train,x2test,x3train,x3test,x4train,x4test,ytrain,ytest= train_test_split(x1,x2,x3,x4,y,test_size=0.2)

In [104]:
len(x1train)

1070

In [72]:
def hypothesis(a,x1,b,x2,c,x3,d,x4,e):
    return a * x1 +b * x2 + c * x3 + d * x4 +e  #hypothesis eq 

In [107]:
def error(a,x1,b,x2,c,x3,d,x4,e,y):
    err=0     #holds the error value
    m=len(x1)#how many data_pts r present
    
    for i in range(m):
        err += np.power((hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i]),2) #err=summation(h-y)2
        
    return (1/(2*m)) *err #loss fn

In [108]:
def step_gradient(a,x1,b,x2,c,x3,d,x4,e,y,learning_rate):
    grad_a=0
    grad_b=0
    grad_c=0
    grad_d=0
    grad_e=0
    m= len(x1)
    
    for i in range(m):
        grad_a += (1/m) * (hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i])*x1[i] #differentaition wrt a
        grad_b += (1/m) * (hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i])*x2[i] #diff wrt b
        grad_c += (1/m) * (hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i])*x3[i] #differentaition wrt c
        grad_d += (1/m) * (hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i])*x4[i]#diff wrt d
        grad_e += (1/m) * (hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i]) #differentaition wrt e
    
    
    a=a-grad_a*learning_rate
    b=b-grad_b*learning_rate
    c=c-grad_c*learning_rate
    d=d-grad_d*learning_rate
    e=e-grad_e*learning_rate

    return a,b,c,d,e

In [109]:
def descend(init_a,x1,init_b,x2,init_c,x3,init_d,x4,init_e,y,learning_rate,iteration):
    a=init_a
    b=init_b
    c=init_c
    d=init_d
    e=init_e

    
    for i in range(iterations): #iterations--> hw many times we step
        err=error(a,x1,b,x2,c,x3,d,x4,e,y)
        
        if i% 1000 == 0:
            print(f"Error: {np.sqrt(err)}, a: {a}, b:{b}, c:{c},d:{d},e:{e}") #print error at every 1000 step
            #sqrt of mean error
            
        a,b,c,d,e =step_gradient(a,x1,b,x2,c,x3,d,x4,e,y,learning_rate)
        
    return a,b,c,d,e

In [110]:
a=0.06261540682422004
b=0.022698851726899196
c=0.11762137108556271
d=0.2892492514520354
e=0.05547298402158572
learning_rate = 0.001
iterations = 10000

In [111]:
final_a, final_b,final_c,final_d,final_e = descend(a,x1train,b,x2train,c,x3train,d,x4train,e,ytrain,learning_rate,iterations)

Error: 0.07821676127564203, a: 0.06261540682422004, b:0.022698851726899196, c:0.11762137108556271,d:0.2892492514520354,e:0.05547298402158572
Error: 0.07670349123975091, a: 0.06498100009870633, b:0.022761096158672854, c:0.12259095650383169,d:0.3029256075027036,e:0.05148425338429515
Error: 0.07554796202333094, a: 0.06688437720998167, b:0.022617406610303304, c:0.1267753906626323,d:0.31435435626403707,e:0.04660063922542786
Error: 0.0746425387737905, a: 0.06873168432096423, b:0.0225001187620588, c:0.13066886360888536,d:0.3240671391372678,e:0.04181920181574559
Error: 0.07392694918869024, a: 0.07060414882045811, b:0.02244998833479123, c:0.13437432524458834,d:0.3323623460176208,e:0.03733370791151513
Error: 0.07335646596819825, a: 0.0725111916418748, b:0.02246624118840251, c:0.1379096809592504,d:0.33945917969648237,e:0.03316061321320281
Error: 0.07289726038331193, a: 0.07444728075622699, b:0.022540160861631266, c:0.14127619980707232,d:0.34553636679770416,e:0.029279658004268
Error: 0.07252376535

In [112]:
final_a,final_b,final_c,final_d,final_e

(0.08233365028769875,
 0.02324135817527334,
 0.1530643490561815,
 0.36233603164221334,
 0.016187101700729234)

In [113]:
ytest[:3]

array([0.18784446, 0.1151298 , 0.06022293])

In [114]:
hypothesis(final_a,x1test[:3],final_b,x2test[:3],final_c,x3test[:3],final_d,x4test[:3],final_e)

array([0.06753211, 0.15272416, 0.09469732])

In [115]:
np.sqrt(error(final_a,x1test,final_b,x2test,final_c,x3test,final_d,x4test,final_e,ytest))#error for test values

0.0648276812792441