## Medical Health Insurance Price Prediction

#### In this project the prediction of health insurance price is happening by considering various parameters using both KNN Regressor and Linear Regression. And finding the model with best Mean square error value.

In [28]:
# importing the dataset
import pandas as pd
df = pd.read_csv('insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


#### Data Preprocessing

In [2]:
#checking for null values
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [3]:
df.shape

(1338, 7)

In [4]:
# deleting duplicate values
df=df.drop_duplicates()
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [5]:
df.shape

(1337, 7)

In [6]:
df=df.reset_index()
df

Unnamed: 0,index,age,sex,bmi,children,smoker,region,charges
0,0,19,female,27.900,0,yes,southwest,16884.92400
1,1,18,male,33.770,1,no,southeast,1725.55230
2,2,28,male,33.000,3,no,southeast,4449.46200
3,3,33,male,22.705,0,no,northwest,21984.47061
4,4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...,...
1332,1333,50,male,30.970,3,no,northwest,10600.54830
1333,1334,18,female,31.920,0,no,northeast,2205.98080
1334,1335,18,female,36.850,0,no,southeast,1629.83350
1335,1336,21,female,25.800,0,no,southwest,2007.94500


In [8]:
df.drop('index',axis=1,inplace=True)

In [9]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1332,50,male,30.970,3,no,northwest,10600.54830
1333,18,female,31.920,0,no,northeast,2205.98080
1334,18,female,36.850,0,no,southeast,1629.83350
1335,21,female,25.800,0,no,southwest,2007.94500


In [10]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


In [11]:
df.replace({"sex":{'male':1,'female':0}},inplace=True)

In [12]:
df.replace({'smoker':{'yes':1,'no':0}},inplace=True)

In [13]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,southwest,16884.92400
1,18,1,33.770,1,0,southeast,1725.55230
2,28,1,33.000,3,0,southeast,4449.46200
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.880,0,0,northwest,3866.85520
...,...,...,...,...,...,...,...
1332,50,1,30.970,3,0,northwest,10600.54830
1333,18,0,31.920,0,0,northeast,2205.98080
1334,18,0,36.850,0,0,southeast,1629.83350
1335,21,0,25.800,0,0,southwest,2007.94500


In [14]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [15]:
df.replace({'region':{'southwest':0,'southeast':1,'northwest':2,'northeast':3}},inplace=True)

In [16]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,0,16884.92400
1,18,1,33.770,1,0,1,1725.55230
2,28,1,33.000,3,0,1,4449.46200
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.880,0,0,2,3866.85520
...,...,...,...,...,...,...,...
1332,50,1,30.970,3,0,2,10600.54830
1333,18,0,31.920,0,0,3,2205.98080
1334,18,0,36.850,0,0,1,1629.83350
1335,21,0,25.800,0,0,0,2007.94500


#### Seperating the Input Features and Target Varible

In [17]:
x=df.iloc[:,:6]
x

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,0
1,18,1,33.770,1,0,1
2,28,1,33.000,3,0,1
3,33,1,22.705,0,0,2
4,32,1,28.880,0,0,2
...,...,...,...,...,...,...
1332,50,1,30.970,3,0,2
1333,18,0,31.920,0,0,3
1334,18,0,36.850,0,0,1
1335,21,0,25.800,0,0,0


In [18]:
y=df.iloc[:,6]
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1332    10600.54830
1333     2205.98080
1334     1629.83350
1335     2007.94500
1336    29141.36030
Name: charges, Length: 1337, dtype: float64

#### Train,Test and Split

In [19]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [20]:
x_train.shape

(1069, 6)

In [21]:
x_test.shape

(268, 6)

#### Building the Linear Regression Model

In [22]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train,y_train)

In [23]:
# checking the prediction for x_test values
y_pred = lr.predict(x_test)
y_pred

array([ 5249.62110372,  9614.07164753, 11264.86496896, 26114.25576197,
        6987.31665784,   556.38944467,  1845.76662159, -1254.27920129,
        2032.19968938, 14374.90713867,  9905.02049358, 26842.49597118,
       14220.19680395,  9511.95743082,  5478.85639721, 10016.05309605,
        6042.61801377,  5994.74708413,  4594.96009717, 14758.63893928,
        2640.70820822, 12206.96794651,  2010.71214373,  4288.03168683,
        4159.9052538 ,  8953.50979457,  1509.92912644, 12486.45961561,
        4089.95998433, 29878.91638036,  9062.89716314, 38822.69638551,
        8497.05636686, 12934.52820651, 25298.99893979, 15742.15511659,
       12371.97661804, 30325.7904704 ,  6769.64258837,  3352.18384114,
       27248.57545848,  4074.97780726,  5728.49276568, 39080.31151419,
       28106.58916584, 11707.46948316, 10864.06240669, 10012.49031362,
       13333.94002481,  6897.9915038 , 33623.99421574,  5276.04627729,
       32626.48967628, 33203.75578063, 15669.48050674,  3753.78754753,
      

#### Building the KNN Regressor Model

In [24]:
from sklearn.neighbors import KNeighborsRegressor
kn = KNeighborsRegressor()
kn.fit(x_train,y_train)
y_knpred = kn.predict(x_test)
y_knpred

array([ 4158.36295 ,  8380.74726 , 15756.95928 ,  4804.34254 ,
       11897.79081 ,  9516.793768,  4651.3949  ,  1530.71542 ,
        4929.23435 , 19295.47732 , 14698.64789 ,  8889.58646 ,
       16359.272666, 14357.32241 ,  7887.72026 , 17923.78744 ,
       10934.27645 ,  6975.263422,  6383.50555 , 14246.41273 ,
        1946.3552  , 12597.3602  ,  1602.30208 , 11342.738952,
        1949.43652 , 17275.688206,  8902.97178 , 10383.63446 ,
        6756.40649 , 21822.38019 , 13940.120314, 17409.09356 ,
       11566.8389  , 10660.26324 ,  6814.095078, 21006.21925 ,
       15230.42399 , 18564.302994, 11012.9031  ,  5892.954006,
        8305.72682 ,  9915.44495 , 13507.106216, 19398.9242  ,
       15503.95245 , 12554.072792, 10712.35836 , 13892.72065 ,
       16914.85542 , 10944.844828, 25185.535268,  4175.81353 ,
       20732.336652, 14255.497064, 18778.42271 ,  7627.52106 ,
       12833.113586,  9210.05472 , 16303.92235 ,  9142.96968 ,
       13988.02927 ,  4082.15623 ,  4287.21635 , 20264.

#### Checking the r2_score of both the algorithm to determine the best algorithm

In [25]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.7526726290709554

In [26]:
r2_score(y_test,y_knpred)

0.1779337188239173

#### Dynamic input for the prediction

In [27]:
import numpy as np
input = (34,1,26,2,0,3)
convert_to_array = np.asarray(input)
reshape=convert_to_array.reshape(1,-1)
prediction = lr.predict(reshape)
print(prediction)

[6446.49573188]


