In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_boston

import pickle
import json

import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("medical_insurance.csv")
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
df["sex"].replace({'male': 1, 'female': 0},inplace=True)
df["smoker"].replace({'no': 0,'yes': 1},inplace=True)
df = pd.get_dummies(df, columns=['region'])
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,16884.92400,0,0,0,1
1,18,1,33.770,1,0,1725.55230,0,0,1,0
2,28,1,33.000,3,0,4449.46200,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,0,1,0,0
1334,18,0,31.920,0,0,2205.98080,1,0,0,0
1335,18,0,36.850,0,0,1629.83350,0,0,1,0
1336,21,0,25.800,0,0,2007.94500,0,0,0,1


In [51]:
sex_value={'male': 1, 'female': 0}
smoker_value={'no': 0,'yes': 1}

In [52]:
df1=df[["age","bmi","children"]]
df2=df[["sex","smoker","region_northeast","region_northwest","region_southeast","region_southwest","charges"]]
df1

Unnamed: 0,age,bmi,children
0,19,27.900,0
1,18,33.770,1
2,28,33.000,3
3,33,22.705,0
4,32,28.880,0
...,...,...,...
1333,50,30.970,3
1334,18,31.920,0
1335,18,36.850,0
1336,21,25.800,0


In [6]:
df2

Unnamed: 0,sex,smoker,region_northeast,region_northwest,region_southeast,region_southwest,charges
0,0,1,0,0,0,1,16884.92400
1,1,0,0,0,1,0,1725.55230
2,1,0,0,0,1,0,4449.46200
3,1,0,0,1,0,0,21984.47061
4,1,0,0,1,0,0,3866.85520
...,...,...,...,...,...,...,...
1333,1,0,0,1,0,0,10600.54830
1334,0,0,1,0,0,0,2205.98080
1335,0,0,0,0,1,0,1629.83350
1336,0,0,0,0,0,1,2007.94500


## Standardization

In [12]:
std_scalar=StandardScaler()
std_scalar

In [13]:
array=std_scalar.fit_transform(df1)
df1_std=pd.DataFrame(array,columns=df1.columns)
df1_std

Unnamed: 0,age,bmi,children
0,-1.438764,-0.453320,-0.908614
1,-1.509965,0.509621,-0.078767
2,-0.797954,0.383307,1.580926
3,-0.441948,-1.305531,-0.908614
4,-0.513149,-0.292556,-0.908614
...,...,...,...
1333,0.768473,0.050297,1.580926
1334,-1.509965,0.206139,-0.908614
1335,-1.509965,1.014878,-0.908614
1336,-1.296362,-0.797813,-0.908614


In [18]:
df1_std=pd.concat([df1_std,df2],axis=True)
df1_std

Unnamed: 0,age,bmi,children,sex,smoker,region_northeast,region_northwest,region_southeast,region_southwest,charges,...,region_southeast.1,region_southwest.1,charges.1,sex.1,smoker.1,region_northeast.1,region_northwest.1,region_southeast.2,region_southwest.2,charges.2
0,-1.438764,-0.453320,-0.908614,0,1,0,0,0,1,16884.92400,...,0,1,16884.92400,0,1,0,0,0,1,16884.92400
1,-1.509965,0.509621,-0.078767,1,0,0,0,1,0,1725.55230,...,1,0,1725.55230,1,0,0,0,1,0,1725.55230
2,-0.797954,0.383307,1.580926,1,0,0,0,1,0,4449.46200,...,1,0,4449.46200,1,0,0,0,1,0,4449.46200
3,-0.441948,-1.305531,-0.908614,1,0,0,1,0,0,21984.47061,...,0,0,21984.47061,1,0,0,1,0,0,21984.47061
4,-0.513149,-0.292556,-0.908614,1,0,0,1,0,0,3866.85520,...,0,0,3866.85520,1,0,0,1,0,0,3866.85520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,0.768473,0.050297,1.580926,1,0,0,1,0,0,10600.54830,...,0,0,10600.54830,1,0,0,1,0,0,10600.54830
1334,-1.509965,0.206139,-0.908614,0,0,1,0,0,0,2205.98080,...,0,0,2205.98080,0,0,1,0,0,0,2205.98080
1335,-1.509965,1.014878,-0.908614,0,0,0,0,1,0,1629.83350,...,1,0,1629.83350,0,0,0,0,1,0,1629.83350
1336,-1.296362,-0.797813,-0.908614,0,0,0,0,0,1,2007.94500,...,0,1,2007.94500,0,0,0,0,0,1,2007.94500


## Noramalization

In [24]:
normal_scalar=MinMaxScaler()
array=normal_scalar.fit_transform(df1)

In [25]:
df1_norm=pd.DataFrame(array,columns=df1.columns)
df1_norm

Unnamed: 0,age,bmi,children
0,0.021739,0.321227,0.0
1,0.000000,0.479150,0.2
2,0.217391,0.458434,0.6
3,0.326087,0.181464,0.0
4,0.304348,0.347592,0.0
...,...,...,...
1333,0.695652,0.403820,0.6
1334,0.000000,0.429379,0.0
1335,0.000000,0.562012,0.0
1336,0.065217,0.264730,0.0


In [26]:
df1_norm=pd.concat([df1_norm,df2],axis=True)
df1_norm

Unnamed: 0,age,bmi,children,sex,smoker,region_northeast,region_northwest,region_southeast,region_southwest,charges
0,0.021739,0.321227,0.0,0,1,0,0,0,1,16884.92400
1,0.000000,0.479150,0.2,1,0,0,0,1,0,1725.55230
2,0.217391,0.458434,0.6,1,0,0,0,1,0,4449.46200
3,0.326087,0.181464,0.0,1,0,0,1,0,0,21984.47061
4,0.304348,0.347592,0.0,1,0,0,1,0,0,3866.85520
...,...,...,...,...,...,...,...,...,...,...
1333,0.695652,0.403820,0.6,1,0,0,1,0,0,10600.54830
1334,0.000000,0.429379,0.0,0,0,1,0,0,0,2205.98080
1335,0.000000,0.562012,0.0,0,0,0,0,1,0,1629.83350
1336,0.065217,0.264730,0.0,0,0,0,0,0,1,2007.94500


In [27]:
df1_std

Unnamed: 0,age,bmi,children,sex,smoker,region_northeast,region_northwest,region_southeast,region_southwest,charges,...,region_southeast.1,region_southwest.1,charges.1,sex.1,smoker.1,region_northeast.1,region_northwest.1,region_southeast.2,region_southwest.2,charges.2
0,-1.438764,-0.453320,-0.908614,0,1,0,0,0,1,16884.92400,...,0,1,16884.92400,0,1,0,0,0,1,16884.92400
1,-1.509965,0.509621,-0.078767,1,0,0,0,1,0,1725.55230,...,1,0,1725.55230,1,0,0,0,1,0,1725.55230
2,-0.797954,0.383307,1.580926,1,0,0,0,1,0,4449.46200,...,1,0,4449.46200,1,0,0,0,1,0,4449.46200
3,-0.441948,-1.305531,-0.908614,1,0,0,1,0,0,21984.47061,...,0,0,21984.47061,1,0,0,1,0,0,21984.47061
4,-0.513149,-0.292556,-0.908614,1,0,0,1,0,0,3866.85520,...,0,0,3866.85520,1,0,0,1,0,0,3866.85520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,0.768473,0.050297,1.580926,1,0,0,1,0,0,10600.54830,...,0,0,10600.54830,1,0,0,1,0,0,10600.54830
1334,-1.509965,0.206139,-0.908614,0,0,1,0,0,0,2205.98080,...,0,0,2205.98080,0,0,1,0,0,0,2205.98080
1335,-1.509965,1.014878,-0.908614,0,0,0,0,1,0,1629.83350,...,1,0,1629.83350,0,0,0,0,1,0,1629.83350
1336,-1.296362,-0.797813,-0.908614,0,0,0,0,0,1,2007.94500,...,0,1,2007.94500,0,0,0,0,0,1,2007.94500


##  By Stanaradization

In [28]:
x1=df1_std.drop("charges",axis=1)
y1=df1_std["charges"]

In [29]:
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.30, random_state=21)

In [30]:
knn_regressor=KNeighborsRegressor()              #  By Default n_neighbour = 5, P= 2
knn_regressor.fit(x1_train, y1_train)

In [31]:
# Testing Data Evaluation
KNN_test_accuracy=[]
y1_pred = knn_regressor.predict(x1_test)

mse = mean_squared_error(y1_test, y1_pred)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y1_test, y1_pred)
print("MAE :",mae)

r2_value = r2_score(y1_test, y1_pred)*100
KNN_test_accuracy.append(r2_value)
# Simple_test_accuracy_array=np.array(Simple_test_accuracy)
print("R-Score :",r2_value)

MSE : 35478552.50962667
RMSE : 5956.387538569554
MAE : 3693.040123161693
R-Score : 74.07743345666461


In [32]:
# Training Data Evaluation
KNN_train_accuracy=[]
y1_pred_train = knn_regressor.predict(x1_train)

mse = mean_squared_error(y1_train, y1_pred_train)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y1_train, y1_pred_train)
print("MAE :",mae)

r2_value = r2_score(y1_train, y1_pred_train)*100
KNN_train_accuracy.append(r2_value)
print("R-Score :",r2_value)

MSE : 23060162.67840336
RMSE : 4802.099819704226
MAE : 2881.583252846153
R-Score : 84.67491896282084


## By NOrmalization

In [33]:
x2=df1_norm.drop("charges",axis=1)
y2=df1_norm["charges"]

In [39]:
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2,  test_size=0.30, random_state=21)

In [40]:
knn_regressor=KNeighborsRegressor()              #  By Default n_neighbour = 5, P= 2
knn_regressor.fit(x2_train, y2_train)

In [42]:
# Testing Data Evaluation
KNN_test_accuracy=[]
y2_pred = knn_regressor.predict(x2_test)

mse = mean_squared_error(y2_test, y2_pred)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y2_test, y2_pred)
print("MAE :",mae)

r2_value = r2_score(y2_test, y2_pred)*100
KNN_test_accuracy.append(r2_value)
# Simple_test_accuracy_array=np.array(Simple_test_accuracy)
print("R-Score :",r2_value)

MSE : 35481947.54047678
RMSE : 5956.672522514292
MAE : 3632.160206654229
R-Score : 74.07495286185724


In [43]:
# Training Data Evaluation
KNN_train_accuracy=[]
y2_pred_train = knn_regressor.predict(x2_train)

mse = mean_squared_error(y2_train, y2_pred_train)
print("MSE :", mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y2_train, y2_pred_train)
print("MAE :",mae)

r2_value = r2_score(y2_train, y2_pred_train)*100
KNN_train_accuracy.append(r2_value)
print("R-Score :",r2_value)

MSE : 23262036.73125206
RMSE : 4823.073369880668
MAE : 2897.5264050811966
R-Score : 84.54075962221447


## Testing On user input

In [47]:
x2

Unnamed: 0,age,bmi,children,sex,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,0.021739,0.321227,0.0,0,1,0,0,0,1
1,0.000000,0.479150,0.2,1,0,0,0,1,0
2,0.217391,0.458434,0.6,1,0,0,0,1,0
3,0.326087,0.181464,0.0,1,0,0,1,0,0
4,0.304348,0.347592,0.0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,0.695652,0.403820,0.6,1,0,0,1,0,0
1334,0.000000,0.429379,0.0,0,0,1,0,0,0
1335,0.000000,0.562012,0.0,0,0,0,0,1,0
1336,0.065217,0.264730,0.0,0,0,0,0,0,1


In [48]:
column_names = x2.columns
len(column_names)

9

In [53]:
project_data = {"sex":sex_value, "smoker":smoker_value, "columns":list(column_names)}
project_data

{'sex': {'male': 1, 'female': 0},
 'smoker': {'no': 0, 'yes': 1},
 'columns': ['age',
  'bmi',
  'children',
  'sex',
  'smoker',
  'region_northeast',
  'region_northwest',
  'region_southeast',
  'region_southwest']}

In [54]:
with open("project_data.json", "w") as f:
    json.dump(project_data, f)

In [55]:
with open("KNN_model.pkl", "wb") as f:
    pickle.dump(knn_regressor, f)

In [56]:
with open("Normal.pkl", "wb") as f:
    pickle.dump(normal_scalar, f)

In [57]:
x2.head(1). T

Unnamed: 0,0
age,0.021739
bmi,0.321227
children,0.0
sex,0.0
smoker,1.0
region_northeast,0.0
region_northwest,0.0
region_southeast,0.0
region_southwest,1.0


In [58]:
age = 67
sex = "male"
bmi = 27.9
children = 3
smoker = "yes"
region = "southeast"

# charges = ?

In [59]:
region = "region_" + region
region_index = np.where(column_names == region)[0][0]
region_index

7

In [64]:
array = np.zeros(x2.shape[1])
array[0] = age
array[1] = project_data['sex'][sex]
array[2] = bmi
array[3] = children
array[4] = project_data['smoker'][smoker]
array[region_index] = 1

print(array)

[67.   1.  27.9  3.   1.   0.   0.   1.   0. ]


In [65]:
Predicted_charges= knn_regressor.predict([array])[0]
print("Predicted_charges ", Predicted_charges , "/-Rs Only")

Predicted_charges  22689.43104862069 /-Rs Only
