<a href="https://colab.research.google.com/github/nallagondu/datatrained_inter_public/blob/main/Medical_Cost_Personal_Insurance_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Medical Cost Personal Insurance Project**

# Project Description

Health insurance is a type of insurance that covers medical expenses that arise due to an illness. These expenses could be related to hospitalisation costs, the cost of medicines or doctor consultation fees. The main purpose of medical insurance is to receive the best medical care without any strain on your finances. Health insurance plans offer protection against high medical costs. They cover hospitalisation expenses, day care procedures, domiciliary expenses, and ambulance charges, besides many others. The medical insurance charge is calculated from input features such as age, BMI, number of dependents, smoking status and region.


**Columns**

- age: age of the primary beneficiary
- sex: gender of the insurance contractor (female, male)
- bmi: body mass index, an objective index of body weight relative to height (weight divided by height squared, kg / m ^ 2); it indicates whether body weight is relatively high or low for a given height, ideally 18.5 to 24.9
- children: number of children covered by health insurance / number of dependents
- smoker: whether the beneficiary smokes (yes, no)
- region: the beneficiary's residential area in the US (northeast, southeast, southwest, northwest)
- charges: individual medical costs billed by health insurance


**Dataset Link-**
https://github.com/FlipRoboTechnologies/ML-Datasets/blob/main/Medical%20Cost%20Insurance/medical_cost_insurance.csv


In [None]:
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


%matplotlib inline

In [None]:
# Read the insurance data set directly from the raw GitHub CSV.
url = (
    "https://raw.githubusercontent.com/nallagondu/ML-Datasets/main/"
    "Medical%20Cost%20Insurance/medical_cost_insurance.csv"
)
medical_ins_df = pd.read_csv(filepath_or_buffer=url)
medical_ins_df

In [None]:
# First five rows of the raw data.
medical_ins_df.head(n=5)

In [None]:
# Twenty randomly drawn rows (unseeded, so the selection changes on every run).
medical_ins_df.sample(n=20)

In [None]:
# Column dtypes and non-null counts; used below to check for missing values.
medical_ins_df.info()

**We can observe that there are no missing values.**

In [None]:
# (number of rows, number of columns) of the raw data.
medical_ins_df.shape

In [None]:
# Column labels as a pandas Index.
medical_ins_df.columns

In [None]:
# Same column labels, as a plain Python list.
list(medical_ins_df.columns)

In [None]:
# How many beneficiaries have each number of children, rarest count first.
medical_ins_df.children.value_counts().sort_values(ascending=True)

In [None]:
# Convert the categorical features into integer codes.
# The region codes are contiguous (0-3); the previous mapping skipped 2, which
# put an arbitrary gap between northwest and southeast in every model and plot
# that treats the code as a number.
Categorical_medical_ins_df = {
    'sex': {'male': 1, 'female': 0},
    'smoker': {'no': 0, 'yes': 1},
    'region': {'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3},
}
# replace() returns a new frame, so the raw medical_ins_df is left untouched
# and this cell can be re-run safely.
cp_medical_ins_df = medical_ins_df.replace(Categorical_medical_ins_df)

In [None]:
# Display the encoded copy to check that sex, smoker and region are now numeric.
cp_medical_ins_df

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
cp_medical_ins_df.describe()

- By comparing min, max, and percentiles, we can identify potential outliers.
- BMI has a minimum of 15.96 and a maximum of 53.13, indicating potential outliers at both ends.
- Class imbalance: some of the categorical features (e.g. `smoker`) are unevenly distributed.
- High variation in charges (std = 12110.01).

In [None]:
# Re-check dtypes and non-null counts on the encoded copy.
cp_medical_ins_df.info()

In [None]:
# Visual missing-value check: every cell is coloured by whether it is null.
sns.heatmap(cp_medical_ins_df.isna())

In [None]:
# Pairwise correlation matrix of all (numerically encoded) columns.
corr = cp_medical_ins_df.corr()

plt.figure(figsize=(14, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2g",
    cmap='YlOrRd_r',
    linewidths=0.5,
)
plt.title("Medical Charges ", fontsize=14)
plt.tight_layout()
plt.show()

Smoker, BMI and age are the most important factors in determining charges, while sex, children and region have little effect on the charges.

In [None]:
# Number of beneficiaries per sex, smallest group first.
medical_ins_df.sex.value_counts().sort_values(ascending=True)


In [None]:
# Number of smokers vs non-smokers, smallest group first.
medical_ins_df.smoker.value_counts().sort_values(ascending=True)


In [None]:
# Number of beneficiaries per region, smallest group first.
medical_ins_df.region.value_counts().sort_values(ascending=True)

#

In [None]:
# Bar chart of charges for every age value.
plt.figure(figsize=(12, 9))
plt.title(' Age  and Charges ')
sns.barplot(data=cp_medical_ins_df, x='age', y='charges', palette='Dark2')
plt.show()

In [None]:
# Bar chart of charges for every (integer-encoded) region.
plt.figure(figsize=(12, 9))
# Title spelling fixed ("Cgarges" -> "Charges").
plt.title(' region  and Charges ')
sns.barplot(data=cp_medical_ins_df, x='region', y='charges', palette='Dark2')
plt.show()

In [None]:
# Scatter of charges against BMI, points coloured by the encoded sex column.
plt.figure(figsize=(12, 9))
plt.title('BMI VS Charges ')
sns.scatterplot(
    data=cp_medical_ins_df,
    x='bmi',
    y='charges',
    hue='sex',
    palette='Dark2',
)
plt.show()

In [None]:
# Line plot of charges against BMI.
plt.figure(figsize=(12, 9))
plt.title('BMI VS Charges ')
sns.lineplot(data=cp_medical_ins_df, x='bmi', y='charges', color='blue')
plt.show()

In [None]:
# Smoker vs charges, with one bar per sex inside each smoker group.
plt.figure(figsize=(12, 9))
plt.title('Smoker VS Charges ')
sns.barplot(
    data=cp_medical_ins_df,
    x='smoker',
    y='charges',
    hue='sex',
    color='blue',
)
plt.show()

In [None]:
# Sex vs charges: bar chart of charges for each encoded sex value.

plt.figure(figsize= (12,9))
plt.title('Sex  VS Charges ')
sns.barplot(x='sex',y='charges',data =cp_medical_ins_df,palette='Set1' )
plt.show()

In [None]:
# Skewness of each attribute (computed column-wise).
cp_medical_ins_df.skew(axis=0)


In [None]:
# Feature names only: every column label except the target, "charges".
columns = list(cp_medical_ins_df.columns)
columns.remove("charges")

In [None]:
# Skewness of each feature as a one-column table.
feature_skew = cp_medical_ins_df[columns].skew()
skew_df = feature_skew.to_frame(name="skewness")
skew_df

In [None]:
# Age distribution: histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True) is its replacement and is
# what the other distribution plots in this notebook use. It plots counts,
# so the 'Count' y-label is now accurate.
plt.figure(figsize=(12, 9))
sns.histplot(cp_medical_ins_df['age'], kde=True)
plt.title("AGE Plot")
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

In [None]:
# BMI distribution: histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True) is its replacement and is
# what the other distribution plots in this notebook use. It plots counts,
# so the 'Count' y-label is now accurate.
plt.figure(figsize=(12, 9))
sns.histplot(cp_medical_ins_df['bmi'], kde=True)
plt.title("BMI Plot")
plt.xlabel('BMI')
plt.ylabel('Count')
plt.show()

In [None]:
# Distribution of the target (charges): histogram with a KDE overlay.
plt.figure(figsize=(12, 9))
sns.histplot(data=cp_medical_ins_df['charges'], kde=True)
plt.title("Medical Insurance Charges ", fontsize=20)
plt.show()

In [None]:
# Box plot of the target (charges), used to eyeball outliers.
plt.figure(figsize=(12, 9))
sns.boxplot(cp_medical_ins_df.charges)
plt.title("Medical Insurance Charges ", fontsize=20)
plt.show()

In [None]:
# Age distribution using histplot (histogram with a KDE overlay)
plt.figure(figsize=(12,9))
sns.histplot(cp_medical_ins_df['age'],kde=True)
plt.title("Medical Insurance Age ", fontsize=20)
#plt.xlabel('Charges')
#plt.ylabel('Count')
plt.show()

In [None]:
# Last five rows of the encoded data.
cp_medical_ins_df.tail(n=5)

# There are outliers in charges
## We can scale the BMI and charges columns before prediction

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Work on a copy so the encoded frame stays available unscaled.
DataPreprocess_medical_ins_df = cp_medical_ins_df.copy()

# Standardise each continuous column independently (zero mean, unit variance).
# StandardScaler expects a 2-D array, hence the reshape to a single column.
Bmi_temp = DataPreprocess_medical_ins_df['bmi'].values.reshape(-1, 1)
DataPreprocess_medical_ins_df['bmi'] = StandardScaler().fit_transform(Bmi_temp).ravel()

# Age is scaled exactly once (the previous version repeated this step).
Age = DataPreprocess_medical_ins_df['age'].values.reshape(-1, 1)
DataPreprocess_medical_ins_df['age'] = StandardScaler().fit_transform(Age).ravel()

# The scaled charges must be written back to 'charges'. The previous version
# assigned them to 'age', which destroyed the age feature and put a copy of
# the target inside the feature matrix (target leakage).
# NOTE(review): the scalers are fitted on the full data set before the
# train/test split; fit on the training rows only if strict isolation matters.
Charges_t = DataPreprocess_medical_ins_df['charges'].values.reshape(-1, 1)
DataPreprocess_medical_ins_df['charges'] = StandardScaler().fit_transform(Charges_t).ravel()

DataPreprocess_medical_ins_df.head()

In [None]:
# Feature matrix: every column except the target.
X = DataPreprocess_medical_ins_df.drop(columns='charges').to_numpy()
# Target as an (n_samples, 1) column vector.
y = DataPreprocess_medical_ins_df['charges'].to_numpy().reshape(-1, 1)



In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.20, random_state=42
)
for label, part in (
    ("Xtrain Size", X_train),
    ("X_test", X_test),
    ("Y train size", y_train),
    ("Y_test Size ", y_test),
):
    print(label, part.shape)



**Model Building**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import xgboost as xgb

from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV


In [None]:
# Fit an ordinary least-squares baseline and score it on the held-out rows.
LR = LinearRegression().fit(X_train, y_train)
LR_y_pred = LR.predict(X_test)

print("r2_score: ", r2_score(y_true=y_test, y_pred=LR_y_pred))
print(mean_squared_error(y_true=y_test, y_pred=LR_y_pred))

In [None]:
# 10-fold cross-validation of the linear model on the full data set.
cv_LR = cross_val_score(LR, X, y, cv=10)

# Predictions on both partitions of the hold-out split.
y_pred_LR_train = LR.predict(X_train)
y_pred_LR_test = LR.predict(X_test)

# Train vs test R^2 shows how well the fit generalises; RMSE is on the test rows.
r2_score_LR_train = r2_score(y_train, y_pred_LR_train)
r2_score_LR_test = r2_score(y_test, y_pred_LR_test)
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_LR_test))

print(f'CV Linear Regression : {cv_LR.mean():.3f}')
print(f'R2_score (train) : {r2_score_LR_train:.3f}')
print(f'R2_score (test) : {r2_score_LR_test:.3f}')
print(f'RMSE : {rmse_linear:.3f}')

# Support Vector Machine Regression

In [None]:
# Baseline SVR with default hyper-parameters.
# SVR expects a 1-D target, so the (n, 1) column vector is flattened.
svr = SVR()
svr = svr.fit(X_train, y_train.ravel())
svr_y_pred = svr.predict(X_test)

# Scaled inputs for the grid search below. Each scaler is fitted on the
# training split only and then applied to the test split. Fitting a separate
# scaler on the test data (as before) puts train and test on different scales
# and lets test-set statistics influence the evaluation.
x_scaler = StandardScaler().fit(X_train)
X_train_scaled = x_scaler.transform(X_train)
X_test_scaled = x_scaler.transform(X_test)

y_scaler = StandardScaler().fit(y_train)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

print("r2_score: ", r2_score(y_test, svr_y_pred))
print(mean_squared_error(y_test, svr_y_pred))




In [None]:
# Hyper-parameter grid explored by the SVR grid search.
parameters = dict(
    kernel=['rbf', 'sigmoid'],
    gamma=[0.001, 0.01, 0.1, 1, 'scale'],
    tol=[0.0001],
    C=[0.001, 0.01, 0.1, 1, 10, 100],
)


In [None]:
# Exhaustive 10-fold search over `parameters`, using all available cores.
svr_grid = GridSearchCV(svr, parameters, cv=10, n_jobs=-1, verbose=4)
svr_grid.fit(X=X_train_scaled, y=np.ravel(y_train_scaled))

In [None]:
# Build the final SVR from whatever the grid search actually selected, rather
# than hard-coding values that may not match best_params_ (the kernel, for
# example, was previously left at its default regardless of the search result).
svr = SVR(**svr_grid.best_params_)
# SVR expects a 1-D target, so the scaled column vector is flattened.
svr.fit(X_train_scaled, y_train_scaled.ravel())
print("Best Parameters", svr_grid.best_params_)
print("Best Score" , svr_grid.best_score_)

In [None]:
# Cross-validated score of the best grid-search candidate.
svr_cv = svr_grid.best_score_

# Predictions of the final SVR on the scaled train and test partitions.
y_pred_svr_train = svr.predict(X_train_scaled)
y_pred_svr_test = svr.predict(X_test_scaled)

# All metrics below are computed against the scaled targets.
r2_score_svr_train = r2_score(y_train_scaled, y_pred_svr_train)
r2_score_svr_test = r2_score(y_test_scaled, y_pred_svr_test)
rmse_svr = np.sqrt(mean_squared_error(y_test_scaled, y_pred_svr_test))

print(f'CV : {svr_cv.mean():.3f}')
print(f'R2_score (train) : {r2_score_svr_train:.3f}')
print(f'R2 score (test) : {r2_score_svr_test:.3f}')
print(f'RMSE : {rmse_svr:.3f}')

**Random Forest Regression**

In [None]:
# Random forest baseline. A fixed random_state (the same seed as the
# train/test split) makes the score reproducible across runs.
RFReg = RandomForestRegressor(random_state=42)
# The forest expects a 1-D target, so the column vector is flattened.
RFReg = RFReg.fit(X_train, y_train.ravel())

RFreg_pred = RFReg.predict(X_test)

print("R2 Score: ", r2_score(y_test,RFreg_pred))