## ML Project - Health Insurance Cost Estimation Project

The project is divided into the following sections: 
- Data understanding and exploration
- Data cleaning
- Data preparation
- Model building and evaluation

# 1. Data Understanding and Exploration


In [None]:
#importing the required libraries
import numpy as np 
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style
plt.style.use('classic')
sns.set(rc={'figure.figsize':(30,10)})

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Reading the dataset
df = pd.read_csv('Data.csv')

In [None]:
# Looking at the first few rows
df.head()

In [None]:
# Shape of the dataset as (rows, columns)
df.shape

In [None]:
df.info()

In [None]:
# Count the null values in every column; reset_index() turns the resulting Series
# into a two-column DataFrame (column name, null count)
df_null_count = df.isna().sum().reset_index()

In [None]:
df_null_count.columns = ['Variable','Null_Count']

In [None]:
df_null_count.head()

# 2. Data cleaning


### Treating Null Values

In [None]:
# Express each column's null count as a percentage of the total row count (2 decimals)
total_rows = df.shape[0]
df_null_count['Null_Percentage'] = (df_null_count['Null_Count'] / total_rows * 100).round(2)

In [None]:
#  displaying null values greater than 75
df_null_count[df_null_count['Null_Percentage']>=75]

In [None]:
#  displaying null values greater than 20 and less than 50
df_null_count[(df_null_count['Null_Percentage']>=20) & (df_null_count['Null_Percentage']<50)]

In [None]:
#  displaying null values less than 20
df_null_count[(df_null_count['Null_Percentage']>0) & (df_null_count['Null_Percentage']<20)]

In [None]:
df.shape

## Reviewing variables by dtype and fill rate to decide which to drop or impute

In [None]:
# Bucket the variables by their share of null values so each group can be inspected separately
null_pct = df_null_count['Null_Percentage']
null_75 = df_null_count[null_pct >= 75]
null_50_75 = df_null_count[(null_pct >= 50) & (null_pct < 75)]
null_20_50 = df_null_count[(null_pct >= 20) & (null_pct < 50)]
null_0_20 = df_null_count[(null_pct > 0) & (null_pct < 20)]

In [None]:
df[null_75['Variable']].info()

In [None]:
df[null_50_75['Variable']].info()

In [None]:
df[null_20_50['Variable']].info()

In [None]:
df[null_0_20['Variable']].info()

In [None]:
df['Year_last_admitted'].unique()

In [None]:
# Replace missing Year_last_admitted values with 0.
# The result is assigned back instead of calling fillna(inplace=True) on df[...]:
# the chained in-place form acts on an intermediate object and does not reliably
# update df (pandas 2.x warns about it; under copy-on-write it has no effect).
df['Year_last_admitted'] = df['Year_last_admitted'].fillna(0)

In [None]:
df['bmi'].unique()

In [None]:
# Replace missing bmi values with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on df[...]:
# the chained in-place form acts on an intermediate object and does not reliably
# update df (pandas 2.x warns about it; under copy-on-write it has no effect).
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

## Final shape of the data after treating null values

In [None]:
df.shape

In [None]:
 df.isna().sum()

In [None]:
df.info()

In [None]:
# Drop applicant_id before modelling (presumably a row identifier - confirm against the data).
# Reassigning with errors='ignore' keeps the cell idempotent: re-running it once the
# column is already gone no longer raises a KeyError.
df = df.drop(columns=['applicant_id'], errors='ignore')

In [None]:
# Separating numerical and categorical columns: names of the int / float columns
num_cols = df.select_dtypes(include=['int', 'float']).columns.tolist()

In [None]:
# Names of the categorical (object dtype) columns
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

In [None]:
# checking length of categorical columns
len(cat_cols)

# 3.Data preparation

## Univariate Analysis


In [None]:
# Map every categorical column name to the number of distinct values it contains
unique_count_dict = {col_name: df[col_name].nunique() for col_name in cat_cols}

In [None]:
unique_count_dict

In [None]:
# checking how many columns are left having uniques values 
len(unique_count_dict)

In [None]:
# checking length of numerical columns
len(num_cols)

In [None]:
print(num_cols)

In [None]:
df[num_cols].describe().T

In [None]:
# One count plot per categorical column to see how its categories are distributed
for cat_col in unique_count_dict:
    plt.figure(figsize=(25, 5))
    sns.countplot(x=cat_col, data=df)
    plt.show()

In [None]:
# Histogram of every numerical column
for num_col in num_cols:
    sns.histplot(x=df[num_col])
    plt.show()

In [None]:
# For each object-dtype column print its name, its number of distinct values and
# the frequency of every value (rarest first)
for col_name in df.columns:
    if df[col_name].dtype != 'object':
        continue
    print(col_name.upper(),': ',df[col_name].nunique())
    print(df[col_name].value_counts().sort_values())
    print('\n')

## Bivariate Analysis & Multivariate Analysis

In [None]:
# Plot the pairwise correlations of the numerical columns as an annotated heatmap
# to check for multicollinearity (values are shown with two decimals)
plt.figure(figsize=(50, 50))
sns.heatmap(df[num_cols].corr(), annot=True, fmt='.2f', cmap="YlOrBr", annot_kws={"size": 40})
plt.show()

In [None]:
df_cor = pd.DataFrame(df[num_cols].corr())

In [None]:
# checking for corelation
df_cor

In [None]:
# resetting the index for the above DataFrame
df_cor = df_cor.reset_index()

In [None]:
df_cor

In [None]:
# For every numerical column, collect the columns that are highly correlated with it:
# 0.6 <= r < 1 or -1 < r <= -0.6. The bounds 1 and -1 are excluded, which skips the
# self-correlation on the diagonal.
# Each comparison is parenthesised on its own: `&` binds tighter than `<=`, so the
# earlier `(x > -1) & x <= -0.6` was parsed as `((x > -1) & x) <= -0.6`.
cor_dict = {}

for i in num_cols:
    strong_positive = (df_cor[i] >= 0.6) & (df_cor[i] < 1)
    strong_negative = (df_cor[i] > -1) & (df_cor[i] <= -0.6)
    a = df_cor[strong_positive | strong_negative]
    cor_dict[i] = list(a['index'])

In [None]:
a

In [None]:
cor_dict

In [None]:
sns.pairplot(df)

In [None]:
df.head()

In [None]:
# Creating dummy columns for all the categorical columns.
# drop_first=True is passed so the first level of each variable gets no dummy column;
# dtype='int' stores the dummies as integers.
# NOTE(review): the cell rebinds df, so re-running it fails once the columns listed in
# cat_cols have already been replaced by their dummies.
df = pd.get_dummies(df, columns=cat_cols,drop_first=True,dtype = 'int')

In [None]:
# Number of columns are changed to 44
df.shape

In [None]:
# Checking first few rows of the DataFrame after creating dummies 
df.head()

In [None]:
# checking for variance in the dataset before scaling so as to choose the best scaling technique
df.cov()

## Scaling - MinMaxScaler

In [None]:
# Applying MinMaxScaler to all the columns of the DataFrame (the values are overwritten in df)
# NOTE(review): the scaler is fitted on the full DataFrame before the train/test split
# performed later in the notebook, so the scaling statistics also come from rows that end
# up in the test set. Fitting on the training rows only would avoid that leakage.
# NOTE(review): the target insurance_cost is still a column of df at this point, so it is
# scaled as well; the RSS / RMSE values reported later are therefore in scaled units.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[df.columns] = scaler.fit_transform(df[df.columns])

In [None]:
df

In [None]:
# Box plot of every numerical column to spot potential outliers
for num_col in num_cols:
    plt.figure(figsize=(10, 10))
    sns.boxplot(data=df[num_col], orient='v')
    plt.show()

In [None]:
# insurance_cost is the target variable, so it is excluded from num_cols before the
# outlier treatment. The list is rebuilt (instead of calling list.remove) so the cell
# is idempotent: running it a second time no longer raises ValueError.
num_cols = [col for col in num_cols if col != 'insurance_cost']

## Removing Outliers

In [None]:
# IQR-based outlier fences: the function only computes the bounds, the caller applies them
def remove_outlier(col, factor=1.5):
    """Return the lower and upper outlier fences of ``col`` based on the inter-quartile range.

    Despite its name the function does not modify ``col``; it only computes the
    bounds ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``.

    Parameters
    ----------
    col : array-like of numbers
        Values whose 25th and 75th percentiles are computed with ``np.percentile``.
    factor : float, default 1.5
        Multiple of the IQR added beyond the quartiles.

    Returns
    -------
    tuple
        ``(lower_range, upper_range)``.
    """
    # np.percentile does not need sorted input, so the former (discarded) sorted(col)
    # call was dead code and has been removed.
    Q1, Q3 = np.percentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (factor * IQR)
    upper_range = Q3 + (factor * IQR)
    return lower_range, upper_range

In [None]:
# Cap every numerical column at its IQR fences: values above the upper fence are set to
# the upper fence, then values below the lower fence are set to the lower fence
for col_name in num_cols:
    lower_fence, upper_fence = remove_outlier(df[col_name])
    values = df[col_name]
    values = np.where(values > upper_fence, upper_fence, values)
    df[col_name] = np.where(values < lower_fence, lower_fence, values)

In [None]:
# Box plots of the numerical columns again, to check them after the capping step
for num_col in num_cols:
    plt.figure(figsize=(10, 10))
    sns.boxplot(data=df[num_col], orient='v')
    plt.show()

# 4.Model building and evaluation 

### Splitting the data into training and testing set

In [None]:
# Target: insurance_cost, kept as a single-column DataFrame
y = df[['insurance_cost']]

# Independent variables: every remaining column
X = df.drop(columns='insurance_cost')

In [None]:
# 70 % of the rows go to the training set and 30 % to the testing set;
# random_state fixes the shuffle so the split is reproducible
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Importing useful libraries for application of various models 
from sklearn import linear_model, metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

## Linear Regression

In [None]:
# Fit a LinearRegression model on the training data; fit() returns the estimator
# itself, which is displayed as the cell output

regression_model = LinearRegression().fit(X_train, y_train)
regression_model

In [None]:
# Print the fitted coefficient of every independent attribute

fitted_coefs = regression_model.coef_[0]   # y is 2-D, so coef_ has one row per target
for col_name, coef_value in zip(X_train.columns, fitted_coefs):
    print("The coefficient for {} is {}".format(col_name, coef_value))

In [None]:
# Let us check the intercept for the model

intercept = regression_model.intercept_[0]

print("The intercept for our model is {}".format(intercept))

In [None]:
X_train.columns

In [None]:
import statsmodels.api as sm

# Add a constant column so the OLS model includes an intercept term
X_train_lm = sm.add_constant(X_train)
# Create a first fitted model; its summary table is printed in the next cell.
# NOTE: `lr` was previously used as the lower-range variable in the outlier-capping
# loop; it is rebound to the fitted OLS results here.
lr = sm.OLS(y_train, X_train_lm).fit()

In [None]:
# Plotting the summary table of the first fitted model 
print(lr.summary())

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Table with the VIF of every column of the design matrix (constant included), highest first
design_matrix = X_train_lm.values
vif = pd.DataFrame()
vif['Features'] = X_train_lm.columns
vif['VIF'] = [
    variance_inflation_factor(design_matrix, col_idx)
    for col_idx in range(X_train_lm.shape[1])
]
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif[0:50]

In [None]:
# Evaluate the fitted LinearRegression model on the training and testing sets:
# R2 score, residual sum of squares (RSS) and root mean squared error (RMSE)
y_pred_train = regression_model.predict(X_train)
y_pred_test = regression_model.predict(X_test)

# Residuals are computed on flattened NumPy arrays so that np.sum yields a plain scalar.
# The earlier `rss[0]` lookup relied on np.sum(DataFrame) returning a Series and on
# positional indexing of a label-indexed Series, both of which are deprecated in pandas.
residuals_train = np.asarray(y_train).ravel() - np.asarray(y_pred_train).ravel()
residuals_test = np.asarray(y_test).ravel() - np.asarray(y_pred_test).ravel()

# metric1 order: R2 train, R2 test, RSS train, RSS test, RMSE train, RMSE test
metric1 = []
r2_train_lr = r2_score(y_train, y_pred_train)       # R2 score for training data
print('R2 Score of Training data: ',round(r2_train_lr,3))
metric1.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)          # R2 score for testing data
print('R2 Score of Testing data: ',round(r2_test_lr,3))
metric1.append(r2_test_lr)

rss1_lr = float(np.sum(np.square(residuals_train)))  # residual sum of squares on the training data
print('RSS1: ',round(rss1_lr,3))
metric1.append(rss1_lr)

rss2_lr = float(np.sum(np.square(residuals_test)))   # residual sum of squares on the testing data
print('RSS2: ',round(rss2_lr,3))
metric1.append(rss2_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)  # mean squared error on the training data
print('RMSE1: ',round(mse_train_lr**0.5,3))               # RMSE is the square root of the MSE
metric1.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)     # mean squared error on the testing data
print('RMSE2: ',round(mse_test_lr**0.5,3))                # RMSE is the square root of the MSE
metric1.append(mse_test_lr**0.5)

# Inferences from Linear Regression:
### 1. R2 Score of Training data:  0.945
### 2. R2 Score of Testing data:  0.945
### 3. RSS1:  46.154
### 4. RSS2:  19.638
### 5. RMSE1:  0.051
### 6. RMSE2:  0.051
### Now we will apply Ridge Regularisation to check whether we can get improved results.

# Ridge Regularisation

In [None]:
# list of alphas to tune - if value too high it will lead to underfitting, if it is too low, 
# it will not handle the overfitting
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}

ridge = Ridge()

# cross validation: 5-fold grid search over the alphas above, scored with the negated
# mean absolute error; return_train_score=True also records the scores on the training folds
# NOTE(review): the search is scored on MAE while the cells below report R2 / RSS / RMSE
folds = 5
model_cv = GridSearchCV(estimator = ridge, 
                        param_grid = params, 
                        scoring= 'neg_mean_absolute_error',  
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)           
model_cv.fit(X_train, y_train) 

In [None]:
# Printing the best hyperparameter alpha
print(model_cv.best_params_)

In [None]:
# Refit Ridge on the training data with the alpha selected by the grid search and print
# the penalised coefficients. The value is read from model_cv.best_params_ instead of
# being hard-coded, so this cell stays in sync with the search result printed above.
alpha = model_cv.best_params_['alpha']
ridge = Ridge(alpha=alpha)

ridge.fit(X_train, y_train)
print(ridge.coef_)

In [None]:
# Evaluate the fitted Ridge model on the training and testing sets:
# R2 score, residual sum of squares (RSS) and root mean squared error (RMSE).
# NOTE: the `_lr`-suffixed names from the Linear Regression cell are reused here and
# now hold the Ridge values; the Linear Regression results are kept in metric1.
y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

# Residuals are computed on flattened NumPy arrays so that np.sum yields a plain scalar.
# The earlier `rss[0]` lookup relied on np.sum(DataFrame) returning a Series and on
# positional indexing of a label-indexed Series, both of which are deprecated in pandas.
residuals_train = np.asarray(y_train).ravel() - np.asarray(y_pred_train).ravel()
residuals_test = np.asarray(y_test).ravel() - np.asarray(y_pred_test).ravel()

# metric2 order: R2 train, R2 test, RSS train, RSS test, RMSE train, RMSE test
metric2 = []
r2_train_lr = r2_score(y_train, y_pred_train)       # R2 score for training data
print('R2 Score of Training data: ',round(r2_train_lr,3))
metric2.append(r2_train_lr)

r2_test_lr = r2_score(y_test, y_pred_test)          # R2 score for testing data
print('R2 Score of Testing data: ',round(r2_test_lr,3))
metric2.append(r2_test_lr)

rss1_lr = float(np.sum(np.square(residuals_train)))  # residual sum of squares on the training data
print('RSS1: ',round(rss1_lr,3))
metric2.append(rss1_lr)

rss2_lr = float(np.sum(np.square(residuals_test)))   # residual sum of squares on the testing data
print('RSS2: ',round(rss2_lr,3))
metric2.append(rss2_lr)

mse_train_lr = mean_squared_error(y_train, y_pred_train)  # mean squared error on the training data
print('RMSE1: ',round(mse_train_lr**0.5,3))               # RMSE is the square root of the MSE
metric2.append(mse_train_lr**0.5)

mse_test_lr = mean_squared_error(y_test, y_pred_test)     # mean squared error on the testing data
print('RMSE2: ',round(mse_test_lr**0.5,3))                # RMSE is the square root of the MSE
metric2.append(mse_test_lr**0.5)

# Inferences from Ridge Regression:
### 1. R2 Score of Training data:  0.945
### 2. R2 Score of Testing data:  0.945
### 3. RSS1:  46.192
### 4. RSS2:  19.638

## Performance of Linear Regression Vs Ridge Regularization


In [None]:
# Side-by-side performance scores of Linear Regression and Ridge Regression
metric_names = ['R2 Score (Train)','R2 Score (Test)','RSS (Train)','RSS (Test)',
                'RMSE (Train)','RMSE (Test)']
Performance_metric = pd.DataFrame({
    'Metric': metric_names,
    'Linear Regression': metric1,
    'Ridge Regression': metric2,
})
Performance_metric

In [1]:
# Print the fitted Linear Regression equation (the model retained in the conclusion):
# the intercept plus one "(coefficient) * feature" term per column of X_train.
# The previous version read df['Features'] and final_df, neither of which is created in
# this notebook (the cell raised NameError), and referred to a Lasso model that is never fitted.
coefficients = np.ravel(regression_model.coef_)
intercept_value = float(np.ravel(regression_model.intercept_)[0])
terms = ['({}) * {}'.format(round(coef, 2), feature)
         for feature, coef in zip(X_train.columns, coefficients)]
print('insurance_cost = ({}) + '.format(round(intercept_value, 2)) + ' + '.join(terms))

# Final Conclusion:
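## Linear and Ridge Regression perform almost identically on our dataset (R2 of about 0.945 on both the training and the testing data), so regularisation brings no improvement. Therefore the simpler Linear Regression model is preferred.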

# End