# Medical Cost Analysis in Machine Learning
### Content:
- Exploring Dataset
- Data Preprocessing
- Label Encoding
- Scaling
- Train Test Split
- Hyperparameter Tuning

### Importing Libraries

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score,r2_score,mean_squared_error,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

### Importing Dataset

In [2]:
df = pd.read_csv('medical_cost.csv')

In [3]:
df.head()

Unnamed: 0,Id,age,sex,bmi,children,smoker,region,charges
0,1,19,female,27.9,0,yes,southwest,16884.924
1,2,18,male,33.77,1,no,southeast,1725.5523
2,3,28,male,33.0,3,no,southeast,4449.462
3,4,33,male,22.705,0,no,northwest,21984.47061
4,5,32,male,28.88,0,no,northwest,3866.8552


In [4]:
rows,cols = df.shape

In [5]:
print(f'There are {rows} rows and {cols} columns in dataset')

There are 1338 rows and 8 columns in dataset


### Data Preprocessing
Drop Unnecessary Columns, fill NULL valuesr.g

In [6]:
print(f'There are {df.duplicated().sum()} duplicate values')

There are 0 duplicate values


In [7]:
df.isna().sum()

Id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [8]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1338.0,669.5,386.391641,1.0,335.25,669.5,1003.75,1338.0
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Id        1338 non-null   int64  
 1   age       1338 non-null   int64  
 2   sex       1338 non-null   object 
 3   bmi       1338 non-null   float64
 4   children  1338 non-null   int64  
 5   smoker    1338 non-null   object 
 6   region    1338 non-null   object 
 7   charges   1338 non-null   float64
dtypes: float64(2), int64(3), object(3)
memory usage: 83.8+ KB


In [10]:
# Remove id column
df.drop('Id',axis=1,inplace = True)

In [11]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Label Encoding
Categorical variables (sex, smoker, region) are encoded into numerical values using LabelEncoder.

In [12]:
encoder = LabelEncoder()
catcol = ['sex','smoker','region']
for col in catcol:
    df[col] = encoder.fit_transform(df[col])

In [13]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [14]:
scaler = MinMaxScaler()
df['bmi'] = scaler.fit_transform(df[['bmi']])

### Scaling
The bmi column is scaled using MinMaxScaler.

In [15]:
scaler = MinMaxScaler()
df['bmi'] = scaler.fit_transform(df[['bmi']])

In [16]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,0.321227,0,1,3,16884.924
1,18,1,0.47915,1,0,2,1725.5523
2,28,1,0.458434,3,0,2,4449.462
3,33,1,0.181464,0,0,1,21984.47061
4,32,1,0.347592,0,0,1,3866.8552


### Train Test Split
Features and Target: The features (x) and target (y) are separated.  The data is split into training and testing sets with an 80-20 ratio

In [17]:
x = df.drop('charges',axis=1)
y = df['charges']

In [18]:
x_train,x_test,y_train,y_test = train_test_split(x,y,shuffle = True, test_size = 0.2,random_state = 42)

### Hyperparameter Tuning

 Model Evaluation Function:
 A function to evaluate model performance using Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and R-squared (R2).

In [19]:
def evaluate_model(true,predicted):
    mse = mean_squared_error(true, predicted)
    mae = mean_absolute_error(true,predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true,predicted)
    return mae,rmse,r2_square

Model Training and Evaluation
1. Models - A dictionary of various regression models is created.
2. Training and Prediction - Each model is trained on the training data and predictions are made for both training and testing sets.
3. Evaluation - The performance of each model is evaluated using the custom evaluate_model function and printed.

In [20]:
# model dictionary

models = {
            "Linear Regression" : LinearRegression(),
            "Lasso" : Lasso(),
            "Ridge" : Ridge(),
            "k-Neighbors Regression" : KNeighborsRegressor(),
            "Decision Tree" : DecisionTreeRegressor(),
            "Random Forest Regressor" : RandomForestRegressor(),
            "AdaBoost Regressor" :AdaBoostRegressor(),
            "XGBRegressor" : XGBRegressor()
        }

In [21]:
model_name_list=[]
model_train_list=[]
model_test_list = []

for i in range(len(models)):
            model = list(models.values())[i]
            model.fit(x_train,y_train)

            #Make prediction:
            y_train_pred = model.predict(x_train)
            y_test_pred = model.predict(x_test)

            #Evaluate Train and Test dataset :

            model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
            model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
            model_name = list(models.keys())[i]
           
            print("-------------*",model_name,"*----------------")
            print()
            print("Model Performance for Training set :")

            print('Root Mean Squared Error :',model_train_rmse)
            print("Mean Absolute Error : ", model_train_mae)
            print("R2 Score : ", model_train_r2)

            print("----------------------------------------------------")
            
            print("Model Performance for Testing set :")
            
            print('Root Mean Squared Error : ', model_test_rmse)
            print('Mean Absolute Error :  ',model_test_mae)
            print('R2 Score : ', model_test_r2)
            
            
            model_name_list.append(model_name)
            model_train_list.append(model_train_r2)
            model_test_list.append(model_test_r2)
            print()

-------------* Linear Regression *----------------

Model Performance for Training set :
Root Mean Squared Error : 6105.789320191615
Mean Absolute Error :  4208.762029186573
R2 Score :  0.7417049283233981
----------------------------------------------------
Model Performance for Testing set :
Root Mean Squared Error :  5799.587091438356
Mean Absolute Error :   4186.508898366434
R2 Score :  0.7833463107364538

-------------* Lasso *----------------

Model Performance for Training set :
Root Mean Squared Error : 6105.793762284715
Mean Absolute Error :  4207.631557468903
R2 Score :  0.7417045524928172
----------------------------------------------------
Model Performance for Testing set :
Root Mean Squared Error :  5799.8390726922535
Mean Absolute Error :   4185.2722051333
R2 Score :  0.783327483929167

-------------* Ridge *----------------

Model Performance for Training set :
Root Mean Squared Error : 6106.440930593858
Mean Absolute Error :  4201.709455315661
R2 Score :  0.741649794833

### Accuracy Dataframe

In [22]:
data = {
    "model_name":model_name_list,
    "training_accuracy":model_train_list,
    "testing_accuracy":model_test_list
}
acc_df = pd.DataFrame(data)

In [23]:
acc_df

Unnamed: 0,model_name,training_accuracy,testing_accuracy
0,Linear Regression,0.741705,0.783346
1,Lasso,0.741705,0.783327
2,Ridge,0.74165,0.783142
3,k-Neighbors Regression,0.581928,0.456701
4,Decision Tree,0.998308,0.713375
5,Random Forest Regressor,0.975345,0.868225
6,AdaBoost Regressor,0.827529,0.826075
7,XGBRegressor,0.994139,0.850168


### RandomizedSearchCV
1. Parameters: Specifies the hyperparameters to tune for the RandomForestRegressor.2. 
RandomizedSearchCV: Searches for the best hyperparameters using cross-validation
3. 
Best Parameters: Prints the best parameters found for the model.

In [24]:
rf_param = {
    "max_depth":[5,8,15,None,10],
    "max_features":[5,7,'auto',8],
    "min_samples_split":[2,8,15,20],
    "n_estimators":[100,200,500,1000]
}
rf_param

{'max_depth': [5, 8, 15, None, 10],
 'max_features': [5, 7, 'auto', 8],
 'min_samples_split': [2, 8, 15, 20],
 'n_estimators': [100, 200, 500, 1000]}

In [25]:
randomcv_models = [
    ("Random Forest Regressor",RandomForestRegressor(),rf_param)
]

In [26]:
model_param ={}
for name ,model,param in randomcv_models:
    random = RandomizedSearchCV(estimator = model,
                                param_distributions = param,
                                n_iter = 100,cv=3,
                                verbose=2,
                                n_jobs=-1)
    random.fit(x_train,y_train)
    model_param[name]=random.best_params_

for model_name in model_param:
    print(f"---------Best param for {model_name}-----------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------Best param for Random Forest Regressor-----------
{'n_estimators': 200, 'min_samples_split': 8, 'max_features': 5, 'max_depth': 5}


### Retraining with Best Parameters
1. Model Initialization: Initializes the RandomForestRegressor with the best parameters found.
2. Retraining: Retrains the model on the training data.
3. Evaluation: Evaluates the new model on the test data.

In [27]:
rf= RandomForestRegressor(n_estimators= 200, min_samples_split=20, max_features= 5, max_depth= 5)

In [28]:
rf.fit(x_train,y_train)

In [29]:
preds = rf.predict(x_test)

### New Accuracy

In [30]:
r2_score(y_test,preds)

0.8763316058797435

### Saving the model

In [31]:
joblib.dump(rf, 'model.joblib')
print("Model Saved!!!")

Model Saved!!!


### Conclusion

Before Hyperparameter Tuning : 0.8673794573303627
After Hyperparameter Tuning : 0.8780109988609167