In [1]:
#import library
import pandas as pd
from sklearn.metrics import r2_score

In [2]:
# read csv file
dataset=pd.read_csv("insurance_pre.csv")

In [3]:
# view dataset
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [4]:
# one hot encoding in nominal data
dataset = pd.get_dummies(dataset, columns=["sex", "smoker"],drop_first=True)
dataset = dataset.astype(int)
dataset.head()


Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27,0,16884,0,1
1,18,33,1,1725,1,0
2,28,33,3,4449,1,0
3,33,22,0,21984,1,0
4,32,28,0,3866,1,0


In [5]:
# view header of the dataset
dataset.columns.to_list()


['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes']

In [6]:
# get indepenedent and dependent dataset from the dataset
independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
dependent = dataset[['charges']]
independent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27,0,0,1
1,18,33,1,1,0
2,28,33,3,1,0
3,33,22,0,1,0
4,32,28,0,1,0


In [10]:
# Train and Test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, train_size=0.8, random_state=0)


In [11]:
# Multi Linear Regression
from sklearn.linear_model import LinearRegression
mlr_model = LinearRegression()
mlr_model.fit(X_train, y_train)
predict = mlr_model.predict(X_test)
score = r2_score(y_test, predict)
score

0.7974504524086945

In [13]:
# SVM 
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_ = sc.fit_transform(X_train)
X_test_ = sc.transform(X_test)

svm_model=SVR( kernel='rbf', 
        degree=3, 
        gamma='scale', 
        C=5000, 
        epsilon=0.001, 
        max_iter=-1, 
        coef0=0.0, 
        tol=0.001, 
        shrinking=True, 
        cache_size=200
    )
svm_model.fit(X_train_,y_train)
predict = svm_model.predict(X_test_)
score = r2_score(y_test, predict)
score


  y = column_or_1d(y, warn=True)


0.881633285044482

In [30]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
decisiontree_model=DecisionTreeRegressor( criterion='absolute_error',
        splitter='best',
        max_depth=None,
        min_samples_split=5,
        min_samples_leaf=1,
        max_features=None,
        max_leaf_nodes=20,
        min_impurity_decrease=0,
        random_state=0)
decisiontree_model.fit(X_train,y_train)
predict = decisiontree_model.predict(X_test)
score = r2_score(y_test, predict)
score



0.8893369867925229

In [18]:
# Random Forest 
from sklearn.ensemble import RandomForestRegressor
randforest_model = RandomForestRegressor(
        n_estimators=50,
        criterion='absolute_error',
        max_depth=None,
        min_samples_split=5,
        min_samples_leaf=1,
        max_features='sqrt',
        max_leaf_nodes=None,
        min_impurity_decrease=0.01,
        random_state=42,
        n_jobs=-1  # Use all available CPU cores
    )      
randforest_model.fit(X_train, y_train)
predict = randforest_model.predict(X_test)
score = r2_score(y_test, predict)
score


  return fit_method(estimator, *args, **kwargs)


0.8986790557051285

In [19]:
# ---------Model Name ------------- R2 score ----
# Multiple Linear Regression - 0.797450452408694
# Support Vector Regression  - 0.881633285044482
# Decision Tree              - 0.8893369867925229
# Random Forest Regression   - 0.898679055705128


In [20]:
# save best model
import pickle 

pickle.dump(randforest_model, open('best_model.sav', 'wb'))


In [28]:
# Deployment phase

load_model = pickle.load(open('best_model.sav', 'rb'))
pred = load_model.predict([[50,	33,	1,1,	0]])
pred



array([10119.67])