In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [6]:
df.shape

(1338, 7)

In [7]:
df.region.value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [9]:
df.children.value_counts()

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

# Check Null Values

In [10]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [11]:
df.describe(include="all")

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,676,,,1064,364,
mean,39.207025,,30.663397,1.094918,,,13270.422265
std,14.04996,,6.098187,1.205493,,,12110.011237
min,18.0,,15.96,0.0,,,1121.8739
25%,27.0,,26.29625,0.0,,,4740.28715
50%,39.0,,30.4,1.0,,,9382.033
75%,51.0,,34.69375,2.0,,,16639.912515


In [12]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [13]:
df["sex"] = df["sex"].map({"female":0, "male":1})
df["smoker"] = df["smoker"].map({"no":0, "yes":1})

In [14]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [15]:
new_df = pd.get_dummies(df, dtype="int64")

In [16]:
new_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


In [17]:
X = new_df.drop("charges", axis=1)

In [18]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,0,0,0,1
1,18,1,33.77,1,0,0,0,1,0
2,28,1,33.0,3,0,0,0,1,0
3,33,1,22.705,0,0,0,1,0,0
4,32,1,28.88,0,0,0,1,0,0


In [19]:
y = new_df["charges"]
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
X_train.shape, X_test.shape

((1070, 9), (268, 9))

In [22]:
from sklearn.preprocessing import MinMaxScaler

In [23]:
scaler = MinMaxScaler()

In [24]:
scaler.fit(X_train)

In [25]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
X_train[:5]

array([[0.60869565, 0.        , 0.10734463, 0.4       , 0.        ,
        0.        , 1.        , 0.        , 0.        ],
       [0.63043478, 0.        , 0.22491256, 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        ],
       [0.73913043, 0.        , 0.23944041, 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        ],
       [0.45652174, 0.        , 0.49394673, 1.        , 0.        ,
        0.        , 0.        , 1.        , 0.        ],
       [0.7826087 , 0.        , 0.14823783, 0.6       , 0.        ,
        0.        , 1.        , 0.        , 0.        ]])

In [27]:
X_test[:5]

array([[0.58695652, 0.        , 0.24791499, 0.4       , 0.        ,
        1.        , 0.        , 0.        , 0.        ],
       [0.39130435, 0.        , 0.37826204, 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.29391983, 0.        , 1.        ,
        0.        , 1.        , 0.        , 0.        ],
       [0.60869565, 1.        , 0.26324993, 0.6       , 0.        ,
        0.        , 1.        , 0.        , 0.        ],
       [0.02173913, 1.        , 0.42937853, 0.        , 1.        ,
        0.        , 1.        , 0.        , 0.        ]])

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error

**1. Linear Regression**

In [29]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [30]:
y_pred_lr = lr.predict(X_test)

In [31]:
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mae_lr

4181.073709884328

**2. Support Vector Machines**

In [32]:
svm = SVR()
svm.fit(X_train, y_train)

In [33]:
y_pred_svm = svm.predict(X_test)

In [34]:
mae_svm = mean_absolute_error(y_test, y_pred_svm)
mae_svm

8618.358867854111

**3. Random Forest Regressor**

In [35]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [36]:
y_pred_rf = rf.predict(X_test)

In [37]:
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mae_rf

2511.619414374067

**4. Gradient Boosting Regressor**

In [38]:
gr = GradientBoostingRegressor()
gr.fit(X_train, y_train)

In [39]:
y_pred_gr = gr.predict(X_test)

In [40]:
mae_gr = mean_absolute_error(y_test, y_pred_gr)
mae_gr

2407.2740259783

In [41]:
X_train[0]

array([0.60869565, 0.        , 0.10734463, 0.4       , 0.        ,
       0.        , 1.        , 0.        , 0.        ])

In [42]:
data = {"age": 0.60869565,
        "sex": 0.,
        "bmi": 0.10734463,
        "children": 0.4,
        "smoker": 0.,
        "region_northeast": 0.,
        "region_northwest": 1.,
        "region_southeast": 0.,
        "region_southwest": 0.}

insurance_df = pd.DataFrame(data, index=[0])
insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,0.608696,0.0,0.107345,0.4,0.0,0.0,1.0,0.0,0.0


In [43]:
import warnings
warnings.filterwarnings("ignore")

In [44]:
new_pred = gr.predict(insurance_df)
print(new_pred)

[8420.4656202]


In [45]:
gr = GradientBoostingRegressor()
gr.fit(X, y)

In [46]:
import joblib

In [47]:
joblib.dump(gr, "model_gr_joblib")

['model_gr_joblib']

In [48]:
model = joblib.load("model_gr_joblib")

In [50]:
model.predict(insurance_df)

array([1818.43709089])