## Importing the Libraries


In [1]:
import numpy as np
import pandas as pd

## Importing the Dataset


In [10]:
dataset = pd.read_csv('./../Data/insurance.csv')

X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [11]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [12]:
X[1]

array([18, 'male', 33.77, 1, 'no', 'southeast'], dtype=object)

## Encoding Categorical Data


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 4, 5])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [17]:
print(X[1])

[0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 18 33.77 1]


## Spliting dataset into Training set & Test set

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [36]:
print(X_test)

[[0.0 1.0 1.0 ... 52 30.2 1]
 [1.0 0.0 1.0 ... 47 29.37 1]
 [0.0 1.0 0.0 ... 48 40.565 2]
 ...
 [0.0 1.0 1.0 ... 57 40.28 0]
 [1.0 0.0 0.0 ... 30 39.05 3]
 [0.0 1.0 1.0 ... 46 24.795 3]]


## Training the Regression Model

In [41]:
# from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
regressor.fit(X_train, y_train)

## Predicting Test set results

In [42]:
y_pred = regressor.predict(X_test)

np.printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[10209.0409122   9724.53      ]
 [ 8985.5932532   8547.6913    ]
 [44751.996095   45702.02235   ]
 [13236.0752505  12950.0712    ]
 [ 9990.7574888   9644.2525    ]
 [11309.542474    4500.33925   ]
 [ 2762.231898    2198.18985   ]
 [11900.2724711  11436.73815   ]
 [ 6985.42757     7537.1639    ]
 [ 8010.496016    5425.02335   ]
 [ 8247.7690142   6753.038     ]
 [17125.4506636  10493.9458    ]
 [ 7983.7320527   7337.748     ]
 [ 5819.9251207   4185.0979    ]
 [20224.2773501  18310.742     ]
 [13003.4332266  10702.6424    ]
 [13968.5121489  12523.6048    ]
 [ 7324.9147327   3490.5491    ]
 [ 6395.15969025  6457.8434    ]
 [34105.044447   33475.81715   ]
 [23970.510889   23967.38305   ]
 [13525.0811061  12643.3778    ]
 [11375.9483156  23045.56616   ]
 [25216.201393   23065.4207    ]
 [ 2828.0925106   1674.6323    ]
 [ 9023.16211     4667.60765   ]
 [ 5832.6078613   3732.6251    ]
 [ 8349.235128    7682.67      ]
 [ 3753.9278135   3756.6216    ]
 [11163.0062798   8413.46305   ]
 [ 8222.39

## Confusion Metrix

In [43]:
from sklearn.metrics import r2_score
accuracy = r2_score(y_test, y_pred)
print(accuracy)

0.883281905423329
