## Importing the Libraries


In [1]:
import numpy as np
import pandas as pd

## Importing the Dataset


In [10]:
dataset = pd.read_csv('./../Data/insurance.csv')

X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [11]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [12]:
X[1]

array([18, 'male', 33.77, 1, 'no', 'southeast'], dtype=object)

## Encoding Categorical Data


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 4, 5])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [17]:
print(X[1])

[0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 18 33.77 1]


## Spliting dataset into Training set & Test set

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [36]:
print(X_test)

[[0.0 1.0 1.0 ... 52 30.2 1]
 [1.0 0.0 1.0 ... 47 29.37 1]
 [0.0 1.0 0.0 ... 48 40.565 2]
 ...
 [0.0 1.0 1.0 ... 57 40.28 0]
 [1.0 0.0 0.0 ... 30 39.05 3]
 [0.0 1.0 1.0 ... 46 24.795 3]]


## Training the Regression Model

In [31]:
# from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=100, random_state=0)
regressor.fit(X_train, y_train)

## Predicting Test set results

In [32]:
y_pred = regressor.predict(X_test)

np.printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[10164.6967323   9724.53      ]
 [ 9177.111904    8547.6913    ]
 [44580.4595308  45702.02235   ]
 [13171.111583   12950.0712    ]
 [ 9970.0914809   9644.2525    ]
 [12016.2158971   4500.33925   ]
 [ 2669.33558573  2198.18985   ]
 [12184.6814594  11436.73815   ]
 [ 7706.3592945   7537.1639    ]
 [ 6502.257596    5425.02335   ]
 [ 8318.5367905   6753.038     ]
 [18415.7306961  10493.9458    ]
 [ 8451.2551823   7337.748     ]
 [ 6217.8067893   4185.0979    ]
 [22827.9884115  18310.742     ]
 [12682.0716995  10702.6424    ]
 [12922.7176253  12523.6048    ]
 [10511.226542    3490.5491    ]
 [ 6507.1262515   6457.8434    ]
 [34127.7315605  33475.81715   ]
 [24026.6191675  23967.38305   ]
 [13207.8398548  12643.3778    ]
 [10874.3870785  23045.56616   ]
 [26947.9827013  23065.4207    ]
 [ 3263.8774055   1674.6323    ]
 [ 9670.2212286   4667.60765   ]
 [ 9198.5885111   3732.6251    ]
 [ 8511.8529036   7682.67      ]
 [ 3979.5720499   3756.6216    ]
 [10879.878859    8413.46305   ]
 [ 7411.73

## Confusion Metrix

In [33]:
from sklearn.metrics import r2_score
accuracy = r2_score(y_test, y_pred)
print(accuracy)

0.8774128912084722
