In [41]:
import pandas as pd


In [42]:
# Read the insurance dataset from the CSV next to the notebook and preview
# the first five rows (5 is the default for head()).
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [43]:
# Summarize the frame: column dtypes, non-null counts and memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [44]:
# Encode sex as a binary indicator: female -> 0, male -> 1.
gender = {'female': 0,
          'male': 1}

# Series.map is vectorized, but it yields NaN for any label missing from the
# mapping. Check for that explicitly and fail loudly (as the dict lookup would)
# so NaNs can never leak into the model, e.g. when this cell is run twice.
sex_encoded = df['sex'].map(gender)
if sex_encoded.isna().any():
    unexpected = df.loc[sex_encoded.isna(), 'sex'].unique().tolist()
    raise KeyError(f"unexpected values in 'sex' (was this cell already run?): {unexpected}")
df['sex'] = sex_encoded

df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,yes,southwest,16884.92400
1,18,1,33.770,1,no,southeast,1725.55230
2,28,1,33.000,3,no,southeast,4449.46200
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,no,northwest,10600.54830
1334,18,0,31.920,0,no,northeast,2205.98080
1335,18,0,36.850,0,no,southeast,1629.83350
1336,21,0,25.800,0,no,southwest,2007.94500


In [45]:
# Encode smoker as a binary indicator: yes -> 1, no -> 0.

smoker = {
    'yes': 1,
    'no': 0
}

# Series.map is vectorized, but it yields NaN for any label missing from the
# mapping. Check for that explicitly and fail loudly (as the dict lookup would)
# so NaNs can never leak into the model, e.g. when this cell is run twice.
smoker_encoded = df['smoker'].map(smoker)
if smoker_encoded.isna().any():
    unexpected = df.loc[smoker_encoded.isna(), 'smoker'].unique().tolist()
    raise KeyError(f"unexpected values in 'smoker' (was this cell already run?): {unexpected}")

# Bracket assignment always targets the column; attribute-style assignment
# (df.smoker = ...) only works when the column already exists.
df['smoker'] = smoker_encoded

df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,southwest,16884.92400
1,18,1,33.770,1,0,southeast,1725.55230
2,28,1,33.000,3,0,southeast,4449.46200
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.880,0,0,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,northwest,10600.54830
1334,18,0,31.920,0,0,northeast,2205.98080
1335,18,0,36.850,0,0,southeast,1629.83350
1336,21,0,25.800,0,0,southwest,2007.94500


In [46]:
# List the distinct region labels present in the data.
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [47]:
# Replace each region label with an integer code.
# NOTE: region is a nominal category, so these codes are arbitrary labels with
# no meaningful order or distance. Linear correlations or coefficients computed
# on them should be read with care; one-hot encoding (pd.get_dummies) is the
# usual choice if region is to be used as a model feature.
region = {
    'southwest': 0,
    'southeast': 1,
    'northwest': 2,
    'northeast': 3
}

# Series.map is vectorized, but it yields NaN for any label missing from the
# mapping. Check for that explicitly and fail loudly (as the dict lookup would)
# so NaNs can never leak into the model, e.g. when this cell is run twice.
region_encoded = df['region'].map(region)
if region_encoded.isna().any():
    unexpected = df.loc[region_encoded.isna(), 'region'].unique().tolist()
    raise KeyError(f"unexpected values in 'region' (was this cell already run?): {unexpected}")

# Bracket assignment always targets the column; attribute-style assignment
# (df.region = ...) only works when the column already exists.
df['region'] = region_encoded

df.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


In [48]:
# All columns are now numeric, so the data is ready to be analyzed with linear regression.

In [49]:
# Pairwise correlation matrix of all columns (Pearson is the pandas default method).
df.corr()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
age,1.0,-0.020856,0.109272,0.042469,-0.025019,-0.002127,0.299008
sex,-0.020856,1.0,0.046371,0.017163,0.076185,-0.004588,0.057292
bmi,0.109272,0.046371,1.0,0.012759,0.00375,-0.157566,0.198341
children,0.042469,0.017163,0.012759,1.0,0.007673,-0.016569,0.067998
smoker,-0.025019,0.076185,0.00375,0.007673,1.0,0.002181,0.787251
region,-0.002127,-0.004588,-0.157566,-0.016569,0.002181,1.0,0.006208
charges,0.299008,0.057292,0.198341,0.067998,0.787251,0.006208,1.0


In [50]:
# charges are strongly correlated with smoker (r ~ 0.79) and more weakly with
# age (~ 0.30) and bmi (~ 0.20). children (~ 0.07), sex (~ 0.06) and
# region (~ 0.01) are only weakly correlated with charges, so they are dropped.
# NOTE(review): region was encoded with arbitrary integer codes, so its Pearson
# correlation is not a reliable measure of relevance -- confirm before discarding it.

df = df.drop(columns=['sex', 'children', 'region'])

df.head()

Unnamed: 0,age,bmi,smoker,charges
0,19,27.9,1,16884.924
1,18,33.77,0,1725.5523
2,28,33.0,0,4449.462
3,33,22.705,0,21984.47061
4,32,28.88,0,3866.8552


In [51]:
# Split the data into the independent variables (X) and the dependent variable, charges (Y).
# Columns are selected by name rather than by position so the split stays
# correct if the column order or the set of retained features changes upstream,
# and so the target can never end up inside X.

X = df.drop(columns='charges')
Y = df['charges']

In [52]:
# Display the feature matrix.
X

Unnamed: 0,age,bmi,smoker
0,19,27.900,1
1,18,33.770,0
2,28,33.000,0
3,33,22.705,0
4,32,28.880,0
...,...,...,...
1333,50,30.970,0
1334,18,31.920,0
1335,18,36.850,0
1336,21,25.800,0


In [53]:
# Display the target series (charges).
Y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [54]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [55]:
# Report the shape of each split to confirm the 70/30 partition.
for label, split in (
    ("x_train: ", x_train),
    ("x_test: ", x_test),
    ("y_train: ", y_train),
    ("y_test: ", y_test),
):
    print(label, split.shape)

x_train:  (936, 3)
x_test:  (402, 3)
y_train:  (936,)
y_test:  (402,)


In [56]:
# Fit a linear regression model on the training split.
from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(x_train, y_train)

# Predicted charges for the held-out test rows.
predictions = model.predict(x_test)

In [57]:
# Mean squared error of the test-set predictions (in squared units of charges).
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=predictions)
mse

33920514.90536015

In [58]:
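# The mean squared error looks very large mainly because it is expressed in squared
# units of charges: its square root (RMSE) is about 5,824, which is on the same scale
# as the charges themselves. MSE is also sensitive to outliers.
# We will use the scale-free R^2 instead.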

In [59]:
# R^2 (coefficient of determination) of the model on the held-out test set.
model.score(x_test, y_test)

0.7872914737498632

In [60]:
# Fitted coefficients, in the column order of X (age, bmi, smoker).
model.coef_

array([  260.31215553,   321.78437125, 23435.03624238])

In [61]:
# Intercept of the fitted model (the prediction when every feature is 0).
model.intercept_

-11694.64763292871

In [62]:
# R^2 (about 0.79) is not especially high.
# This could be because:
# * The dataset is small and we need more instances to make more accurate predictions
# * The features are not sufficient and we need more explanatory features
# * The relationship between the explanatory features and the target is not linear

In [63]:
# One way to address the modest R^2 value is to use a polynomial regression model instead of a plain linear regression.