In [1]:
#libraries
import sklearn as sk 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#load in raw data
#data can be found here: https://www.kaggle.com/mirichoi0218/insurance
insurance_raw = pd.read_csv('insurance.csv')

In [3]:
#save into a table that I will manipulate
insurance = insurance_raw

In [4]:
#gather basic info about the table
print(insurance.head())

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [5]:
print(insurance.dtypes)

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object


In [6]:
print(insurance.describe())

               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


In [7]:
#correlation matrix
#which variables will be useful for the linear regression
print(insurance.corr())

               age       bmi  children   charges
age       1.000000  0.109272  0.042469  0.299008
bmi       0.109272  1.000000  0.012759  0.198341
children  0.042469  0.012759  1.000000  0.067998
charges   0.299008  0.198341  0.067998  1.000000


In [8]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(insurance['region'].unique())
insurance['region'] = le.transform(insurance['region'])

le.fit(insurance['smoker'].unique())
insurance['smoker'] = le.transform(insurance['smoker'])

le.fit(insurance['sex'].unique())
insurance['sex'] = le.transform(insurance['sex'])


In [9]:
print(insurance.head())

   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520


In [10]:
print(insurance.corr())

               age       sex       bmi  children    smoker    region   charges
age       1.000000 -0.020856  0.109272  0.042469 -0.025019  0.002127  0.299008
sex      -0.020856  1.000000  0.046371  0.017163  0.076185  0.004588  0.057292
bmi       0.109272  0.046371  1.000000  0.012759  0.003750  0.157566  0.198341
children  0.042469  0.017163  0.012759  1.000000  0.007673  0.016569  0.067998
smoker   -0.025019  0.076185  0.003750  0.007673  1.000000 -0.002181  0.787251
region    0.002127  0.004588  0.157566  0.016569 -0.002181  1.000000 -0.006208
charges   0.299008  0.057292  0.198341  0.067998  0.787251 -0.006208  1.000000


In [11]:
#seperate data into X and y
X = insurance[['age', 'bmi','smoker']]
y = insurance['charges']

In [12]:
#split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [13]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)
reg.score(X, y)

0.74717962952283

In [14]:
#fit the model
predicted_charges = reg.predict(X_test)

In [15]:
#calculate the error
from sklearn.metrics import r2_score
r2 = r2_score(y_test, predicted_charges)
print(r2)

0.7337859335608572
