# U.S. Medical Insurance Costs

# Importing Data

In [135]:
# import necessary libraries
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [70]:
# import data
dt = pd.read_csv('insurance.csv')

In [71]:
# take a first look at the data
print(dt.head())

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [72]:
print(dt.describe())

               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


In [118]:
# number of rows in database
R = dt.shape[0]
print(R)

1338


In [119]:
# it is better for analysis to code male as 0 and female as 1. Similarly
# lets code smoker as 1.
dt['smoker_code'] = 0
dt['sex_code'] = 0
for i in range(R):
    if dt.loc[i,'smoker'] == 'yes':
        dt.loc[i,'smoker_code'] = 1
    if dt.loc[i,'sex'] == 'female':
        dt.loc[i,'sex_code'] = 1
print(dt.head())

   age     sex     bmi  children smoker     region      charges age_group  \
0   19  female  27.900         0    yes  southwest  16884.92400     18-29   
1   18    male  33.770         1     no  southeast   1725.55230     18-29   
2   28    male  33.000         3     no  southeast   4449.46200     18-29   
3   33    male  22.705         0     no  northwest  21984.47061     30-49   
4   32    male  28.880         0     no  northwest   3866.85520     30-49   

    bmi_group  sex_code  smoker_code  
0  overweight         1            1  
1       obese         0            0  
2       obese         0            0  
3      normal         0            0  
4  overweight         0            0  


# Analysis

## We expect that facotrs like age, bmi, or smoker incraese the cost of insurance. Let us first look at the average cost of insurance for each of this factor. 

### Effect of age on cost

In [120]:
# To see the effect of age, fisrt we divide age to four categories. 
bins = [18,30,50,80]
labels = ['18-29','30-49','50-70']
dt['age_group'] = pd.cut(dt.age, bins, labels = labels, include_lowest=True)
        
print(dt.tail())        

      age     sex    bmi  children smoker     region     charges age_group  \
1333   50    male  30.97         3     no  northwest  10600.5483     30-49   
1334   18  female  31.92         0     no  northeast   2205.9808     18-29   
1335   18  female  36.85         0     no  southeast   1629.8335     18-29   
1336   21  female  25.80         0     no  southwest   2007.9450     18-29   
1337   61  female  29.07         0    yes  northwest  29141.3603     50-70   

       bmi_group  sex_code  smoker_code  
1333       obese         0            0  
1334       obese         1            0  
1335       obese         1            0  
1336  overweight         1            0  
1337  overweight         1            1  


In [131]:
# average cost group by age
da = dt.groupby(['age_group']).agg({'charges':['mean'], 'age':'count'}).reset_index()
print(da)

  age_group       charges   age
                     mean count
0     18-29   9397.552051   444
1     30-49  13280.774031   538
2     50-70  18084.987223   356


We obsereve that the cost of insurance is higher for older participants. We also observe that most of the participants in the data belong to the age group of 30 to 49. 

## Effect of smoking on cost

In [123]:
# avergate cost by smoking status
ds = dt.groupby(['smoker']).agg({'charges':['mean']}).reset_index()
print(ds)

  smoker       charges
                  mean
0     no   8434.268298
1    yes  32050.231832


We observe that smoking increase the cost of insurance rather significantly. 

## Effect of bmi on cost

In [124]:
# To see the effect of bmi, fisrt we divide bmi to four categories. 

bins = [10,18.5,25,30,50]
labels = ['underweight','normal','overweight','obese']
dt['bmi_group'] = pd.cut(dt.bmi, bins, labels = labels, include_lowest=True)
        
print(dt.tail())  


      age     sex    bmi  children smoker     region     charges age_group  \
1333   50    male  30.97         3     no  northwest  10600.5483     30-49   
1334   18  female  31.92         0     no  northeast   2205.9808     18-29   
1335   18  female  36.85         0     no  southeast   1629.8335     18-29   
1336   21  female  25.80         0     no  southwest   2007.9450     18-29   
1337   61  female  29.07         0    yes  northwest  29141.3603     50-70   

       bmi_group  sex_code  smoker_code  
1333       obese         0            0  
1334       obese         1            0  
1335       obese         1            0  
1336  overweight         1            0  
1337  overweight         1            1  


In [125]:
# avergate cost by bmi status
db = dt.groupby(['bmi_group']).agg({'charges':['mean']}).reset_index()
print(db)

     bmi_group       charges
                        mean
0  underweight   8657.620652
1       normal  10435.440719
2   overweight  10997.803881
3        obese  15558.903334


We observe that bmi, like smoking and age are positively correlated to insurance cost.

## Do male pay more or less in average for health insurance?

In [133]:
# average cost between male and female
dg = dt.groupby(['sex']).agg({'charges':'mean', 'smoker':'count','age':'mean','bmi':'mean'}).reset_index()
print(dg)

      sex       charges  smoker        age        bmi
0  female  12569.578844     662  39.503021  30.377749
1    male  13956.751178     676  38.917160  30.943129


We observe that men seem to pay more in average, however, data suggest that this could be because there were more men smoker in the data than there are women.

## Let us now look if insurance cost varies in different regions.

In [130]:
dr = dt.groupby('region').agg({'charges':'mean','smoker_code':'sum','sex_code':'sum','bmi':'mean', 'region':'count'})
dr['number of males'] = dr['region'] - dr['sex_code']
print(dr.sort_values(by=['charges']))

                charges  smoker_code  sex_code        bmi  region  \
region                                                              
southwest  12346.937377           58       162  30.596615     325   
northwest  12417.575374           58       164  29.199785     325   
northeast  13406.384516           67       161  29.173503     324   
southeast  14735.411438           91       175  33.355989     364   

           number of males  
region                      
southwest              163  
northwest              161  
northeast              163  
southeast              189  


We observe that the average cost is highest in southeast and lowest in southwest. One reason for that could be because there are more smokers in southeast than there are in southwest. Also, the average bmi is highest in southeast compare to other three regions. 