# U.S. Medical Insurance Costs

In [1]:
#In this project, we are going to analyze the medical insurance costs in the US.
#The main purposes of the analysis are: 
#   -estimatation of insurance costs and the impact that different factors have on them 
#   -comparison between these estimates with the actual insurance costs

#As first step, we want to extract data from a csv and turn it into a dictionary:

import csv

def csv_to_list(filename):
    insurance_list = []
    with open(filename, newline = '') as insurance_csv:
        insurance_dict_reader = csv.DictReader(insurance_csv)    #csv.DictReader() allows us to convert each line of the csv in a dictionary
        for row in insurance_dict_reader:
            insurance_list.append(dict(row))
    return insurance_list
insurance_list = csv_to_list('insurance.csv')
#print(insurance_list)


In [2]:
#Analysis key points:

#1. Average insurance cost --> call: average_medical_insurance_cost(csv_to_list)

#2. Average insurance cost by category(smoker vs non-smoker)  --> call: average_medical_insurance_cost_by_category(csv_to_list)

#3. Average insurance cost % change across age classes      --> call: avg_insurance_by_age_classes(csv_to_list)
    #3.1 Average insurance cost % change across age classes (non-smoker only) --> call: avg_insurance_by_age_classes_non_smokers(csv_to_list)
    #3.2 Average insurance cost % change across age classes (smoker only)     --> call: avg_insurance_by_age_classes_smokers(csv_to_list)

#4. Smoke impact by age class  --> call: smoke_impact(avg_insurance_by_age_classes_smokers(csv_to_list), avg_insurance_by_age_classes_non_smokers(csv_to_list))

#5. Age impact:    
    #5.1 Age impact for non-smokers  --> call: age_impact_for_non_smokers(avg_insurance_by_age_classes_non_smokers(csv_to_list))
    #5.2 Age impact for smokers  --> call: age_impact_for_smokers(avg_insurance_by_age_classes_non_smokers(csv_to_list))

#6. Average insurance cost by region (non-smoker only) --> call: 

#7. Percentage of people with bmi in a healty range

#8. BMI impact (non-smoker only)

#9. Percentage of smokers

In [3]:
#Formula to estimate the insurance cost:
#estimated_insurance_cost = 250*age - 128*sex + 370*bmi + 425*num_of_children + 24000*smoker - 12500

In [4]:
#1. Average insurance cost (actual)

def average_medical_insurance_cost(lst):
    total_insurance_cost = 0

    for dictionary in insurance_list:
        total_insurance_cost += float(dictionary['charges'])

    population = 0

    for dictionary in insurance_list:
        population += 1

    average_cost = total_insurance_cost/population
    print("The average yearly medical insurance cost in the US is: $" + str(round(average_cost,2)))


In [5]:
average_medical_insurance_cost(csv_to_list)

The average yearly medical insurance cost in the US is: $13270.42


In [6]:
#2.1. Average insurance cost by category (smoker vs non-smoker) - 20 y.o.

def average_medical_insurance_cost_by_category(lst):
    smokers_list = []
    for dictionary in insurance_list:
        if dictionary['smoker'] == 'yes':
            smokers_list.append(dictionary)

    non_smokers_list = []
    for dictionary in insurance_list:
        if dictionary['smoker'] == 'no':
            non_smokers_list.append(dictionary)
    
    smoker_dict = {}
    non_smoker_dict = {}
    for non_smoker in non_smokers_list:
        for smoker in smokers_list:
            if 17 < float(non_smoker['age']) <=20 and 17 < float(smoker['age']) <=20 and non_smoker['sex'] == smoker['sex'] and non_smoker['children'] == smoker['children'] and non_smoker['region'] == smoker['region'] and 22 < float(non_smoker['bmi']) <= 24 and 22 < float(smoker['bmi']) <= 24:
                smoker_dict.update(smoker)
                non_smoker_dict.update(non_smoker)
    smoke_impact = {'difference':[float(smoker_dict['charges'])-float(non_smoker_dict['charges']), 100.0*float(smoker_dict['charges'])/float(non_smoker_dict['charges'])-1] for key, value in smoker_dict.items()}
    print({'smoker':smoker_dict})
    print({'non-smoker':non_smoker_dict})
    for key, value in smoke_impact.items():
        print('The smoke impact over the medical insurance cost of a 20 y.o. is $:' +str(round(value[0],2)) +  ' or +' + str(round(value[1],2)) +'%.')
    

In [7]:
average_medical_insurance_cost_by_category(csv_to_list)

{'smoker': {'age': '20', 'sex': 'female', 'bmi': '22.42', 'children': '0', 'smoker': 'yes', 'region': 'northwest', 'charges': '14711.7438'}}
{'non-smoker': {'age': '19', 'sex': 'female', 'bmi': '22.515', 'children': '0', 'smoker': 'no', 'region': 'northwest', 'charges': '2117.33885'}}
The smoke impact over the medical insurance cost of a 20 y.o. is $:12594.4 or +693.82%.


In [8]:
#2.2. Average insurance cost by category (smoker vs non-smoker) - 40 y.o.

def average_medical_insurance_cost_by_category(lst):
    smokers_list = []
    for dictionary in insurance_list:
        if dictionary['smoker'] == 'yes':
            smokers_list.append(dictionary)

    non_smokers_list = []
    for dictionary in insurance_list:
        if dictionary['smoker'] == 'no':
            non_smokers_list.append(dictionary)
    
    smoker_dict = {}
    non_smoker_dict = {}
    for non_smoker in non_smokers_list:
        for smoker in smokers_list:
            if 39 < float(non_smoker['age']) <=45 and 39 < float(smoker['age']) <=45 and non_smoker['sex'] == smoker['sex'] and non_smoker['children'] == smoker['children'] and non_smoker['region'] == smoker['region'] and 25 < float(non_smoker['bmi']) <= 27 and 25 < float(smoker['bmi']) <= 27:
                smoker_dict.update(smoker)
                non_smoker_dict.update(non_smoker)
    smoke_impact = {'difference':[float(smoker_dict['charges'])-float(non_smoker_dict['charges']), 100.0*float(smoker_dict['charges'])/float(non_smoker_dict['charges'])-1] for key, value in smoker_dict.items()}
    print({'smoker':smoker_dict})
    print({'non-smoker':non_smoker_dict})
    for key, value in smoke_impact.items():
        print('The smoke impact over the medical insurance cost of a 44 y.o. is $:' +str(round(value[0],2)) +  ' or +' + str(round(value[1],2)) +'%.')
    

In [9]:
average_medical_insurance_cost_by_category(csv_to_list)

{'smoker': {'age': '43', 'sex': 'female', 'bmi': '26.885', 'children': '0', 'smoker': 'yes', 'region': 'northwest', 'charges': '21774.32215'}}
{'non-smoker': {'age': '44', 'sex': 'female', 'bmi': '26.41', 'children': '0', 'smoker': 'no', 'region': 'northwest', 'charges': '7419.4779'}}
The smoke impact over the medical insurance cost of a 44 y.o. is $:14354.84 or +292.48%.


In [10]:
#2.3. Average insurance cost by category (smoker vs non-smoker) - 50 y.o.

def average_medical_insurance_cost_by_category(lst):
    smokers_list = []
    for dictionary in insurance_list:
        if dictionary['smoker'] == 'yes':
            smokers_list.append(dictionary)

    non_smokers_list = []
    for dictionary in insurance_list:
        if dictionary['smoker'] == 'no':
            non_smokers_list.append(dictionary)
    
    smoker_dict = {}
    non_smoker_dict = {}
    for non_smoker in non_smokers_list:
        for smoker in smokers_list:
            if 50 < float(non_smoker['age']) <=53 and 50 < float(smoker['age']) <=53 and non_smoker['sex'] == smoker['sex'] and non_smoker['children'] == smoker['children'] and non_smoker['region'] == smoker['region'] and 27 < float(non_smoker['bmi']) <= 30 and 27 < float(smoker['bmi']) <= 30:
                smoker_dict.update(smoker)
                non_smoker_dict.update(non_smoker)
    smoke_impact = {'difference':[float(smoker_dict['charges'])-float(non_smoker_dict['charges']), 100.0*float(smoker_dict['charges'])/float(non_smoker_dict['charges'])-1] for key, value in smoker_dict.items()}
    print({'smoker':smoker_dict})
    print({'non-smoker':non_smoker_dict})
    for key, value in smoke_impact.items():
        print('The smoke impact over the medical insurance cost of a 53 y.o. is $:' +str(round(value[0],2)) +  ' or +' + str(round(value[1],2)) +'%.')
    

In [11]:
average_medical_insurance_cost_by_category(csv_to_list)

{'smoker': {'age': '52', 'sex': 'male', 'bmi': '27.36', 'children': '0', 'smoker': 'yes', 'region': 'northwest', 'charges': '24393.6224'}}
{'non-smoker': {'age': '53', 'sex': 'male', 'bmi': '28.88', 'children': '0', 'smoker': 'no', 'region': 'northwest', 'charges': '9869.8102'}}
The smoke impact over the medical insurance cost of a 53 y.o. is $:14523.81 or +246.15%.


In [12]:
#Looking at the results on smoke impact, we can say that the medical insurance cost for a smoker is approximately $14000 higher if compared to a non-smoker of the same age,sex and region and with the same bmi and n.of children.

In [13]:
#We want to divide the average insurance cost by age classes, therefore let's look first at what are the minimum and the maximum ages of the population.

max_age = 0
for dictionary in insurance_list:
    if float(dictionary['age']) > max_age:
        max_age = int(float(dictionary['age']))

min_age = float('inf')
for dictionary in insurance_list:
    if float(dictionary['age']) < min_age:
        min_age = int(float(dictionary['age']))

print('Minimum age: ' + str(min_age))
print('Maximum age: ' + str(max_age))

Minimum age: 18
Maximum age: 64


In [14]:
#3. Average insurance cost % change across age classes

def avg_insurance_by_age_classes(lst):
    age_classes = {'18-25': [], '26-35': [], '36-45': [], '46-55': [], '56-64': []}
    for dictionary in insurance_list:
        total_insurance_cost = 0
        population = 0
        if float(dictionary['age']) >=18 and float(dictionary['age']) <26:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['18-25'].append(total_insurance_cost)
        elif float(dictionary['age']) >=26 and float(dictionary['age']) <35:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['26-35'].append(total_insurance_cost)
        elif float(dictionary['age']) >=35 and float(dictionary['age']) <46:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['36-45'].append(total_insurance_cost)
        elif float(dictionary['age']) >=46 and float(dictionary['age']) <55:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['46-55'].append(total_insurance_cost)
        elif float(dictionary['age']) >=55 and float(dictionary['age']) <65:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['56-64'].append(total_insurance_cost)
    
    average = {key:round(sum(value)/float(len(value)),2) for key, value in age_classes.items()}

    return average

In [15]:
avg_insurance_by_age_classes(csv_to_list)

{'18-25': 9087.02,
 '26-35': 10411.62,
 '36-45': 13304.36,
 '46-55': 15969.0,
 '56-64': 18513.28}

In [16]:
#3.2 Average insurance cost % change between age classes (non-smoker only)

def avg_insurance_by_age_classes_non_smokers(lst):
    age_classes = {'18-25': [], '26-35': [], '36-45': [], '46-55': [], '56-64': []}
    for dictionary in insurance_list:
        total_insurance_cost = 0
        population = 0
        if dictionary['smoker'] == 'no' and float(dictionary['age']) >=18 and float(dictionary['age']) <26:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['18-25'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'no' and float(dictionary['age']) >=26 and float(dictionary['age']) <35:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['26-35'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'no' and float(dictionary['age']) >=35 and float(dictionary['age']) <46:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['36-45'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'no' and float(dictionary['age']) >=46 and float(dictionary['age']) <55:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['46-55'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'no' and float(dictionary['age']) >=55 and float(dictionary['age']) <65:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['56-64'].append(total_insurance_cost)
    
    non_smoker_average = {key:round(sum(value)/float(len(value)),2) for key, value in age_classes.items()}

    return non_smoker_average

In [17]:
avg_insurance_by_age_classes_non_smokers(csv_to_list)

{'18-25': 4003.92,
 '26-35': 5659.32,
 '36-45': 7759.57,
 '46-55': 11438.35,
 '56-64': 14064.83}

In [18]:
#3.3 Average insurance cost % change between age classes (smoker only)

def avg_insurance_by_age_classes_smokers(lst):
    age_classes = {'18-25': [], '26-35': [], '36-45': [], '46-55': [], '56-64': []}
    for dictionary in insurance_list:
        total_insurance_cost = 0
        population = 0
        if dictionary['smoker'] == 'yes' and float(dictionary['age']) >=18 and float(dictionary['age']) <26:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['18-25'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'yes' and float(dictionary['age']) >=26 and float(dictionary['age']) <35:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['26-35'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'yes' and float(dictionary['age']) >=35 and float(dictionary['age']) <46:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['36-45'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'yes' and float(dictionary['age']) >=46 and float(dictionary['age']) <55:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['46-55'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'yes' and float(dictionary['age']) >=55 and float(dictionary['age']) <65:
            total_insurance_cost += float(dictionary['charges'])
            population += 1
            average_insurance_cost = total_insurance_cost/population
            age_classes['56-64'].append(total_insurance_cost)
    
    smoker_average = {key:round(sum(value)/float(len(value)),2) for key, value in age_classes.items()}

    return smoker_average

In [19]:
avg_insurance_by_age_classes_smokers(csv_to_list)

{'18-25': 27933.56,
 '26-35': 28302.63,
 '36-45': 32039.02,
 '46-55': 34816.51,
 '56-64': 39696.37}

In [20]:
#4. Smoke impact by age class

def smoke_impact(dict2, dict1):
    difference = {key:[dict2[key]-dict1[key], 100.0*dict2[key]/dict1[key]-1] for key, value in dict2.items()}
    for key, value in difference.items():
        print('Within the age class <' + key + '>, the medical insurance cost is $' + str(round(value[0],2)) + ' higher for the smokers category compared to the non-smokers, or +'+ str(round(value[1],2)) + '%.')

In [21]:
smoke_impact(avg_insurance_by_age_classes_smokers(csv_to_list), avg_insurance_by_age_classes_non_smokers(csv_to_list))

Within the age class <18-25>, the medical insurance cost is $23929.64 higher for the smokers category compared to the non-smokers, or +696.66%.
Within the age class <26-35>, the medical insurance cost is $22643.31 higher for the smokers category compared to the non-smokers, or +499.11%.
Within the age class <36-45>, the medical insurance cost is $24279.45 higher for the smokers category compared to the non-smokers, or +411.9%.
Within the age class <46-55>, the medical insurance cost is $23378.16 higher for the smokers category compared to the non-smokers, or +303.38%.
Within the age class <56-64>, the medical insurance cost is $25631.54 higher for the smokers category compared to the non-smokers, or +281.24%.


In [22]:
#5.1. Age impact for non-smokers:

def age_impact_for_non_smokers(dict1):
    age_classes = {'18-25': 0, '26-35': 0, '36-45': 0, '46-55': 0,'56-64': 0}
    values_list = []
    for value in dict1.values():
        values_list.append(value)
    difference_list = [100.0*j/i-1 for i, j in zip(values_list[:-1], values_list[1:])]    #metodo per fare la differenza tra due valori adiacenti in una lista
    difference_list.insert(0, 0)               #list.insert(position, value) mi consente di inserire un valore in una posizione da me indicata (in questo caso valore 0 in posizione 0)
    count_difference = 0
    for i in difference_list:
        count_difference += i
    average_difference = count_difference/len(difference_list)
    print('On average, every 10 years the medical insurance for a non-smoker person increase by ' + str(round(average_difference,2)) + '%.')

In [23]:
age_impact_for_non_smokers(avg_insurance_by_age_classes_non_smokers(csv_to_list))

On average, every 10 years the medical insurance for a non-smoker person increase by 108.97%.


In [24]:
#5.2. Age impact for smokers:

def age_impact_for_smokers(dict2):
    age_classes = {'18-25': 0, '26-35': 0, '36-45': 0, '46-55': 0,'56-64': 0}
    values_list = []
    for value in dict2.values():
        values_list.append(value)
    difference_list = [100.0*j/i-1 for i, j in zip(values_list[:-1], values_list[1:])]    #metodo per fare la differenza tra due valori adiacenti in una lista
    difference_list.insert(0, 0)               #list.insert(position, value) mi consente di inserire un valore in una posizione da me indicata (in questo caso valore 0 in posizione 0)
    count_difference = 0
    for i in difference_list:
        count_difference += i
    average_difference = count_difference/len(difference_list)
    print('On average, every 10 years the medical insurance for a smoker person increase by ' + str(round(average_difference,2)) + '%.')

In [25]:
age_impact_for_smokers(avg_insurance_by_age_classes_smokers(csv_to_list))

On average, every 10 years the medical insurance for a smoker person increase by 86.64%.


In [26]:
#6. Average insurance cost by region (non-smoker only)
def avg_insurance_cost_by_region(lst):
    regions = []
    for dictionary in insurance_list:
        for key, value in dictionary.items():
            if key == 'region' and dictionary['region'] not in regions:
                regions.append(dictionary['region'])
    region_dictionary = {region:[] for region in regions}
    
    for dictionary in insurance_list:
        total_insurance_cost = 0
        if dictionary['smoker'] == 'no' and dictionary['region'] == 'southwest':
            total_insurance_cost += float(dictionary['charges'])
            region_dictionary['southwest'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'no' and dictionary['region'] == 'southeast':
            total_insurance_cost += float(dictionary['charges'])
            region_dictionary['southeast'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'no' and dictionary['region'] == 'northwest':
            total_insurance_cost += float(dictionary['charges'])
            region_dictionary['northwest'].append(total_insurance_cost)
        elif dictionary['smoker'] == 'no' and dictionary['region'] == 'northeast':
            total_insurance_cost += float(dictionary['charges'])
            region_dictionary['northeast'].append(total_insurance_cost)
    
    average_by_region = {key:round(sum(value)/float(len(value)),2) for key, value in region_dictionary.items()}
    
    highest_average = 0
    for region, average in average_by_region.items():
        if average > highest_average:
            highest_average = average
            region_name = region
    region_with_highest_average = {region_name:highest_average}
    
    print('The region with the highest average is <' + region_name + '>, with an average of $' + str(highest_average))
    return average_by_region

In [27]:
avg_insurance_cost_by_region(csv_to_list)

The region with the highest average is <northeast>, with an average of $9165.53


{'southwest': 8019.28,
 'southeast': 8032.22,
 'northwest': 8556.46,
 'northeast': 9165.53}

In [28]:
#why does the <northeast> have a higher average?
#The reason could be:
#Older people
#Higher BMI
#More men than women
#A mix of the above reasons

#As next step we will need to test these assumptions