# U.S. Medical Insurance Costs

<hr>

## Project Goals
- Find the average age
- Analyze where the majority of the patients are from
- Look in the impact of smoking
- Find the impact of having children
- Find the average age of people who has children<br>
[...]

In [1]:
import csv

insurance_data_rows = []
with open('insurance.csv') as insurance_csv:
    insurance_data = csv.DictReader(insurance_csv)
    
    for row in insurance_data:
        insurance_data_rows.append(row)
        print(f'Age: {row['age']}, Sex: {row['sex']} BMI: {row['bmi']}, Children: {row['children']}, {row['smoker']}, Region: {row['region']}, Cost: {row['charges']}')

Age: 19, Sex: female BMI: 27.9, Children: 0, yes, Region: southwest, Cost: 16884.924
Age: 18, Sex: male BMI: 33.77, Children: 1, no, Region: southeast, Cost: 1725.5523
Age: 28, Sex: male BMI: 33, Children: 3, no, Region: southeast, Cost: 4449.462
Age: 33, Sex: male BMI: 22.705, Children: 0, no, Region: northwest, Cost: 21984.47061
Age: 32, Sex: male BMI: 28.88, Children: 0, no, Region: northwest, Cost: 3866.8552
Age: 31, Sex: female BMI: 25.74, Children: 0, no, Region: southeast, Cost: 3756.6216
Age: 46, Sex: female BMI: 33.44, Children: 1, no, Region: southeast, Cost: 8240.5896
Age: 37, Sex: female BMI: 27.74, Children: 3, no, Region: northwest, Cost: 7281.5056
Age: 37, Sex: male BMI: 29.83, Children: 2, no, Region: northeast, Cost: 6406.4107
Age: 60, Sex: female BMI: 25.84, Children: 0, no, Region: northwest, Cost: 28923.13692
Age: 25, Sex: male BMI: 26.22, Children: 0, no, Region: northeast, Cost: 2721.3208
Age: 62, Sex: female BMI: 26.29, Children: 0, yes, Region: southeast, Cost: 

In [2]:
#insurance_data_rows
def find_average_age(dictionary):
    average_age = 0
    for row in dictionary:
        average_age += int(row['age'])

    average_age /= len(dictionary)
    return average_age

average_age = find_average_age(insurance_data_rows)
print(f'The average age of the patients is {round(average_age, 2)}')

The average age of the patients is 39.21


In [3]:
def region_counter(dictionary):
    region_counter = {}

    for row in dictionary:
        if row['region'] not in region_counter:
            region_counter[row['region']] = 1
        else:
            region_counter[row['region']] += 1

    return region_counter

region_counter(insurance_data_rows)

{'southwest': 325, 'southeast': 364, 'northwest': 325, 'northeast': 324}

In [4]:
def find_majority_region(dictionary):
    counter = region_counter(dictionary)
    region = counter.keys()
    count = counter.values()
    count_region = list(zip(count, region))
    count_region.sort(reverse=True)

    return count_region[0]

count, region = find_majority_region(insurance_data_rows)
print(f'Region with the majority of patients: {region}, with {count} patients')

Region with the majority of patients: southeast, with 364 patients


In [5]:
def organizing_by_smoking(dictionary):
    return {'Smoker': [row['charges'] for row in dictionary if row['smoker'].lower() == 'yes'], 'Non-smoker': [row['charges'] for row in dictionary if row['smoker'].lower() == 'no']}

organizing_by_smoking(insurance_data_rows)

{'Smoker': ['16884.924',
  '27808.7251',
  '39611.7577',
  '36837.467',
  '37701.8768',
  '38711',
  '35585.576',
  '51194.55914',
  '39774.2763',
  '48173.361',
  '38709.176',
  '23568.272',
  '37742.5757',
  '47496.49445',
  '34303.1672',
  '23244.7902',
  '14711.7438',
  '17663.1442',
  '16577.7795',
  '37165.1638',
  '39836.519',
  '21098.55405',
  '43578.9394',
  '30184.9367',
  '47291.055',
  '22412.6485',
  '15820.699',
  '30942.1918',
  '17560.37975',
  '47055.5321',
  '19107.7796',
  '39556.4945',
  '17081.08',
  '32734.1863',
  '18972.495',
  '20745.9891',
  '40720.55105',
  '19964.7463',
  '21223.6758',
  '15518.18025',
  '36950.2567',
  '21348.706',
  '36149.4835',
  '48824.45',
  '43753.33705',
  '37133.8982',
  '20984.0936',
  '34779.615',
  '19515.5416',
  '19444.2658',
  '17352.6803',
  '38511.6283',
  '29523.1656',
  '12829.4551',
  '47305.305',
  '44260.7499',
  '41097.16175',
  '43921.1837',
  '33750.2918',
  '17085.2676',
  '24869.8368',
  '36219.40545',
  '46151.12

In [6]:
def diff_cost_smoking(dictionary):
    smoker_dict = organizing_by_smoking(dictionary)

    smoker_cost = 0
    non_smoker_cost = 0
    
    for cost in smoker_dict['Smoker']:
        smoker_cost += float(cost)
    
    for cost in smoker_dict['Non-smoker']:
        non_smoker_cost += float(cost)

    smoker_cost /= len(smoker_dict['Smoker'])
    non_smoker_cost /= len(smoker_dict['Non-smoker'])

    return smoker_cost - non_smoker_cost

diff = diff_cost_smoking(insurance_data_rows)
print(f'Smoking costs {diff:.2f} dollars more than non-smoking')

Smoking costs 23615.96 dollars more than non-smoking


In [7]:
def organizing_by_children(dictionary):
    children_dict = {}

    for row in dictionary:
        if row['children'] not in children_dict:
            children_dict[row['children']] = [row['charges']]
        else:
            children_dict[row['children']].append(row['charges'])

    return children_dict

organizing_by_children(insurance_data_rows)

{'0': ['16884.924',
  '21984.47061',
  '3866.8552',
  '3756.6216',
  '28923.13692',
  '2721.3208',
  '27808.7251',
  '1826.843',
  '11090.7178',
  '39611.7577',
  '2395.17155',
  '10602.385',
  '36837.467',
  '13228.84695',
  '1137.011',
  '14451.83515',
  '35585.576',
  '2198.18985',
  '13770.0979',
  '1625.43375',
  '2302.3',
  '48173.361',
  '3046.062',
  '20630.28351',
  '3556.9223',
  '12629.8967',
  '2211.13075',
  '37742.5757',
  '14711.7438',
  '1743.214',
  '5920.1041',
  '16577.7795',
  '11356.6609',
  '1532.4697',
  '2755.02095',
  '6571.02435',
  '7935.29115',
  '43578.9394',
  '11073.176',
  '8026.6666',
  '11082.5772',
  '2026.9741',
  '10226.2842',
  '22412.6485',
  '15820.699',
  '6186.127',
  '3645.0894',
  '21344.8467',
  '2867.1196',
  '47055.5321',
  '4646.759',
  '2404.7338',
  '30259.99556',
  '11381.3254',
  '8601.3293',
  '1705.6245',
  '2257.47525',
  '3385.39915',
  '17081.08',
  '9634.538',
  '12815.44495',
  '13616.3586',
  '1632.56445',
  '2457.21115',
  '2

In [8]:
def calculate_stats_children(dictionary):
    children_dict = organizing_by_children(dictionary)
    diff = {}
    avg = {}

    for num_children, costs in children_dict.items():
        # num_children = children_dict.keys()[i]
        # costs = children_dict.values()[i]
        avg[num_children] = 0
        for cost in costs:
            avg[num_children] += float(cost)

        avg[num_children] /= len(costs)

    for num_children1, avg1 in avg.items():
        # num_children1 = avg.keys()[i]
        for num_children2, avg2 in avg.items():
            diff[f'avg({num_children1}) - avg({num_children2})'] = avg1 - avg2

    #print(avg)
    return avg, diff

In [9]:
avg, diff = calculate_stats_children(insurance_data_rows)
print(avg)
print(diff)

{'0': 12365.975601635882, '1': 12731.171831635793, '3': 15355.31836681528, '2': 15073.563733958328, '5': 8786.035247222222, '4': 13850.656311199999}
{'avg(0) - avg(0)': 0.0, 'avg(0) - avg(1)': -365.19622999991043, 'avg(0) - avg(3)': -2989.3427651793972, 'avg(0) - avg(2)': -2707.5881323224457, 'avg(0) - avg(5)': 3579.9403544136603, 'avg(0) - avg(4)': -1484.6807095641161, 'avg(1) - avg(0)': 365.19622999991043, 'avg(1) - avg(1)': 0.0, 'avg(1) - avg(3)': -2624.146535179487, 'avg(1) - avg(2)': -2342.3919023225353, 'avg(1) - avg(5)': 3945.1365844135707, 'avg(1) - avg(4)': -1119.4844795642057, 'avg(3) - avg(0)': 2989.3427651793972, 'avg(3) - avg(1)': 2624.146535179487, 'avg(3) - avg(3)': 0.0, 'avg(3) - avg(2)': 281.7546328569515, 'avg(3) - avg(5)': 6569.283119593058, 'avg(3) - avg(4)': 1504.6620556152811, 'avg(2) - avg(0)': 2707.5881323224457, 'avg(2) - avg(1)': 2342.3919023225353, 'avg(2) - avg(3)': -281.7546328569515, 'avg(2) - avg(2)': 0.0, 'avg(2) - avg(5)': 6287.528486736106, 'avg(2) - a

In [10]:
for key, value in avg.items():
    print(f'Average for patients with {key} children: {value:.2f} dollars')

Average for patients with 0 children: 12365.98 dollars
Average for patients with 1 children: 12731.17 dollars
Average for patients with 3 children: 15355.32 dollars
Average for patients with 2 children: 15073.56 dollars
Average for patients with 5 children: 8786.04 dollars
Average for patients with 4 children: 13850.66 dollars


In [11]:
for key, value in diff.items():
    print(f'Difference {key} children: {value:.2f} dollars')

Difference avg(0) - avg(0) children: 0.00 dollars
Difference avg(0) - avg(1) children: -365.20 dollars
Difference avg(0) - avg(3) children: -2989.34 dollars
Difference avg(0) - avg(2) children: -2707.59 dollars
Difference avg(0) - avg(5) children: 3579.94 dollars
Difference avg(0) - avg(4) children: -1484.68 dollars
Difference avg(1) - avg(0) children: 365.20 dollars
Difference avg(1) - avg(1) children: 0.00 dollars
Difference avg(1) - avg(3) children: -2624.15 dollars
Difference avg(1) - avg(2) children: -2342.39 dollars
Difference avg(1) - avg(5) children: 3945.14 dollars
Difference avg(1) - avg(4) children: -1119.48 dollars
Difference avg(3) - avg(0) children: 2989.34 dollars
Difference avg(3) - avg(1) children: 2624.15 dollars
Difference avg(3) - avg(3) children: 0.00 dollars
Difference avg(3) - avg(2) children: 281.75 dollars
Difference avg(3) - avg(5) children: 6569.28 dollars
Difference avg(3) - avg(4) children: 1504.66 dollars
Difference avg(2) - avg(0) children: 2707.59 dollar

In [12]:
def avg_cost_children(dictionary):
    children_dict = organizing_by_children(dictionary)
    no_children_avg = 0
    children_avg = 0

    for num_children, costs in children_dict.items():
        if int(num_children) == 0:
            for cost in costs:
                no_children_avg += float(cost)
        
        else:
            for cost in costs:
                children_avg += float(cost)

    no_children_avg /= len(children_dict['0'])
    length = 0
    for i in range(1, len(children_dict)):
        length +=len(children_dict[str(i)])

    children_avg /= length

    return no_children_avg, children_avg

no_children_avg, children_avg = avg_cost_children(insurance_data_rows)
print(f'No children average: {no_children_avg:.2f} dollars')
print(f'With children average: {children_avg:.2f} dollars')

No children average: 12365.98 dollars
With children average: 13949.94 dollars


In [15]:
def average_bmi(dictionary):
    avg = 0

    for row in dictionary:
        avg += float(row['bmi'])

    avg /= len(dictionary)
    return avg

print(f'Average BMI: {average_bmi(insurance_data_rows):.2f}')

Average BMI: 30.66


In [16]:
def highest_lowest_bmi_cost(dictionary):
    highest_bmi = -1
    highest_bmi_cost = 0
    lowest_bmi = float("inf")
    lowest_bmi_cost = 0

    for row in dictionary:
        bmi = float(row['bmi'])
        if bmi > highest_bmi:
            highest_bmi = bmi
            highest_bmi_cost = float(row['charges'])

        if bmi < lowest_bmi:
            lowest_bmi = bmi
            lowest_bmi_cost = float(row['charges'])

    return {
        'Highest BMI': highest_bmi,
        'Highest BMI cost': highest_bmi_cost,
        'Lowest BMI': lowest_bmi,
        'Lowest BMI cost': lowest_bmi_cost
        }

print(highest_lowest_bmi_cost(insurance_data_rows))

{'Highest BMI': 53.13, 'Highest BMI cost': 1163.4627, 'Lowest BMI': 15.96, 'Lowest BMI cost': 1694.7964}


In [21]:
def summary(dictionary):
    avg_age = find_average_age(dictionary)
    region, num_patients = find_majority_region(dictionary)
    smoking_diff = diff_cost_smoking(dictionary)
    by_children_avg, children_diff = calculate_stats_children(dictionary)
    no_children_avg, children_avg = avg_cost_children(dictionary)
    bmi_data = highest_lowest_bmi_cost(dictionary)
    return {
                'Average age': avg_age,
                'Region with max patients': region,
                'Max number of patients': num_patients,
                'Difference on smoking': smoking_diff,
                'Children average': {num_children: avg for num_children, avg in by_children_avg.items()},
                'Children impact': {
                    'Diff 1 - 0': children_diff['avg(1) - avg(0)'],
                    'Diff 2 - 1': children_diff['avg(2) - avg(1)'],
                    'Diff 3 - 2': children_diff['avg(3) - avg(2)'],
                    'Diff 4 - 3': children_diff['avg(4) - avg(3)'],
                    'Diff 2 - 0': children_diff['avg(2) - avg(0)'],
                    'Diff 4 - 2': children_diff['avg(4) - avg(2)'],
                    'Diff 4 - 0': children_diff['avg(4) - avg(0)'],
                },
                'Children x no children': {'Children': children_avg, 'No-children': no_children_avg},
                'BMI': bmi_data
        }

In [22]:
summary(insurance_data_rows)

{'Average age': 39.20702541106129,
 'Region with max patients': 364,
 'Max number of patients': 'southeast',
 'Difference on smoking': 23615.96353367665,
 'Children average': {'0': 12365.975601635882,
  '1': 12731.171831635793,
  '3': 15355.31836681528,
  '2': 15073.563733958328,
  '5': 8786.035247222222,
  '4': 13850.656311199999},
 'Children impact': {'Diff 1 - 0': 365.19622999991043,
  'Diff 2 - 1': 2342.3919023225353,
  'Diff 3 - 2': 281.7546328569515,
  'Diff 4 - 3': -1504.6620556152811,
  'Diff 2 - 0': 2707.5881323224457,
  'Diff 4 - 2': -1222.9074227583296,
  'Diff 4 - 0': 1484.6807095641161},
 'Children x no children': {'Children': 13949.941093481697,
  'No-children': 12365.975601635882},
 'BMI': {'Highest BMI': 53.13,
  'Highest BMI cost': 1163.4627,
  'Lowest BMI': 15.96,
  'Lowest BMI cost': 1694.7964}}