# U.S. Medical Insurance Costs

In [239]:
import csv

#create empty lists for the different sets of data
#(one list per CSV column; index i in every list comes from the same CSV row)
age = []
sex = []
bmi = []
children = []
smoker = []
region = []
charges = []

#access csv file with data
#newline='' is what the csv module documentation asks for when opening a file for csv.reader/DictReader,
#so that the csv module itself handles line endings (including newlines inside quoted fields)
with open('insurance.csv', newline='') as insurance_csv:
    #create DictReader object to make accessing data easier (each row becomes a dict keyed by the header names)
    insurance_dict  = csv.DictReader(insurance_csv)
    #iterate through the DictReader object to add the data to the empty lists we created
    #every value is appended exactly as read, i.e. as a string
    for dictionary in insurance_dict:
        age.append(dictionary['age'])
        sex.append(dictionary['sex'])
        bmi.append(dictionary['bmi'])
        children.append(dictionary['children'])
        smoker.append(dictionary['smoker'])
        region.append(dictionary['region'])
        charges.append(dictionary['charges'])
        
#some print() statements we used to ensure our data is looking how we expected
#print(age)
#print(sex)
#print(bmi)
#print(children)
#print(smoker)
#print(region)
#print(charges)

#it's important to remember that our data is now stored in lists of STRINGS, even the numeric data

In [240]:
#This function takes a list and returns the mean of the values in the list.
#The function will raise a ValueError if an element is a non-numeric string (e.g. 'abc'),
#or a TypeError if an element cannot be passed to float() at all (e.g. None).
#It would be useful to add a condition that causes it to return None should there be non-numbers in the list.

def average_value(list):
    """Return the arithmetic mean of the entries of ``list``.

    Every entry is converted with float() before it is added, so numeric
    strings such as '19' or '27.9' are accepted alongside real numbers.

    Raises:
        ValueError / TypeError: propagated from float() for an entry that
            cannot be converted.
        ZeroDivisionError: if ``list`` is empty.
    """
    running_total = 0
    #convert lazily and accumulate left to right, one entry at a time
    for number in map(float, list):
        running_total += number
    #mean = total of the entries / how many entries there are
    return running_total/len(list)

#testing our function on the array of ages
#print(average_value(age))

In [241]:
#This function takes a list of the strings 'female' and 'male' and returns the ratio of males to females.
#Obviously only useful for our sex list.

#This function is not commented as much as the average_value() function because I have decided that these functions
#are not complicated enough to warrant as detailed an explanation. 

def sex_breakdown(list):
    """Print and return the ratio of 'male' entries to 'female' entries.

    Entries that are neither the string 'female' nor the string 'male'
    are ignored.

    Parameters:
        list: iterable of strings, expected to contain 'female' and 'male'.

    Returns:
        float: number of 'male' entries divided by number of 'female' entries.

    Raises:
        ZeroDivisionError: if there is no 'female' entry, because the ratio
            is undefined in that case.
    """
    num_females = 0
    num_males = 0
    for person in list:
        if person=='female':
            num_females += 1
        elif person=='male':
            num_males += 1
    #fail with a message that names the real problem instead of a bare "division by zero"
    if num_females == 0:
        raise ZeroDivisionError("cannot compute the male-to-female ratio: the list contains no 'female' entries")
    ratio_males_to_females = num_males/num_females
    #this print statement explains what our ratio means
    print("For every female in the data set, there are {males} males.".format(males = ratio_males_to_females))
    return ratio_males_to_females

#compute the ratio for the whole sex column; the call itself also prints the explanatory sentence
breakdown_by_sex = sex_breakdown(sex)

#print(breakdown_by_sex)

#We find that there are slightly more males than females in the data set,
#but the discrepancy is not large enough to cause concern.

For every female in the data set, there are 1.0211480362537764 males.


In [242]:
#We want to do analysis based on region, but we don't know how many regions there are, and don't feel like manually 
#checking, so this function will make and return an array with every element in a given array listed once.

def list_elements(list):
    """Return the distinct entries of ``list`` in order of first appearance.

    Membership is tested against the result list with ``in``, so entries
    only need to support equality comparison (they do not have to be hashable).
    """
    seen_in_order = []
    for candidate in list:
        #skip anything that has already been collected
        if candidate in seen_in_order:
            continue
        seen_in_order.append(candidate)
    return seen_in_order

#call list_elements(), the helper defined just above; no function named list_regions is defined in the
#preceding cells, so that name raises NameError when the notebook is run on a fresh kernel
list_of_regions = list_elements(region)

print(list_of_regions)

#We find that there are four different regions.

['southwest', 'southeast', 'northwest', 'northeast']


In [243]:
#This function will sort the insurance costs (elements of the charges list) into a dictionary of lists
#where the keys are certain categories and the values are lists of the insurance costs for people in that category.

def sort_costs(category_list, costs):
    """Group ``costs`` by the category found at the same index in ``category_list``.

    Parameters:
        category_list: sequence of hashable category labels, one per record.
        costs: sequence of cost values, parallel to ``category_list``.

    Returns:
        dict: maps each distinct category (in order of first appearance) to the
        list of costs whose record carries that category, in original order.

    Raises:
        ValueError: if the two sequences do not have the same length, since the
            index-by-index pairing would then be meaningless.
    """
    if len(category_list) != len(costs):
        raise ValueError(
            "category_list and costs must have the same length (got {n_categories} and {n_costs})".format(
                n_categories=len(category_list), n_costs=len(costs)))
    #create an empty dictionary that we will fill and later return
    cost_dictionary = {}
    #One pass over the paired data: the first time a category is seen it gets an empty list,
    #and every cost is appended to the list of its own category. This replaces rescanning
    #the whole cost list once per category.
    for category, cost in zip(category_list, costs):
        cost_dictionary.setdefault(category, []).append(cost)
    return cost_dictionary

#group every charge under the region of the same row: {region name: [charges of that region]}
sorted_by_region = sort_costs(region, charges)

#print(sorted_by_region.keys())
#print(sorted_by_region['southeast'])


#This function will be useful in our analysis as it can be used to sort our data into categories.
#From there, we can find average costs for people in certain categories, and make insights about 
#how being in different categories can affect your insurance cost. 

In [244]:
#This function takes a dictionary created by the previous function and returns the same dictionary, 
#but instead of lists of insurance costs, the values are the average insurance costs.

def sorted_averages(sorted_costs_dict):
    """Return a new dict with the same keys, each mapped to the mean of its list.

    ``sorted_costs_dict`` is expected to map a category to a list of costs
    (the shape produced by sort_costs); the mean is computed by average_value().
    The input dictionary is not modified.
    """
    return {
        category: average_value(costs_in_category)
        for category, costs_in_category in sorted_costs_dict.items()
    }

#It's important to remember that sorted means sorted into categories, not alphabetically or numerically.
#Perhaps the names for this function and the sort_costs function could have been chosen better. 

sorted_by_region_averages = sorted_averages(sorted_by_region)

#The loop variable is deliberately not called `region`: a loop at the top level of a cell binds a global,
#so that name would replace the `region` list loaded from the CSV with a single string and break any
#later sort_costs(region, ...) or analyze(region, ...) call.
for region_name in sorted_by_region_averages.keys():
    print(region_name+": "+str(sorted_by_region_averages[region_name]))

print("")

print(sorted_by_region_averages.keys())
print(sorted_by_region_averages.values())


southwest: 12346.93737729231
southeast: 14735.411437609895
northwest: 12417.575373969228
northeast: 13406.3845163858

dict_keys(['southwest', 'southeast', 'northwest', 'northeast'])
dict_values([12346.93737729231, 14735.411437609895, 12417.575373969228, 13406.3845163858])


In [245]:
#some functions to help us make sense of the data

#These functions return the maximum or minimum value in a dictionary 
#(we plan on using a dictionary returned by the sorted_averages function) and that value's key.

#how each function works should be pretty self-explanatory...

def find_minimum(sorted_averages_dict):
    """Return ``(key, value)`` for the smallest value in ``sorted_averages_dict``.

    When several keys share the smallest value, the first one in iteration
    order is returned. For an empty dictionary the result is ``("", inf)``.
    """
    lowest_key, lowest_value = "", float('inf')
    for candidate_key, candidate_value in sorted_averages_dict.items():
        #strict comparison keeps the earliest key on ties
        if candidate_value < lowest_value:
            lowest_key, lowest_value = candidate_key, candidate_value
    return (lowest_key, lowest_value)

def find_maximum(sorted_averages_dict):
    """Return ``(key, value)`` for the largest value in ``sorted_averages_dict``.

    The search starts from negative infinity (mirroring find_minimum, which
    starts from positive infinity), so dictionaries whose values are all zero
    or negative are handled correctly. When several keys share the largest
    value, the first one in iteration order is returned. For an empty
    dictionary the result is ``("", -inf)``.
    """
    #starting at 0 would make every value <= 0 lose the comparison below
    maximum = float('-inf')
    maximum_key = ""
    for key in sorted_averages_dict:
        if sorted_averages_dict[key]>maximum:
            maximum = sorted_averages_dict[key]
            maximum_key = key
    return (maximum_key, maximum)

#show the regions with the lowest and highest average charge, each as a (key, value) tuple
print(find_minimum(sorted_by_region_averages))
print(find_maximum(sorted_by_region_averages))

('southwest', 12346.93737729231)
('southeast', 14735.411437609895)


In [246]:
#This function will take a dictionary returned by the sorted_averages function
#and present the data in a fashion that is easy to understand.


def compile_data(sorted_averages_dict):
    """Print the categories with the lowest and highest value, one labelled line each.

    ``sorted_averages_dict`` maps a category to a number (the shape returned by
    sorted_averages). Nothing is returned; the result is only printed.
    """
    lowest_category, lowest_average = find_minimum(sorted_averages_dict)
    highest_category, highest_average = find_maximum(sorted_averages_dict)
    print("Minimum Average Cost: {category}: {minimum}".format(category=lowest_category, minimum=lowest_average))
    print("Maximum Average Cost: {category}: {maximum}".format(category=highest_category, maximum=highest_average))

#print the lowest and highest regional averages in labelled form
compile_data(sorted_by_region_averages)


Minimum Average Cost: southwest: 12346.93737729231
Maximum Average Cost: southeast: 14735.411437609895


In [247]:
#This function will complete cost-based analysis on whatever other piece of data you choose. 
#For example: minimum/maximum cost based on region

#The first parameter should be the name of one of the lists of data, such as age or children. 
#The second parameter should be the same name, but as a string, such as "Age" or "Children". (capitalization optional)
def analyze(factor_to_analyze, factor_as_string):
    """Print a min/max average-cost summary of the global ``charges`` grouped by one factor.

    Parameters:
        factor_to_analyze: one of the data lists (for example ``age`` or ``children``),
            parallel to ``charges``.
        factor_as_string: label printed as the heading of the summary.

    Returns:
        None. The summary is printed.
    """
    print("{factor}:".format(factor=factor_as_string))
    #group the charges by factor value, average each group, then report the extremes
    costs_by_category = sort_costs(factor_to_analyze, charges)
    average_by_category = sorted_averages(costs_by_category)
    compile_data(average_by_category)

#run the grouped min/max analysis once per factor; the group keys are the raw strings read from the CSV,
#so every distinct string (for example every distinct BMI value) forms its own group
analyze(age, "Age")
analyze(smoker, "Smoker")
analyze(bmi, "BMI")
analyze(children, "Children")

#analyze(region, "Regions")


#Some of the less broad categories such as BMI are very specific and their minimum/maximum costs will not be very insightful.


Age:
Minimum Average Cost: 21: 4730.464329642857
Maximum Average Cost: 64: 23275.530837272723
Smoker:
Minimum Average Cost: no: 8434.268297856199
Maximum Average Cost: yes: 32050.23183153285
BMI:
Minimum Average Cost: 43.01: 1149.3959
Maximum Average Cost: 47.41: 63770.42801
Children:
Minimum Average Cost: 5: 8786.035247222222
Maximum Average Cost: 3: 15355.31836681528
