# U.S. Medical Insurance Costs

In this project I will analyse data on patients and their insurance costs. The purpose of this analysis is to identify and illuminate the relationships between different health factors and the cost of insurance. The factors considered are: age, sex, BMI, number of children, smoking habits and region of residence.

In [31]:
import csv

# One list per CSV column; index i in every list refers to the same patient.
age_lst = []
bmi_lst = []
sex_lst = []
children_lst = []
smoker_lst = []
charges_lst = []
region_lst = []
# NOTE: despite its name this is a list holding one dict per CSV row
# (values are the raw strings produced by csv.DictReader).
full_dict = []

# Tallies of how many rows fall into each category.
sex_dict = {"female": 0, "male": 0}
children_dict = {0: 0, 1: 0, 2: 0,  3: 0, 4: 0, 5: 0}
smoker_dict = {"Yes": 0, "No": 0}
region_dict = {"Northwest": 0, "Southeast": 0, "Southwest": 0, "Northeast": 0}

with open('insurance.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        children = int(row["children"])

        age_lst.append(int(row['age']))
        bmi_lst.append(float(row['bmi']))
        # Store charges as numbers, like the other numeric columns, so the
        # list can be summed/averaged directly instead of holding strings.
        charges_lst.append(float(row['charges']))
        sex_lst.append(row["sex"])
        children_lst.append(children)
        smoker_lst.append(row["smoker"])
        region_lst.append(row["region"])

        sex_dict[row["sex"]] += 1
        children_dict[children] += 1
        # The tally keys are capitalised, so normalise the CSV value to match.
        smoker_dict[row["smoker"].capitalize()] += 1
        region_dict[row["region"].capitalize()] += 1

        full_dict.append(row)

[0, 1, 3, 0, 0, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 2, 3, 0, 2, 1, 2, 0, 0, 5, 0, 1, 0, 3, 0, 1, 0, 0, 2, 1, 2, 1, 0, 2, 0, 0, 1, 0, 2, 1, 0, 3, 2, 2, 2, 1, 2, 3, 4, 1, 1, 0, 0, 2, 1, 0, 3, 0, 5, 3, 1, 2, 0, 1, 0, 0, 0, 1, 0, 1, 4, 2, 2, 0, 0, 0, 0, 0, 1, 3, 2, 2, 1, 3, 0, 0, 0, 0, 0, 0, 3, 1, 1, 1, 2, 0, 0, 1, 2, 0, 0, 3, 0, 0, 1, 0, 2, 2, 0, 0, 1, 3, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 3, 0, 2, 1, 2, 2, 3, 3, 3, 1, 1, 1, 1, 0, 3, 0, 1, 0, 0, 0, 0, 3, 0, 0, 1, 2, 0, 4, 5, 3, 1, 3, 0, 0, 0, 1, 0, 0, 2, 1, 2, 3, 0, 0, 3, 0, 2, 3, 2, 3, 1, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 1, 4, 2, 1, 2, 2, 0, 0, 1, 0, 1, 0, 3, 0, 0, 3, 0, 0, 1, 2, 1, 3, 0, 1, 2, 2, 0, 2, 0, 1, 2, 1, 1, 3, 0, 0, 0, 0, 1, 1, 2, 2, 2, 3, 0, 3, 0, 0, 4, 0, 0, 1, 3, 0, 3, 3, 1, 3, 1, 1, 1, 2, 2, 1, 0, 2, 2, 0, 2, 1, 1, 3, 1, 1, 0, 1, 2, 0, 1, 3, 0, 1, 2, 0, 3, 0, 0, 1, 3, 1, 3, 3, 2, 2, 2, 2, 2, 1, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 1, 4, 0, 0, 0, 1, 1, 2, 1, 0, 1, 0, 0, 

The CSV file is read row by row into one list per factor, alongside tally dictionaries for the categorical factors. The first function defined creates a dictionary of BMI weight groups and their associated average insurance costs. This is an example of how we could analyse the relationships between factors - the factor of interest in this case being BMI and its influence on insurance cost.

In [7]:
def bmi_price_dictionary_maker(records=None):
    """Return the average insurance charge for each BMI weight group.

    Args:
        records: Iterable of row mappings with ``"bmi"`` and ``"charges"``
            entries that ``float()`` can convert. When omitted, the
            module-level ``full_dict`` is used.

    Returns:
        A dict with the keys ``"Underweight"``, ``"Healthy Weight"``,
        ``"Overweight"`` and ``"Obese"``. Each value is the mean charge of
        the group rounded to two decimals, or ``None`` when the group has no
        rows (previously this case raised ``ZeroDivisionError``).
    """
    if records is None:
        records = full_dict

    # label -> [number of patients, summed charges]
    totals = {
        "Underweight": [0, 0.0],
        "Healthy Weight": [0, 0.0],
        "Overweight": [0, 0.0],
        "Obese": [0, 0.0],
    }

    for item in records:
        bmi = float(item["bmi"])  # convert once instead of once per branch
        # NOTE(review): the CDC bands are usually stated as < 25 and < 30;
        # the 24.9 / 29.9 cut-offs below place e.g. 24.95 in "Overweight".
        # They are kept unchanged so previously reported figures still hold.
        if bmi < 18.5:
            label = "Underweight"
        elif bmi < 24.9:
            label = "Healthy Weight"
        elif bmi < 29.9:
            label = "Overweight"
        else:
            label = "Obese"

        group = totals[label]
        group[0] += 1
        group[1] += float(item["charges"])

    return {
        label: round(total / count, 2) if count else None
        for label, (count, total) in totals.items()
    }

# Show the average insurance charge computed for each BMI weight group.
print(bmi_price_dictionary_maker())
        

{'Underweight': 8852.2, 'Healthy Weight': 10379.5, 'Overweight': 11030.33, 'Obese': 15460.5}


Next, I will create a class to simplify the storage and access of the data points. This class will contain various methods which will give information on the average age, cost, bmi etc. 

In [35]:
class patientsInfo:
    """Column-wise holder for the insurance data with simple summary reports.

    Every constructor argument is stored unchanged on the instance. The
    analysis methods expect ``age``, ``bmi`` and ``children`` to be sequences
    of numbers and ``sex`` to be a list of ``"male"`` / ``"female"`` strings.
    """

    def __init__(self, age, sex, bmi, children, smoker, region, charges):
        self.age = age
        self.sex = sex
        self.bmi = bmi
        self.children = children
        self.smoker = smoker
        self.region = region
        self.charges = charges

    @staticmethod
    def _average(values):
        """Return the mean of *values* rounded to two decimals.

        Raises ZeroDivisionError when *values* is empty.
        """
        return round(sum(values) / len(values), 2)

    def age_analysis(self):
        """Print and return the average age."""
        average = self._average(self.age)
        print ("The average age of the people in this dataset is {}".format(average))
        return average

    def sex_analysis(self):
        """Print the male/female counts and return them as ``(males, females)``."""
        males_count = self.sex.count("male")
        females_count = self.sex.count("female")
        print("The number of males: {males}. The number of females: {females}".format(males=males_count, females=females_count))
        # Return the counts so this method is usable programmatically, like
        # the other analysis methods (it used to return None).
        return males_count, females_count

    def bmi_analysis(self):
        """Print the average BMI with its weight category; return the average."""
        average = self._average(self.bmi)
        # NOTE(review): the CDC bands are usually stated as < 25 and < 30;
        # the 24.9 / 29.9 cut-offs are kept unchanged from the original.
        if average < 18.5:
            health = "underweight"
        elif average < 24.9:
            health = "healthy"
        elif average < 29.9:
            health = "overweight"
        else:
            health = "obese"
        print("The average BMI of the people in this dataset is {}, this is considered {}.".format(average, health))    
        return average

    def children_analysis(self):
        """Print and return the average number of children."""
        average = self._average(self.children)
        print("The average number of children people in this dataset have is {}.".format(average))
        return average
        
    

To test this class I will input the previously created list variables.

In [36]:
# Build the analysis object from the per-column lists filled while reading insurance.csv.
my_info = patientsInfo(age_lst, sex_lst, bmi_lst, children_lst, smoker_lst, region_lst, charges_lst)

# Each call prints a one-line summary for its factor.
my_info.age_analysis()
my_info.sex_analysis()
my_info.bmi_analysis()
my_info.children_analysis()

The average age of the people in this dataset is 39.21
The number of males: 676. The number of females: 662
The average BMI of the people in this dataset is 30.66, this is considered obese.
The average number of children people in this dataset have is 1.09.


1.09