# U.S. Medical Insurance Costs

By Marco Polo

# Imports, CSV loading, and analysis helper functions

In [1]:
import csv
import collections

In [2]:
def open_csv(csv_name):
    """Read a CSV file and return its data rows as a list.

    The first line of the file is used as the header, so each returned row
    is a mapping from column name to the (string) cell value.

    csv_name -- path of the CSV file to read.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(csv_name, newline='') as handle:
        rows = [row for row in csv.DictReader(handle)]
    return rows

In [3]:
def get_list(key, OD_list):
    """Collect the value stored under `key` from every row of `OD_list`.

    key     -- column name to look up in each row.
    OD_list -- iterable of row mappings (e.g. the result of open_csv).

    Returns a list with one entry per row, in row order. A row without
    `key` raises KeyError.
    """
    return [row[key] for row in OD_list]

In [4]:
def binary_convert(lst):
    """Encode a list of labels as 0/1 flags.

    'yes' and 'male' become 1; every other value (including 'no' and
    'female') becomes 0. The result has the same length and order as `lst`.
    """
    return [1 if label in ('yes', 'male') else 0 for label in lst]

In [5]:
def float_convert(lst):
    """Return a new list with every item of `lst` converted via float().

    Items that float() cannot parse raise ValueError (or TypeError for
    unsupported types), exactly as float() itself does.
    """
    return [float(value) for value in lst]

In [6]:
def get_mean(lst):
    """Return the arithmetic mean of the numbers in `lst`.

    An empty list raises ZeroDivisionError.
    """
    total = sum(lst)
    count = len(lst)
    return total / count

In [7]:
def get_median(lst):
    """Return the median of the numbers in `lst`.

    For an odd number of items this is the middle item of the sorted data;
    for an even number it is the mean of the two middle items. `lst` itself
    is not modified.

    Raises ValueError if `lst` is empty.
    """
    ordered = sorted(lst)
    count = len(ordered)
    if count == 0:
        raise ValueError('get_median() requires at least one value')
    middle = count // 2
    if count % 2:
        # Odd length: index count // 2 is the single middle element.
        return ordered[middle]
    # Even length: average the two elements straddling the centre.
    return (ordered[middle - 1] + ordered[middle]) / 2

In [8]:
def unique(lst):
    """Count how often each distinct value occurs in `lst`.

    Returns a collections.OrderedDict mapping each distinct value to its
    number of occurrences, with keys in ascending sorted order. An empty
    `lst` yields an empty OrderedDict.
    """
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    # Sort once, after counting, rather than on every iteration.
    return collections.OrderedDict(sorted(counts.items()))

# Opening CSV file

In [9]:
# Load insurance.csv (path relative to the working directory) as a list with one mapping per CSV row.
medical_insurance_list_OD = open_csv('insurance.csv')

In [10]:
# Split the loaded rows into one list per CSV column, in row order.
# No conversion is applied here: each list holds whatever open_csv produced for that column.
(
    age_list,
    sex_list,
    bmi_list,
    children_list,
    smoker_list,
    region_list,
    charges_list,
) = (
    get_list(column, medical_insurance_list_OD)
    for column in ('age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges')
)

# Patient Info class

In [11]:
class PatientInfo:
    """Converted insurance columns plus simple summary statistics.

    The constructor converts the raw column lists (age, bmi, children and
    charges via float_convert; sex and smoker via binary_convert; region
    kept as given) and builds `self.dict`, which maps 'record 1',
    'record 2', ... to a per-record dict with the keys 'Age', 'Sex', 'BMI',
    'Number of children', 'Smoker', 'Region' and 'Charges'.

    The reporting methods cover: record count, counts per sex, smoker and
    non-smoker counts, mean / median / max-min-range of age and BMI,
    occurrence counts of regions, ages and numbers of children, and mean /
    median charges. Most of them print a sentence and also return the value.
    """

    def __init__(self, age_list, sex_list, bmi_list, children_list, smoker_list, region_list, charges_list):
        self.age_list = float_convert(age_list)
        self.sex_list = binary_convert(sex_list)
        self.bmi_list = float_convert(bmi_list)
        self.children_list = float_convert(children_list)
        self.smoker_list = binary_convert(smoker_list)
        self.region_list = region_list
        self.charges_list = float_convert(charges_list)
        self.dict = {}
        # Records are numbered from 1; the age list determines how many there are.
        for index, age in enumerate(self.age_list):
            record = {
                'Age': age,
                'Sex': self.sex_list[index],
                'BMI': self.bmi_list[index],
                'Number of children': self.children_list[index],
                'Smoker': self.smoker_list[index],
                'Region': self.region_list[index],
                'Charges': self.charges_list[index],
            }
            self.dict['record ' + str(index + 1)] = record

    def num_records(self):
        """Print and return the number of records."""
        total = len(self.age_list)
        print('There are {num_records} of records available'.format(num_records = total))
        return total

    def num_sexes(self):
        """Print and return (number of males, number of females)."""
        males = sum(self.sex_list)
        females = len(self.sex_list) - males
        print('There are {num_male} males'.format(num_male = males))
        print('There are {num_female} females'.format(num_female = females))
        return males, females

    def num_smoker(self):
        """Print and return the number of smokers."""
        smokers = sum(self.smoker_list)
        print('There are {num_smoker} smokers'.format(num_smoker = smokers))
        return smokers

    def num_non_smoker(self):
        """Print and return the number of non-smokers."""
        non_smokers = len(self.smoker_list) - sum(self.smoker_list)
        print('There are {num_non_smoker} non-smokers'.format(num_non_smoker = non_smokers))
        return non_smokers

    def mean_age(self):
        """Print and return the mean age."""
        average = get_mean(self.age_list)
        print('The mean age is {mean_age} year old'.format(mean_age = average))
        return average

    def median_age(self):
        """Print and return the median age as computed by get_median."""
        middle = get_median(self.age_list)
        print('The median age is {median_age} year old'.format(median_age = middle))
        return middle

    def max_min_range_age(self):
        """Print and return (range, maximum, minimum) of the ages."""
        oldest = max(self.age_list)
        youngest = min(self.age_list)
        spread = oldest - youngest
        print('The range of ages is {range_age} with the maximum age being {max_age} and minimum age being {min_age}'.format(range_age = spread, max_age = oldest, min_age = youngest))
        return spread, oldest, youngest

    def mean_bmi(self):
        """Print and return the mean BMI."""
        average = get_mean(self.bmi_list)
        print('The mean bmi is {mean_bmi}'.format(mean_bmi = average))
        return average

    def median_bmi(self):
        """Print and return the median BMI as computed by get_median."""
        middle = get_median(self.bmi_list)
        print('The median bmi is {median_bmi}'.format(median_bmi = middle))
        return middle

    def max_min_range_bmi(self):
        """Print and return (range, maximum, minimum) of the BMI values."""
        highest = max(self.bmi_list)
        lowest = min(self.bmi_list)
        spread = highest - lowest
        print('The range of bmi is {range_bmi} with the maximum bmi being {max_bmi} and minimum bmi being {min_bmi}'.format(range_bmi = spread, max_bmi = highest, min_bmi = lowest))
        return spread, highest, lowest

    def regions(self):
        """Return the occurrence count of each region (see unique)."""
        return unique(self.region_list)

    def num_age(self):
        """Return the occurrence count of each age (see unique)."""
        return unique(self.age_list)

    def num_children(self):
        """Return the occurrence count of each number of children (see unique)."""
        return unique(self.children_list)

    def mean_charges(self):
        """Print and return the mean charge."""
        average = get_mean(self.charges_list)
        print('The mean charge is', average)
        return average

    def median_charges(self):
        """Print and return the median charge as computed by get_median."""
        middle = get_median(self.charges_list)
        print('The median charge is', middle)
        return middle
        

In [12]:
# Build the shared PatientInfo instance from the raw column lists; its .dict is the default input of separation().
patient_info = PatientInfo(age_list, sex_list, bmi_list, children_list, smoker_list, region_list, charges_list)

# Separation methods

In [13]:
def separate_cont_group(dictionary, key, minimum, maximum):
    """Keep the records whose `key` value lies in the half-open range [minimum, maximum).

    Intended for continuous fields: age, BMI, number of children, charges.

    dictionary -- mapping of record name -> record dict.
    key        -- field of each record to test.
    minimum    -- inclusive lower bound.
    maximum    -- exclusive upper bound.

    Returns a new dict; the record dicts themselves are shared, not copied.
    """
    return {
        name: record
        for name, record in dictionary.items()
        if minimum <= record[key] < maximum
    }

In [14]:
def separate_binary_group(dictionary, key):
    """Split records into two groups by the truthiness of their `key` value.

    Intended for two-valued fields, i.e. sex and smoker.

    Returns a tuple (falsy_group, truthy_group): the first dict holds the
    records whose `key` value is falsy (0), the second those whose value is
    truthy (1). Record dicts are shared, not copied.
    """
    falsy_group, truthy_group = {}, {}
    for name, record in dictionary.items():
        target = truthy_group if record[key] else falsy_group
        target[name] = record
    return falsy_group, truthy_group

In [15]:
def separate_by_keyword(dictionary, key, value_lst):
    """Keep the records whose `key` value is contained in `value_lst`.

    Used for picking out regions.

    dictionary -- mapping of record name -> record dict.
    key        -- field of each record to test.
    value_lst  -- container of accepted values (tested with `in`).

    Returns a new dict; the record dicts themselves are shared, not copied.
    """
    return {
        name: record
        for name, record in dictionary.items()
        if record[key] in value_lst
    }

# Flexible separation function

In [16]:
def separation(
    dictionary = patient_info.dict,
    min_age = 0,
    max_age = 100,
    min_bmi = 0,
    max_bmi = 100,
    sex = "No separation",
    smoker = "No separation",
    min_children = 0,
    max_children = 10,
    region = ('northwest', 'northeast', 'southwest', 'southeast'),
    min_charges = 0,
    max_charges = 10000000000,
):
    """Filter patient records by any combination of criteria.

    Filters are applied in this order: age, BMI, sex, smoker, number of
    children, region, charges. Every numeric range is half-open: a record
    is kept when min <= value < max.

    dictionary   -- mapping of record name -> record dict. The default is
                    patient_info.dict as bound when this function was
                    defined.
    min_age, max_age           -- bounds on 'Age'.
    min_bmi, max_bmi           -- bounds on 'BMI'.
    sex          -- 'No separation', 'Male' or 'Female' (case-sensitive).
    smoker       -- 'No separation', 'Smoker' or 'Non-smoker'
                    (case-sensitive, with a hyphen).
    min_children, max_children -- bounds on 'Number of children'.
    region       -- container of accepted 'Region' values.
    min_charges, max_charges   -- bounds on 'Charges'.

    Returns a new dict with the matching records (record dicts are shared
    with `dictionary`, not copied).

    Raises ValueError if `sex` or `smoker` is not one of the accepted
    strings.
    """
    # Reject unknown options up front; otherwise a typo would only surface
    # later as an unbound local variable.
    if sex not in ('No separation', 'Male', 'Female'):
        raise ValueError(
            "sex must be 'No separation', 'Male' or 'Female', got {!r}".format(sex)
        )
    if smoker not in ('No separation', 'Smoker', 'Non-smoker'):
        raise ValueError(
            "smoker must be 'No separation', 'Smoker' or 'Non-smoker', got {!r}".format(smoker)
        )

    selected = separate_cont_group(dictionary, 'Age', min_age, max_age)
    selected = separate_cont_group(selected, 'BMI', min_bmi, max_bmi)

    if sex != 'No separation':
        # separate_binary_group returns (value 0, value 1); 'Sex' is 1 for male.
        female_group, male_group = separate_binary_group(selected, 'Sex')
        selected = male_group if sex == 'Male' else female_group

    if smoker != 'No separation':
        non_smoker_group, smoker_group = separate_binary_group(selected, 'Smoker')
        selected = smoker_group if smoker == 'Smoker' else non_smoker_group

    selected = separate_cont_group(selected, 'Number of children', min_children, max_children)
    selected = separate_by_keyword(selected, 'Region', region)
    return separate_cont_group(selected, 'Charges', min_charges, max_charges)
    
    
    

# Examples of function use

In [17]:
# Upper bounds are exclusive: min_age = 18, max_age = 25 selects ages 18 up to, but not including, 25.
age_18_to_25 = separation(min_age = 18, max_age = 25)
# NOTE(review): max_age = 64 excludes age 64 itself; pass max_age = 65 if 64-year-olds should be included.
age_50_to_64_smokers = separation(min_age = 50, max_age = 64, smoker = 'Smoker')
# The option strings are matched exactly: 'Non-smoker' is spelled with a hyphen.
age_25_to_40_male_non_smokers = separation(min_age = 25, max_age = 40, smoker = 'Non-smoker', sex = 'Male')
# To select a single age, set min_age to that age and max_age to one above it.
# The same min/max pattern applies to 'Number of children', 'BMI' and 'Charges'.
age_19 = separation(min_age = 19, max_age = 20)

In [18]:
# Number of records with 19 <= Age < 20.
print(len(age_19))

68


In [37]:
# Combined filter: male non-smokers in the northwest with 18 <= BMI < 24 and no children
# (min_children = 0, max_children = 1 keeps only the value 0, since the upper bound is exclusive).
var_male_non_smoker_0_children_bmi_18_to_24_northwest = separation(smoker = 'Non-smoker', sex = 'Male', min_bmi = 18, max_bmi = 24, min_children = 0, max_children = 1, region = ['northwest'])

In [38]:
# Bare expression as the last line of a notebook cell: displays the filtered records.
var_male_non_smoker_0_children_bmi_18_to_24_northwest

{'record 4': {'Age': 33.0,
  'Sex': 1,
  'BMI': 22.705,
  'Number of children': 0.0,
  'Smoker': 0,
  'Region': 'northwest',
  'Charges': 21984.47061},
 'record 36': {'Age': 19.0,
  'Sex': 1,
  'BMI': 20.425,
  'Number of children': 0.0,
  'Smoker': 0,
  'Region': 'northwest',
  'Charges': 1625.43375},
 'record 396': {'Age': 46.0,
  'Sex': 1,
  'BMI': 19.855,
  'Number of children': 0.0,
  'Smoker': 0,
  'Region': 'northwest',
  'Charges': 7526.70645},
 'record 676': {'Age': 45.0,
  'Sex': 1,
  'BMI': 21.375,
  'Number of children': 0.0,
  'Smoker': 0,
  'Region': 'northwest',
  'Charges': 7222.78625},
 'record 694': {'Age': 24.0,
  'Sex': 1,
  'BMI': 23.655,
  'Number of children': 0.0,
  'Smoker': 0,
  'Region': 'northwest',
  'Charges': 2352.96845},
 'record 748': {'Age': 19.0,
  'Sex': 1,
  'BMI': 21.755,
  'Number of children': 0.0,
  'Smoker': 0,
  'Region': 'northwest',
  'Charges': 1627.28245},
 'record 944': {'Age': 19.0,
  'Sex': 1,
  'BMI': 22.61,
  'Number of children': 0.0