# U.S. Medical Insurance Costs

In this project, a **CSV** file with medical insurance costs will be investigated using Python fundamentals. The goal of this project is to analyze various attributes within **insurance.csv** to learn more about the patient information in the file.

This goal is achieved using the following methods:

* find average age of the patients 
* check variability of the age data using interquartile range and standard deviation methods
* return the number of males vs. females counted in the dataset 
* find geographical location of the patients 
* calculate the insurance costs for each region 
* analyze how much insurance costs of smokers differ from non-smokers (Done)
* calculate average age of patients who have children and those who do not (Done)






In [2]:
from google.colab import files
# Upload insurance.csv into the Colab session (google.colab-only API); the
# result is looked up by filename when the CSV is parsed.
uploaded_csv = files.upload() 

Saving insurance.csv to insurance.csv


In [3]:
import pandas as pd
import io
# Wrap the uploaded file content in a file-like object so pandas can parse it as CSV.
insurance_data = pd.read_csv(io.BytesIO(uploaded_csv['insurance.csv']))
# Bare last expression: the notebook renders the DataFrame as a table.
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


# Store data  



In [4]:
# Pull every column out of the DataFrame as a plain Python list so the rest of
# the analysis can work with built-in types only.  The names on the left line
# up, in order, with the column names on the right.
(ages, sexes, bmis, num_children,
 smoker_statuses, regions, insurance_charges) = (
    insurance_data[column].tolist()
    for column in ('age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges')
)

# The mean value



In [5]:
#Find the average age 
def find_mean(lst):
  """Return the arithmetic mean of ``lst`` with the fractional part dropped.

  Args:
    lst: Non-empty sequence of numbers.

  Returns:
    The mean truncated toward zero, as an ``int``.

  Raises:
    ZeroDivisionError: If ``lst`` is empty.
  """
  total, count = sum(lst), len(lst)
  return int(total / count)

# Mean age of all patients; find_mean drops the fractional part.
average_age = find_mean(ages)
print(average_age)

39


# IQR


In [6]:
import math

def percentile(lst, p):
  """Return the p-th percentile of ``lst``.

  The rank is ``p / 100 * len(lst)``.  A fractional rank is rounded up and the
  value at that (1-based) position of the sorted data is returned.  A
  whole-number rank returns the average of the value at that position and the
  next one, so an even-sized list gets a true median rather than its lower
  middle element.

  Args:
    lst: Non-empty sequence of numbers.
    p: Percentile to compute, between 0 and 100 inclusive.

  Returns:
    A value taken from ``lst``, or the mean of two neighbouring values (a
    float) when the rank is a whole number.  ``p == 0`` gives the minimum and
    ``p == 100`` the maximum.

  Raises:
    ValueError: If ``lst`` is empty or ``p`` is outside 0..100.
  """
  data_asc = sorted(lst)
  n = len(data_asc)
  if n == 0:
    raise ValueError('percentile() requires at least one data point')
  if not 0 <= p <= 100:
    raise ValueError('p must be between 0 and 100')
  # Split p * n by 100 instead of testing p / 100 * n, so the whole-number
  # check is exact for integer p (no floating-point rounding involved).
  rank, remainder = divmod(p * n, 100)
  rank = int(rank)
  if remainder:
    # Fractional rank: rounding up gives 1-based position rank + 1,
    # which is 0-based index `rank`.
    return data_asc[rank]
  if rank == 0:
    return data_asc[0]
  if rank == n:
    return data_asc[-1]
  # Whole-number rank: average the rank-th and (rank + 1)-th values (1-based).
  return (data_asc[rank - 1] + data_asc[rank]) / 2

# 50th percentile of the ages, i.e. the median.
print(percentile(ages, 50))

39


In [7]:
def iqr(lst):
  """Return the interquartile range of ``lst``.

  Both quartiles come from ``percentile``: the 25th is computed first, then
  the 75th, and the result is their difference (Q3 - Q1).
  """
  lower_quartile, upper_quartile = (percentile(lst, q) for q in (25, 75))
  return upper_quartile - lower_quartile
# Interquartile range of the patient ages.
print(iqr(ages))

24


In [8]:
#Check if the functions work correctly with scipy
from scipy import stats
# Keep the SciPy result under its own name: assigning it to `percentile` would
# replace the percentile() function defined earlier and break iqr() on re-run.
scipy_median = int(stats.scoreatpercentile(ages, 50))
print(scipy_median)

39


In [9]:
# This value is SciPy's interquartile range, so name it accordingly; storing it
# in `percentile` would overwrite the percentile() function defined earlier.
scipy_iqr = int(stats.iqr(ages, interpolation='midpoint'))
print(scipy_iqr)

24


# Standard deviation


In [10]:
def st_deviation(ages):
  """Return the sample standard deviation of ``ages``, truncated to an int.

  The mean is computed from the values passed in, so the function gives the
  right answer for any list and does not depend on a module-level variable.

  Args:
    ages: Sequence of at least two numbers.

  Returns:
    The sample standard deviation with the fractional part dropped.

  Raises:
    ValueError: If fewer than two values are supplied (the n - 1 divisor
      would be zero or negative).
  """
  n = len(ages)
  if n < 2:
    raise ValueError('st_deviation() requires at least two data points')
  mean = sum(ages) / n
  sum_sqdevs = sum((value - mean) ** 2 for value in ages)
  # Dividing by n - 1 (Bessel's correction) gives the sample, not population,
  # standard deviation.
  s = math.sqrt(sum_sqdevs / (n - 1))
  return int(s)
# Standard deviation of the patient ages, truncated to a whole number.
print(st_deviation(ages))

14


In [11]:
#Check the result
import statistics
# Reference value from the standard library, truncated with int() so it can be
# compared with the whole-number result printed above.
standart_deviation=int(statistics.stdev(ages))
print(standart_deviation)

14


# Number of male and female patients

In [12]:
def analyze_sexes(sexes):
  """Count the 'female' and 'male' entries in ``sexes``.

  Entries equal to neither string are ignored.

  Returns:
    A sentence reporting both counts.
  """
  entries = list(sexes)
  female_total = sum(1 for entry in entries if entry == 'female')
  male_total = sum(1 for entry in entries if entry == 'male')
  return f'Number of female patients is {female_total}. Number of male patients is {male_total}.'
           
# Bare last expression so the notebook displays the summary string.
analyze_sexes(sexes)

'Number of female patients is 662. Number of male patients is 676.'

# Number of records for each region

In [13]:
reg_dict = dict()

def count_region_occurences(dictionary, region_values=None):
  """Store the number of records per region in ``dictionary`` and return it.

  Each count is computed once.  (The counts do not depend on a per-record
  loop, so none is needed.)

  Args:
    dictionary: Mapping to fill; it is modified in place.
    region_values: Region names to count.  Defaults to the module-level
      ``regions`` list when omitted.

  Returns:
    The same ``dictionary`` with the keys 'Northeast', 'Northwest',
    'Southeast' and 'Southwest' set to the number of matching lower-case
    entries (0 when a region does not occur).
  """
  if region_values is None:
    region_values = regions
  region_values = list(region_values)
  for label in ('Northeast', 'Northwest', 'Southeast', 'Southwest'):
    # Keys are capitalised for display; the data uses lower-case names.
    dictionary[label] = region_values.count(label.lower())
  return dictionary

# Fill reg_dict with the per-region record counts and display the result.
num_region = count_region_occurences(reg_dict)
num_region

{'Northeast': 324, 'Northwest': 325, 'Southeast': 364, 'Southwest': 325}

# Average insurance costs for each region

In [16]:
  ne = []
  nw = []
  se = []
  sw = []
def region_costs(insurance_data):
   for ind in insurance_data.index:
     if insurance_data['region'][ind] == 'northeast':
       ne.append(insurance_data['charges'][ind])
     if insurance_data['region'][ind] == 'northwest':
       nw.append(insurance_data['charges'][ind])
     if insurance_data['region'][ind] == 'southeast':
       se.append(insurance_data['charges'][ind])
     if insurance_data['region'][ind] == 'southwest':
       sw.append(insurance_data['charges'][ind])   
   return f'''Average insurance costs for Northeast region are : {int(find_mean(ne))} dollars. 
   Average insurance costs for Northwest region are : {int(find_mean(nw))} dollars. 
   Average insurance costs for Southeast region are : {int(find_mean(se))} dollars. 
   Average insurance costs for Southwest region are : {int(find_mean(sw))} dollars.'''

# Bare last expression so the notebook displays the per-region summary.
region_costs(insurance_data)


'Average insurance costs for Northeast region are : 13406 dollars. \n Average insurance costs for Northwest region are : 12417 dollars. \n Average insurance costs for Southeast region are : 14735 dollars. \n Average insurance costs for Southwest region are : 12346 dollars.'

# Smokers' insurance costs

In [22]:
smokers_charges = []
nonsmokers_charges = []

def smoke_influence(insurance_data):
  """Report how much more smokers are charged on average than non-smokers.

  The module-level lists ``smokers_charges`` and ``nonsmokers_charges`` are
  emptied and refilled on every call, so calling the function repeatedly does
  not pile up duplicate rows in them.

  Args:
    insurance_data: DataFrame with a 'smoker' column ('yes' / 'no') and a
      'charges' column.  Rows with any other smoker value are ignored.

  Returns:
    A sentence stating the difference between the two averages, each
    truncated to whole dollars by ``find_mean`` before subtracting.

  Raises:
    ZeroDivisionError: If either group has no rows (raised by ``find_mean``
      on an empty list).
  """
  charges_by_status = {'yes': smokers_charges, 'no': nonsmokers_charges}
  for charges in charges_by_status.values():
    charges.clear()
  # Walk the two columns in step instead of indexing the frame once per row.
  for status, charge in zip(insurance_data['smoker'], insurance_data['charges']):
    if status in charges_by_status:
      charges_by_status[status].append(charge)
  difference = find_mean(smokers_charges) - find_mean(nonsmokers_charges)
  return f"Smokers' insurance costs are on average {difference} dollars more."

# Bare last expression so the notebook displays the smoker comparison.
smoke_influence(insurance_data)

"Smokers' insurance costs are on average 23616 dollars more."

# Average age of patients with / without children

In [24]:
childless_age = []
parents_age = []

def at_least_one_child(insurance_data):
  """Compare the average age of clients with and without children.

  The module-level lists ``parents_age`` and ``childless_age`` are emptied and
  refilled on every call, so calling the function repeatedly does not pile up
  duplicate rows in them.

  Args:
    insurance_data: DataFrame with a 'children' column (number of children)
      and an 'age' column.

  Returns:
    Two sentences separated by a newline, giving each group's average age
    truncated to whole years by ``find_mean``.

  Raises:
    ZeroDivisionError: If either group has no rows (raised by ``find_mean``
      on an empty list).
  """
  parents_age.clear()
  childless_age.clear()
  # Walk the two columns in step instead of indexing the frame once per row.
  for children, age in zip(insurance_data['children'], insurance_data['age']):
    # Anything that is not "more than zero children" counts as childless.
    group = parents_age if children > 0 else childless_age
    group.append(age)
  # find_mean already returns a whole number, so no extra int() is needed.
  return (
    f'Average age of clients who have at least one child is : {find_mean(parents_age)} years. \n'
    f'  Average age of clients without children is : {find_mean(childless_age)} years.'
  )

# Bare last expression so the notebook displays the age comparison.
at_least_one_child(insurance_data)


'Average age of clients who have at least one child is : 39 years. \n  Average age of clients without children is : 38 years.'