###  Explore dataset on births in the U.S.

Working with the dataset on births in the U.S., compiled by FiveThirtyEight.


In [2]:
# Read the raw CSV text and split it into lines. The context manager
# closes the file handle as soon as the contents have been read.
with open('US_births_1994-2003_CDC_NCHS.csv', 'r') as births_file:
    data = births_file.read().split('\n')
# Preview the header line and the first few records.
print(data[0:10])

['year,month,date_of_month,day_of_week,births', '1994,1,1,6,8096', '1994,1,2,7,7772', '1994,1,3,1,10142', '1994,1,4,2,11248', '1994,1,5,3,11053', '1994,1,6,4,11406', '1994,1,7,5,11251', '1994,1,8,6,8653', '1994,1,9,7,7910']


In [2]:
# Create a function to read in string data from a csv file,
# convert each field to an integer, and return a list of lists
# of integers

def read_csv(file):
    """Read a comma-separated file of integers into a list of rows.

    The first line is treated as a header and discarded. Every remaining
    non-blank line is split on commas and each field is converted with
    ``int``.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A list with one entry per data line, each entry being a list of
        integers in the order the fields appear on that line.

    Raises:
        ValueError: If a non-blank line contains a field ``int`` cannot parse.
    """
    # The context manager guarantees the file handle is closed, even if
    # reading fails.
    with open(file, 'r') as csv_file:
        lines = csv_file.read().split('\n')[1:]

    final_list = []
    for line in lines:
        # A trailing newline (or a stray blank line) yields an empty string;
        # skip it instead of letting int('') raise ValueError.
        if not line.strip():
            continue
        final_list.append([int(field) for field in line.split(',')])
    return final_list

# Births recorded by the CDC from 1994 to 2003, parsed with read_csv.
cdc_list = read_csv('US_births_1994-2003_CDC_NCHS.csv')

# Preview the first ten parsed rows.
print(cdc_list[:10])

[[1994, 1, 1, 6, 8096], [1994, 1, 2, 7, 7772], [1994, 1, 3, 1, 10142], [1994, 1, 4, 2, 11248], [1994, 1, 5, 3, 11053], [1994, 1, 6, 4, 11406], [1994, 1, 7, 5, 11251], [1994, 1, 8, 6, 8653], [1994, 1, 9, 7, 7910], [1994, 1, 10, 1, 10498]]


In [2]:
# calculate the total number of births that occurred in
# each month, across all of the years in the dataset.

def month_births(blist):
    """Sum births per month over every row of ``blist``.

    Args:
        blist: Iterable of rows where index 1 holds the month and index 4
            holds the number of births.

    Returns:
        A dict mapping each month found in ``blist`` to its total births.
    """
    totals = {}
    for record in blist:
        month, births = record[1], record[4]
        if month not in totals:
            # First time this month is seen: start its running total.
            totals[month] = births
            continue
        totals[month] += births
    return totals

# cdc births dataset: births by month over 1994 to 2003
# (month_births groups on column index 1 and sums column index 4).
cdc_month_births = month_births(cdc_list)
# Bare expression so the notebook displays the result as the cell output.
cdc_month_births
    

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

In [3]:
# Create a function that calculates the total number of
# births for each unique day of the week

def dow_births(blist):
    """Sum births per day of the week over every row of ``blist``.

    Args:
        blist: Iterable of rows where index 3 holds the day of the week and
            index 4 holds the number of births.

    Returns:
        A dict mapping each day-of-week value found in ``blist`` to its
        total births.
    """
    totals = {}
    for record in blist:
        weekday, births = record[3], record[4]
        try:
            totals[weekday] += births
        except KeyError:
            # First occurrence of this weekday: seed its running total.
            totals[weekday] = births
    return totals

# cdc births dataset: births by day of the week
# (dow_births groups on column index 3 and sums column index 4).
cdc_day_births = dow_births(cdc_list)
# Bare expression so the notebook displays the result as the cell output.
cdc_day_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

In [6]:
# create a single function that works for any column 
# to tally births by.

def calc_counts(data, column, value_column=4):
    """Total a value column grouped by the values of another column.

    Args:
        data: Iterable of rows (indexable sequences of numbers).
        column: Index of the column whose values become the dict keys.
        value_column: Index of the column that is summed for each key.
            Defaults to 4, the births column of the CDC rows, so existing
            two-argument calls behave exactly as before.

    Returns:
        A dict mapping each distinct ``row[column]`` value to the sum of
        ``row[value_column]`` over the rows that share it.
    """
    result = {}
    for row in data:
        key = row[column]
        # dict.get supplies 0 the first time a key is seen.
        result[key] = result.get(key, 0) + row[value_column]
    return result

# Apply function on cdc births data.
# Column indexes 0-3 follow the CSV header order:
# year, month, date_of_month, day_of_week.
cdc_year_births, cdc_month_births, cdc_dom_births, cdc_dow_births = [
    calc_counts(cdc_list, column_index) for column_index in range(4)
]

cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

In [None]:
# Create a function that can calculate the min and max
# values for any dictionary that's passed in

def find_min_max(data_dic):
    """Return the smallest and largest values stored in ``data_dic``.

    Args:
        data_dic: Dict whose values can be compared with each other.

    Returns:
        A ``(minimum, maximum)`` tuple of the dict's values.
    """
    smallest = min(data_dic.values())
    largest = max(data_dic.values())
    return smallest, largest


In [None]:
# Create a function that extracts the same values
# across years so that the differences between
# consecutive values can be calculated to show if the
# number of births is increasing or decreasing.

def across_year(data, column, val):
    """Total births per year for the rows where ``row[column] == val``.

    Args:
        data: Iterable of rows where index 0 holds the year and index 4
            holds the number of births.
        column: Index of the column to filter on.
        val: Value that ``row[column]`` must equal for the row to count.

    Returns:
        A dict mapping each year that has at least one matching row to the
        sum of births over those rows. Empty when nothing matches.
    """
    result = {}
    for row in data:
        if row[column] != val:
            continue
        year = row[0]
        result[year] = result.get(year, 0) + row[4]
    # The original built this dict but never returned it, so every caller
    # received None.
    return result

# cdc births on Saturday from 1994 to 2003:
# column index 3 is day_of_week and the value 6 denotes Saturday.
cdc_births_saturday = across_year(cdc_list, 3, 6)


In [None]:
# Find a way to combine the CDC data with the SSA data
# (https://github.com/fivethirtyeight/data/tree/master/births).
# Specifically, brainstorm ways to deal with the 
# overlapping time periods in the datasets.

# Load the SSA births data with the same reader used for the CDC file.
ssa_list = read_csv('US_births_2000-2014_SSA.csv')

# Start from a shallow copy of the CDC rows, then add only the SSA rows
# for years after 2003 so the overlapping years are not included twice.
combined_list = list(cdc_list)
for row in ssa_list:
    if row[0] <= 2003:
        continue
    combined_list.append(row)
