# Consolidating CDC data

Data was collected from fluvaxview data on the general population: https://www.cdc.gov/flu/fluvaxview/interactive-general-population.htm

Report data were downloaded for years 2013-14, 2014-15, 2015-16, 2016-17, and 2017-18





In [1]:
import pandas as pd
import numpy as np

In [2]:
def process_raw_fluvaxview(data_frame):
    """
    This will process the raw data that was generated from fluvaxview
    """
    
    
    data_frame.iloc[0,:] = data_frame.iloc[0,:].fillna(method='ffill')
    
    sample_data = data_frame

    to_fill = []
    for val in sample_data.columns:
        if 'Unnamed:' not in val:
            to_fill.append(val)
        else:
            to_fill.append(np.nan)

    sample_data.columns = pd.Series(data=to_fill).fillna(method='ffill')


    index = sample_data.index
    sample_data['index_place'] = index
    index = sample_data['index_place']
    index[0] = 'race_or_age' 
    sample_data['index_placeholder'] = pd.Series(index)
    sample_data = sample_data.set_index('index_placeholder')

#     return sample_data
    sample_data = sample_data.drop(labels=['index_place'], axis=1)


    sample_data = sample_data.transpose()


    months = ['Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr', 'May']

    month_column = []
    thing_to_fill_in = None
    for name in sample_data.Names:
        if name in months:
            thing_to_fill_in = name
        month_column.append(thing_to_fill_in)

    exp_column = []
    for name in sample_data.Names:
        if name in months:
            exp_column.append('avg')
        else:
            exp_column.append(name)


    sample_data['month'] = month_column
    sample_data['stats'] = exp_column


    sample_data = sample_data.drop(labels='Names', axis=1)


    regions = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
           'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
           'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
           'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
           'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
           'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
           'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
           'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
           'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
           'Washington', 'West Virginia', 'Wisconsin', 'Wyoming', 'United States',
           'Region  1', 'Region  2', 'Region  3', 'Region  4', 'Region  5',
           'Region  6', 'Region  7', 'Region  8', 'Region  9', 'Region 10',
           'HP 2020 Target']
    
#     return sample_data

    sample_data = sample_data.melt(value_vars=regions, id_vars=['month', 'stats', 'race_or_age'])
    
    return sample_data

In [3]:
data_2013 = pd.read_excel('data/2013-2014 flu data.xlsx')
data_2014 = pd.read_excel('data/2014-2015 flu data.xlsx')
data_2015 = pd.read_excel('data/2015-2016 flu data.xlsx')
data_2016 = pd.read_excel('data/2016-2017 flu data.xlsx')
data_2017 = pd.read_excel('data/2017-2018 flu data.xlsx')

data_2013 = process_raw_fluvaxview(data_2013)
data_2014 = process_raw_fluvaxview(data_2014) 
data_2015 = process_raw_fluvaxview(data_2015) 
data_2016 = process_raw_fluvaxview(data_2016) 
data_2017 = process_raw_fluvaxview(data_2017) 

data_2013['year'] = '2013-2014'
data_2014['year'] = '2014-2015'
data_2015['year'] = '2015-2016'
data_2016['year'] = '2016-2017'
data_2017['year'] = '2017-2018'

data = data_2015.append(data_2016)
data = data.append(data_2017)
data = data.append(data_2014)
data = data.append(data_2013)

data.columns = ['month', 'stats', 'race_or_age', 'state_or_region', 'value', 'year']

data.head()

Unnamed: 0,month,stats,race_or_age,state_or_region,value,year
0,Jul,avg,≥6 months,Alabama,0.9,2015-2016
1,Jul,LL,≥6 months,Alabama,0.5,2015-2016
2,Jul,UL,≥6 months,Alabama,1.3,2015-2016
3,Jul,CI,≥6 months,Alabama,(±0.4),2015-2016
4,Jul,SAMPLE,≥6 months,Alabama,7822,2015-2016


In [4]:
data.to_csv('data/2013-2018_consolidated_flu_data.csv')

# Processing consolidated CDC data

Read the CDC consolidated dataframe, and set up some preliminary variables

In [5]:
import pandas as pd
cdc = pd.read_csv("data/2013-2018_consolidated_flu_data.csv")

In [6]:
state_names = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

As we see from the sample, there are multiple statistics and regions being considered

In [7]:
cdc.sample(20)

Unnamed: 0.1,Unnamed: 0,month,stats,race_or_age,state_or_region,value,year
253945,4465,Oct,CI,18-49 years,California,(±5.3),2013-2014
103585,41215,Nov,avg,18-49 years not at high risk,South Dakota,33.7,2016-2017
175861,51121,Dec,LL,18-49 years not at high risk,United States,20.9,2017-2018
188280,1170,Oct,avg,5-12 years,Alaska,39.6,2014-2015
118391,56021,Jan,LL,18-49 years at high risk,Region 5,30.1,2016-2017
63255,885,Aug,avg,Hispanic,Alabama,7.4,2016-2017
217287,30177,Feb,UL,18-64 years not at high risk,New Jersey,35.4,2014-2015
204213,17103,May,CI,13-17 years,Kentucky,(±6.3),2014-2015
196162,9052,Jan,UL,6 months - 4 years,Florida,57.6,2014-2015
289507,40027,Sep,UL,18-49 years,South Carolina,6.5,2013-2014


For our purposes, we only need the "avg" statistic, which indicates the percentage of patients covered by that date in the season. We only need state-by-state data:

In [8]:
cdc_avg = cdc[cdc["state_or_region"].isin(state_names) * cdc["stats"]=="avg"]

In [9]:
cdc_avg.sample(20)

Unnamed: 0.1,Unnamed: 0,month,stats,race_or_age,state_or_region,value,year
191415,4305,Oct,avg,18-64 years,California,16.9,2014-2015
6290,6290,Nov,avg,18-64 years,Connecticut,34.1,2015-2016
18195,18195,Apr,avg,18-64 years,Louisiana,33.6,2015-2016
3225,3225,Feb,avg,13-17 years,Arkansas,54.5,2015-2016
135705,10965,Nov,avg,6 months - 17 years,Hawaii,46.4,2017-2018
127310,2570,Mar,avg,18-49 years at high risk,Arizona,30.0,2017-2018
197070,9960,Aug,avg,6 months - 17 years,Georgia,6.1,2014-2015
134225,9485,Dec,avg,18-49 years at high risk,Florida,18.2,2017-2018
111495,49125,Sep,avg,18-49 years not at high risk,Wisconsin,3.7,2016-2017
95170,32800,Nov,avg,6 months - 4 years,North Carolina,55.3,2016-2017


In [10]:
cdc_avg.shape

(49500, 7)

We need to remove all the "non-reported" values, and convert the rest to float.

In [11]:
#reomve "NR" values
cdc_avg_cleaned = cdc_avg[(cdc_avg["value"] != "NR †") & (cdc_avg["value"] != "NR *") & (cdc_avg["value"] != "NR")]
cdc_avg_cleaned["value_flt"] = cdc_avg_cleaned["value"].apply(float)
cdc_avg_cleaned.drop(columns=["Unnamed: 0"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [12]:
cdc_avg_cleaned.sample(20)

Unnamed: 0,month,stats,race_or_age,state_or_region,value,year,value_flt
143995,Aug,avg,18-64 years not at high risk,Maine,0.9,2017-2018,0.9
291594,Oct,avg,18-49 years not at high risk,Tennessee,18.8,2013-2014,18.8
152890,Apr,avg,18-64 years at high risk,Nevada,39.3,2017-2018,39.3
275033,Jan,avg,"Black only, non-Hispanic",Missouri,45.0,2013-2014,45.0
204760,May,avg,"White only, non-Hispanic",Kentucky,48.5,2014-2015,48.5
43380,Mar,avg,"White only, non-Hispanic",Texas,47.8,2015-2016,47.8
68415,May,avg,6 months - 17 years,Connecticut,68.6,2016-2017,68.6
168985,Dec,avg,50-64 years,Utah,35.4,2017-2018,35.4
224745,Oct,avg,≥6 months,Pennsylvania,31.0,2014-2015,31.0
174000,Feb,avg,≥65 years,Wisconsin,50.1,2017-2018,50.1


We now want just the full >= 6-months dataset

In [13]:
cdc_allpopulation = cdc_avg_cleaned[cdc_avg_cleaned.race_or_age == "≥6 months"]

Now compare the number of rows to the number of all possible data points for 5 years. We expect less due to some non-reported values.

In [14]:
cdc_allpopulation.shape

(2665, 7)

In [15]:
5*12*50

3000

In [16]:
cdc_allpopulation.sample(20)

Unnamed: 0,month,stats,race_or_age,state_or_region,value,year,value_flt
278172,Mar,avg,≥6 months,New Hampshire,47.6,2013-2014,47.6
36630,Jul,avg,≥6 months,Oregon,0.4,2015-2016,0.4
173300,May,avg,≥6 months,Wisconsin,40.4,2017-2018,40.4
214865,Feb,avg,≥6 months,Nevada,37.8,2014-2015,37.8
215855,Feb,avg,≥6 months,New Hampshire,51.0,2014-2015,51.0
292992,Mar,avg,≥6 months,Utah,41.0,2013-2014,41.0
281096,Jul,avg,≥6 months,New York,0.4,2013-2014,0.4
259375,Oct,avg,≥6 months,Georgia,24.0,2013-2014,24.0
107945,Feb,avg,≥6 months,Virginia,47.8,2016-2017,47.8
66380,May,avg,≥6 months,California,48.0,2016-2017,48.0


We can now convert the dates to a more convenient time-stamp object, taking into account they way a 'season' is defined by CDC:

In [17]:
def convertToTimestamp(monthCol, yearCol):
    dateCol = []
    for (month, year) in zip(monthCol, yearCol):
        if (month in ["Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]):
            yr = year[0:4]
        else:
            yr = year[-4:]
        dateCol.append(pd.Timestamp.strptime(month + " 15 " + yr, "%b %d %Y"))
        
    return dateCol

In [18]:
cdc_allpopulation["time"] = convertToTimestamp(cdc_allpopulation["month"].values, cdc_allpopulation["year"].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Do some renaming of columns, and save the final dataframe

In [16]:
cdc_allpopulation.rename(columns={'value_flt': 'mean_pct', 'state_or_region': 'state'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [19]:
cdc_allpopulation.sample(20)

Unnamed: 0,month,stats,race_or_age,state_or_region,value,year,value_flt,time
107930,Nov,avg,≥6 months,Virginia,39.2,2016-2017,39.2,2016-11-15
229690,Sep,avg,≥6 months,Texas,11.6,2014-2015,11.6,2014-09-15
213890,May,avg,≥6 months,Nebraska,54.0,2014-2015,54.0,2015-05-15
144545,Aug,avg,≥6 months,Maryland,2.2,2017-2018,2.2,2017-08-15
170310,Jan,avg,≥6 months,Virginia,44.0,2017-2018,44.0,2018-01-15
87150,Jan,avg,≥6 months,Missouri,44.7,2016-2017,44.7,2017-01-15
262344,Nov,avg,≥6 months,Illinois,35.2,2013-2014,35.2,2013-11-15
188120,Nov,avg,≥6 months,Alaska,35.1,2014-2015,35.1,2014-11-15
157415,Aug,avg,≥6 months,North Carolina,2.7,2017-2018,2.7,2017-08-15
209890,Sep,avg,≥6 months,Minnesota,10.2,2014-2015,10.2,2014-09-15


In [20]:
cdc_allpopulation.to_csv("data/cdc_average_bystate_2013-2017.csv")