### 1. Initial Configuration

#### 1.1 Imports & Config

In [1]:
# data analysis
import numpy as np
import pandas as pd
import xlwings as xw

# plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as po
import plotly.figure_factory as ff

# colors
import colorlover as cl
from IPython.display import HTML

import os

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Set up plotly's offline mode for use inside this notebook.
po.init_notebook_mode(connected=True)

#### 1.2 Code Examples

In [284]:
# Example: render every colorlover scale as HTML to browse the available palettes.
HTML(cl.to_html(cl.scales))

In [324]:
# Example: render only the 5-colour qualitative 'Paired' scale.
HTML(cl.to_html(cl.scales['5']['qual']['Paired']))

#### 1.3 Global Functions

Remove any rows with the word "Total" in any column

In [4]:
def strip_totals(df):
    """Return ``df`` without the rows that contain the word "Total".

    Every column that supports the ``.str`` accessor is searched for the
    (case-sensitive) substring ``'Total'``; a row is dropped as soon as any of
    its text cells matches.  Columns without string data are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` in which no text cell contains ``'Total'``.
    """
    # One flag per row, aligned on df's own index; start with "remove nothing".
    rows_to_remove = pd.Series(False, index=df.index)

    for col in df.columns:
        try:
            # na=False: missing or non-string cells count as "no match" rather
            # than NaN, which would otherwise leak into the boolean mask.
            has_total = df[col].str.contains('Total', na=False)
        except AttributeError:
            # The column has no .str accessor (non-text data): nothing to match.
            continue

        rows_to_remove = rows_to_remove | has_total.astype(bool)

    # "~" is the boolean negation; unary "-" is not supported on boolean data.
    return df[~rows_to_remove]

Categorize a dataframe according to a defined category hierarchy

In [166]:
def nest_depth(tree, current_depth=1):
    """Return the number of levels in a nested category mapping.

    A value that is not a dict counts as a leaf and adds one level below the
    key that holds it, so ``{'a': ['x']}`` has depth 2 and
    ``{'a': {'b': ['x']}}`` has depth 3.  An empty mapping has depth 0.

    Parameters
    ----------
    tree : dict
        Mapping whose values are either further mappings or leaves.
    current_depth : int
        Depth of ``tree`` itself; callers normally leave the default.

    Returns
    -------
    int
        The deepest level reached anywhere in ``tree``.
    """
    max_depth = 0
    for v in tree.values():
        # isinstance (not an exact type check) so dict subclasses such as
        # OrderedDict are descended into, matching categorize() below.
        if isinstance(v, dict):
            max_depth = max(max_depth, nest_depth(v, current_depth + 1))
        else:
            max_depth = max(max_depth, current_depth + 1)

    return max_depth


def categorize(df, target_col, category_map, new_col_nm='Grp'):
    """Add one column per hierarchy level describing where each row falls.

    ``category_map`` is a nested dict: keys are group names, and every
    branch ends in a list (or tuple) of the exact ``target_col`` values that
    belong to that group.  The result has columns ``<new_col_nm>1`` ..
    ``<new_col_nm>N`` where N is ``nest_depth(category_map)``:

    * the last column receives the matched ``target_col`` value itself;
    * the leading columns receive the enclosing group names, outermost first;
    * anything not assigned keeps the placeholder ``' '``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    target_col : str
        Column whose values are looked up in ``category_map``.
    category_map : dict
        Nested mapping described above.
    new_col_nm : str
        Prefix of the generated column names.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the level columns appended.

    Raises
    ------
    ValueError
        If a value in ``category_map`` is neither a dict nor a list/tuple.
    """
    # Keep the level names in an explicit list so that "first" and "last"
    # level never depend on the iteration order of a dict.
    category_col_nms = [new_col_nm + str(i + 1) for i in range(nest_depth(category_map))]
    df_out = df.assign(**{col_nm: ' ' for col_nm in category_col_nms})

    def assign_category(match_words, categories=()):
        # `categories` is the path of group names leading to `match_words`;
        # an immutable default avoids sharing state between calls.
        if isinstance(match_words, dict):
            for k, v in match_words.items():
                assign_category(v, categories + (k,))

        elif isinstance(match_words, (list, tuple)):
            # Find matches
            mask = df_out[target_col].isin(match_words)

            # Assign the exact groups
            df_out.loc[mask, category_col_nms[-1]] = df_out.loc[mask, target_col]

            # Assign the higher level groups
            for col_nm, category in zip(category_col_nms, categories):
                df_out.loc[mask, col_nm] = category

        else:
            raise ValueError('Dictionary value is not a list or dictionary')

    assign_category(category_map)

    return df_out

In [436]:
# Scratch check: the empty slice [:0] adds nothing, so the result is just the right-hand list.
[1,2,3][:0] + [1,2]

[1, 2]

In [442]:
def get_color_scale(index):
    """Build one colour per entry of ``index`` from the colorlover scales.

    For a ``pandas.MultiIndex`` every distinct first-level group gets its own
    sequential hue (taken from ``color_priority`` by group code, wrapping
    around when there are more groups than hues) and the entries inside the
    group get successive shades of that hue.  The colours are emitted group
    by group in code order, so they line up with the rows only when the index
    is sorted by its first level -- presumably always true for the
    pivot-table indexes used in this notebook; verify for other inputs.

    For any other index the qualitative 'Paired' scale is used.

    Parameters
    ----------
    index : pandas.Index or pandas.MultiIndex

    Returns
    -------
    list of str
        Exactly ``len(index)`` colour strings.
    """
    color_priority = ['Blues', 'Greens', 'Reds', 'Purples', 'Oranges']

    if not isinstance(index, pd.MultiIndex):
        entry_cnt = len(index)
        # 'Paired' is only looked up for 3 to 12 colours: clamp the key to
        # that range, then trim or cycle so the result has one colour per entry.
        palette = cl.scales[str(min(max(entry_cnt, 3), 12))]['qual']['Paired']
        return [palette[j % len(palette)] for j in range(entry_cnt)]

    # Only the attribute lookup is guarded: `codes` replaced the older
    # `labels` attribute, and wrapping the whole body in the handler would
    # hide unrelated AttributeErrors behind a silent palette change.
    try:
        first_level_codes = index.codes[0]
    except AttributeError:
        first_level_codes = index.labels[0]

    grp_cnt = pd.Series(first_level_codes).value_counts().sort_index()

    color_scale = []
    for code, cnt in grp_cnt.items():
        hue = color_priority[code % len(color_priority)]

        if cnt > 8:
            # The sequential scales stop at 9 shades: cycle through them so
            # the group still receives exactly `cnt` colours.
            shades = cl.scales['9']['seq'][hue]
            color_scale += [shades[j % len(shades)] for j in range(cnt)]
        elif cnt > 2:
            # Ask for one shade more than needed and drop the first one.
            color_scale += cl.scales[str(cnt + 1)]['seq'][hue][1:]
        else:
            # One or two entries: take the last shades of the 3-colour scale.
            color_scale += cl.scales['3']['seq'][hue][3 - cnt:]

    return color_scale

### My Work

In [149]:
# load data
# load data
# The path is relative to the notebook's working directory; os.path.join keeps
# it usable on every OS instead of hard-coding Windows backslashes.
data = pd.read_csv(
    os.path.join('.', 'Healthcare Data', 'Projected Future Health Expenditures', 'NHE60-25.csv'),
    header = 6
)

# CLEANING - Transform YEAR column to numeric (values look like 'Y1960')
data['Year'] = pd.to_numeric(data['YEAR'].str.replace('Y', ''))

# CLEANING - Drop unnecessary columns
data = data.drop(labels = ['Projections_Vintage', 'TOTAL', 'YEAR', 'TOTAL_HEALTH_INSURANCE'], axis = 1)

# CLEANING - Rename Columns: 'OUT_OF_POCKET' -> 'Out Of Pocket'
# str.capitalize() also copes with empty fragments (e.g. a double underscore).
new_names = {
    col: ' '.join(w.capitalize() for w in col.split('_'))
    for col in data.columns
}
data = data.rename(columns = new_names)
data = data.rename(columns = {'Historical Or Projected Data': 'ProjType'})

# CLEANING - Drop the "Totals" categories
removals = data['Category'].isin(['National Health Expenditures', 'Health Consumption Expenditures', 'Personal Health Care'])
# "~" negates the boolean mask (unary "-" is not supported on boolean data);
# .copy() makes the filtered frame independent so the column assignment below
# does not write into a slice of the original.
data = data.loc[~removals].copy()

# CLEANING - Shorten Category Labels
data['Category'] = data['Category'].str.replace(' Expenditures', '')

# CLEANING - Merge Payer Columns into single Payer Dimension
health_spend = data.melt(
    id_vars = ['Category', 'ProjType', 'Year'],
    value_vars = ['Out Of Pocket', 'Private Health Insurance', 'Medicare', 'Medicaid', 'Other Health Insurance', 'Other Third Party'],
    var_name = 'Payer',
    value_name = 'Spend'
)

In [181]:
# Categorise the expenditure types in the data.
# Nested dict keys are group names; each list holds the exact `Category`
# values that belong to the group it sits under.
Health_Expenditure_Categories = {
    'Consumption': {
        'Personal Care': {
            'Facilities': ['Hospital', 'Nursing Care Facilities and Continuing Care Retirement Communities'],
            'Professional': ['Physician and Clinical', 'Dental Services', 'Other Professional Services'],
            'Medical Products': ['Prescription Drug', 'Durable Medical Equipment', 'Non-Durable Medical Products'],
            'Other Care': ['Home Health Care', 'Other Health, Residential, and Personal Care']
        },
        'Other': ['Government Administration', 'Net Cost of Health Insurance', 'Public Health Activity']
    },
    'Investment': ['Research', 'Structures', 'Equipment']
}

# Adds the level columns (default prefix 'Grp') to a copy of health_spend;
# the deepest branch above is four levels, so the columns are Grp1..Grp4.
categorized_health_spend = categorize(
    health_spend, 
    target_col = 'Category', 
    category_map = Health_Expenditure_Categories
)

In [432]:
# Total spend per (top-level group, expenditure type) row and (ProjType, Year) column.
# aggfunc is given by name: passing the numpy callable is the deprecated form.
spending_by_type = categorized_health_spend.pivot_table(
    values = 'Spend',
    index = ['Grp1', 'Grp4'],
    columns = ['ProjType', 'Year'],
    aggfunc = 'sum'
)
spending_by_type

Unnamed: 0_level_0,ProjType,Historical,Historical,Historical,Historical,Historical,Historical,Historical,Historical,Historical,Historical,...,Projected,Projected,Projected,Projected,Projected,Projected,Projected,Projected,Projected,Projected
Unnamed: 0_level_1,Year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
Grp1,Grp4,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Consumption,Dental Services,1987,2103,2224,2371,2617,2818,2992,3446,3703,4215,...,121917,128030,134511,141321,148595,155838,163105,170292,177623,185026
Consumption,Durable Medical Equipment,741,764,910,902,1003,1105,1243,1128,1316,1513,...,50498,53003,55655,58682,62401,66560,70938,75336,79806,84355
Consumption,Government Administration,54,66,79,82,142,237,334,421,559,627,...,45040,46840,52082,55673,59413,63422,67703,72328,77318,82899
Consumption,Home Health Care,56,62,65,69,75,89,107,163,238,271,...,94074,99899,106160,113491,121540,130052,139240,148906,159162,170003
Consumption,Hospital,8985,9778,10431,11509,12500,13545,15297,17798,20538,23367,...,1086781,1140771,1199924,1269068,1343067,1421721,1504450,1591326,1681159,1776030
Consumption,Net Cost of Health Insurance,1019,1101,1214,1251,1377,1608,1734,1721,2033,1787,...,216333,235084,254765,271145,287328,304646,323433,343213,363786,384792
Consumption,Non-Durable Medical Products,1626,1764,1901,1944,2083,2211,2377,2527,2677,2967,...,60907,63697,66908,70329,74381,78529,82845,87304,91928,96676
Consumption,Nursing Care Facilities and Continuing Care Retirement Communities,810,840,875,1009,1168,1408,1733,2234,2852,3411,...,162355,169530,177486,186967,197622,208827,220691,233227,246408,259968
Consumption,"Other Health, Residential, and Personal Care",437,504,538,589,651,709,811,920,1075,1223,...,170001,178961,189388,200966,213280,226439,240246,254962,270673,287457
Consumption,Other Professional Services,392,409,429,451,486,533,571,607,643,677,...,91989,97066,102543,108617,115007,121413,128101,135179,142514,149403


In [412]:
# Same pivot restricted to "Personal Care", broken down by its sub-groups.
# Stored under its own name: rebinding `spending_by_type` here would change
# what the later plotting cells receive when the notebook is run top to bottom.
personal_care_spending_by_type = categorized_health_spend.query('Grp2 == "Personal Care"').pivot_table(
    values = 'Spend',
    index = ['Grp3', 'Grp4'],
    columns = ['ProjType', 'Year'],
    aggfunc = 'sum'
)
personal_care_spending_by_type

Unnamed: 0_level_0,ProjType,Historical,Historical,Historical,Historical,Historical,Historical,Historical,Historical,Historical,Historical,...,Projected,Projected,Projected,Projected,Projected,Projected,Projected,Projected,Projected,Projected
Unnamed: 0_level_1,Year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
Grp3,Grp4,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Facilities,Hospital,8985,9778,10431,11509,12500,13545,15297,17798,20538,23367,...,1086781,1140771,1199924,1269068,1343067,1421721,1504450,1591326,1681159,1776030
Facilities,Nursing Care Facilities and Continuing Care Retirement Communities,810,840,875,1009,1168,1408,1733,2234,2852,3411,...,162355,169530,177486,186967,197622,208827,220691,233227,246408,259968
Medical Products,Durable Medical Equipment,741,764,910,902,1003,1105,1243,1128,1316,1513,...,50498,53003,55655,58682,62401,66560,70938,75336,79806,84355
Medical Products,Non-Durable Medical Products,1626,1764,1901,1944,2083,2211,2377,2527,2677,2967,...,60907,63697,66908,70329,74381,78529,82845,87304,91928,96676
Medical Products,Prescription Drug,2676,2718,3029,3159,3348,3714,3986,4227,4742,5149,...,340686,360138,387379,412328,438172,465797,495747,527455,561179,597085
Other Care,Home Health Care,56,62,65,69,75,89,107,163,238,271,...,94074,99899,106160,113491,121540,130052,139240,148906,159162,170003
Other Care,"Other Health, Residential, and Personal Care",437,504,538,589,651,709,811,920,1075,1223,...,170001,178961,189388,200966,213280,226439,240246,254962,270673,287457
Professional,Dental Services,1987,2103,2224,2371,2617,2818,2992,3446,3703,4215,...,121917,128030,134511,141321,148595,155838,163105,170292,177623,185026
Professional,Other Professional Services,392,409,429,451,486,533,571,607,643,677,...,91989,97066,102543,108617,115007,121413,128101,135179,142514,149403
Professional,Physician and Clinical,5551,5813,6261,7074,8108,8588,9308,10447,11345,12715,...,677053,717009,758770,804138,849623,897942,948259,1001555,1055850,1110620


In [428]:
def stacked_line_plot(df, normalize=False, display_cumulative_values=False, color_scale=None, nbr_form=None):
    """Plot the rows of ``df`` as a stacked area chart with plotly offline.

    Each row of ``df`` becomes one trace; rows are stacked in order by taking
    the cumulative sum down the rows, and the columns supply the x values.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per series, one column per x value.
    normalize : bool
        When True every column is divided by its column total first, so the
        stack shows shares instead of absolute values.
    display_cumulative_values : bool
        When True the hover text shows the stacked (cumulative) value of each
        trace; when False it shows the series' own value.
    color_scale : sequence of colour strings, optional
        Line colours, one per row.  Defaults to ``get_color_scale(df.index)``.
        A scale shorter than the number of rows is reused cyclically.
    nbr_form : callable, optional
        Formats a single number for the hover text.  Defaults to a one-decimal
        percentage when ``normalize`` is True, otherwise to ``'$'`` followed
        by the value divided by 1000 with one decimal.

    Returns
    -------
    None
        The figure is written by ``po.plot`` to
        ``my-stacked-area-plot-hover.html``.
    """
    # set defaults
    if normalize:
        df = df / df.sum(axis=0)
    if color_scale is None:
        color_scale = get_color_scale(df.index)
    color_scale = list(color_scale)

    # Only fall back to a default formatter when the caller supplied none.
    if nbr_form is None:
        if normalize:
            nbr_form = (lambda x: '{:.1f}%'.format(x * 100))
        else:
            nbr_form = (lambda x: '${:.1f}'.format(x / 1000))

    # Containers
    traces = []
    y = df.cumsum()
    hover_values = y if display_cumulative_values else df
    # Element-wise formatting, column by column.
    y_txt = hover_values.apply(lambda col: col.map(nbr_form))

    # Build Traces
    for i, ((label, series), (_, series_txt)) in enumerate(zip(y.iterrows(), y_txt.iterrows())):
        line = dict(width = 1)
        if color_scale:
            # Cycle through the scale so a short scale never drops rows.
            line['color'] = color_scale[i % len(color_scale)]

        # MultiIndex labels arrive as tuples; a flat label is used as is
        # (joining a plain string would split it into characters).
        if isinstance(label, tuple):
            trace_name = ': '.join(str(part) for part in label)
        else:
            trace_name = str(label)

        trace = go.Scatter(
            x = series.index,
            y = series.values,
            text = series_txt.values,
            hoverinfo = 'x+text',
            mode = 'lines',
            line = line,
            fill = 'tonexty',
            name = trace_name
        )
        traces.append(trace)

    fig = go.Figure(data=traces)
    # Explicit extension: plotly otherwise warns and appends ".html" itself.
    po.plot(fig, filename='my-stacked-area-plot-hover.html')

In [425]:
def stacked_line_plot2(df, normalize=False, display_cumulative_values=False, color_scale=None, nbr_form=None):
    """Plot historical and projected data as two stacked area charts in one figure.

    ``df`` must have two column levels whose first level contains
    ``'Historical'`` and ``'Projected'``.  Each row becomes one solid trace
    over the historical columns and one dashed trace over the projected
    columns; rows are stacked by taking the cumulative sum down the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per series; columns grouped under 'Historical' / 'Projected'.
    normalize : bool
        When True every column is divided by its column total first.
    display_cumulative_values : bool
        When True the hover text shows the stacked (cumulative) value of each
        trace; when False it shows the series' own value.
    color_scale : sequence of colour strings, optional
        Line colours, one per row, shared by both segments.  Defaults to
        ``get_color_scale(df.index)``; a short scale is reused cyclically.
    nbr_form : callable, optional
        Formats a single number for the hover text.  Defaults to a one-decimal
        percentage when ``normalize`` is True, otherwise to ``'$'`` followed
        by the value divided by 1000 with one decimal.

    Returns
    -------
    None
        The figure is written by ``po.plot`` to
        ``my-stacked-area-plot-hover.html``.
    """
    # set defaults
    if normalize:
        df = df / df.sum(axis=0)
    if color_scale is None:
        color_scale = get_color_scale(df.index)
    color_scale = list(color_scale)

    # Only fall back to a default formatter when the caller supplied none.
    if nbr_form is None:
        if normalize:
            nbr_form = (lambda x: '{:.1f}%'.format(x * 100))
        else:
            nbr_form = (lambda x: '${:.1f}'.format(x / 1000))

    y = df.cumsum()
    hover_values = y if display_cumulative_values else df
    # Element-wise formatting, column by column.
    y_txt = hover_values.apply(lambda col: col.map(nbr_form))

    def _segment_traces(segment, dash=None):
        # Build the stacked traces for one first-level column group.
        segment_traces = []
        rows = zip(y[segment].iterrows(), y_txt[segment].iterrows())
        for i, ((label, series), (_, series_txt)) in enumerate(rows):
            line = dict(width = 1)
            if color_scale:
                # Cycle through the scale so a short scale never drops rows.
                line['color'] = color_scale[i % len(color_scale)]
            if dash is not None:
                line['dash'] = dash

            # MultiIndex labels arrive as tuples; a flat label is used as is.
            if isinstance(label, tuple):
                trace_name = ': '.join(str(part) for part in label)
            else:
                trace_name = str(label)

            segment_traces.append(go.Scatter(
                x = series.index,
                y = series.values,
                text = series_txt.values,
                hoverinfo = 'x+text',
                mode = 'lines',
                line = line,
                # The bottom trace of each segment fills down to zero; with
                # 'tonexty' the first projected trace would instead fill
                # against the last historical trace.
                fill = 'tozeroy' if i == 0 else 'tonexty',
                name = trace_name
            ))
        return segment_traces

    traces = _segment_traces('Historical') + _segment_traces('Projected', dash='dash')

    fig = go.Figure(data=traces)
    # Explicit extension: plotly otherwise warns and appends ".html" itself.
    po.plot(fig, filename='my-stacked-area-plot-hover.html')

In [443]:
 # Plot the historical columns only; normalize=True divides each year by its column total.
 stacked_line_plot(
    spending_by_type['Historical'],
    normalize = True
)


Your filename `my-stacked-area-plot-hover` didn't end with .html. Adding .html to the end of your file.



In [444]:
# Show the historical slice that was passed to stacked_line_plot.
spending_by_type['Historical']

Unnamed: 0_level_0,Year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Grp1,Grp4,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Consumption,Dental Services,1987,2103,2224,2371,2617,2818,2992,3446,3703,4215,...,91146,97028,101938,102307,105032,107107,108771,110141,112832,117522
Consumption,Durable Medical Equipment,741,764,910,902,1003,1105,1243,1128,1316,1513,...,34409,37073,37678,37813,39926,42260,43678,45062,46623,48458
Consumption,Government Administration,54,66,79,82,142,237,334,421,559,627,...,28667,29074,29154,29555,30069,32399,33520,37196,41240,42576
Consumption,Home Health Care,56,62,65,69,75,89,107,163,238,271,...,52053,57474,62165,67297,70994,73839,77076,79965,83566,88804
Consumption,Hospital,8985,9778,10431,11509,12500,13545,15297,17798,20538,23367,...,651221,691973,725711,779690,822406,852019,902676,937876,980966,1036110
Consumption,Net Cost of Health Insurance,1019,1101,1214,1251,1377,1608,1734,1721,2033,1787,...,137218,143497,139754,137872,153483,159304,165491,173773,195311,210093
Consumption,Non-Durable Medical Products,1626,1764,1901,1944,2083,2211,2377,2527,2677,2967,...,43733,47752,49472,50328,51245,52825,53733,55672,56912,59031
Consumption,Nursing Care Facilities and Continuing Care Retirement Communities,810,840,875,1009,1168,1408,1733,2234,2852,3411,...,115929,124936,130305,134884,139981,145046,147369,149217,152635,156798
Consumption,"Other Health, Residential, and Personal Care",437,504,538,589,651,709,811,920,1075,1223,...,101083,108337,114534,123363,129065,131670,139100,144262,151458,163322
Consumption,Other Professional Services,392,409,429,451,486,533,571,607,643,677,...,55264,60055,64539,67152,69850,72748,76429,78795,82826,87715
