# Risk Adjustment and Machine Learning

Carreta ilustrada

### Loading health data

In [50]:
# pandas is used throughout under its conventional alias, pd
import pandas as pd

# First pass: read the CSV with pandas' default settings
df_salud = pd.read_csv('../suficiencia.csv')

# The preview shows the index is not set appropriately:
# the identifier is loaded as an ordinary column
print(df_salud.head())

# pd.read_csv inferred inconvenient data types for some columns;
# list every column next to the dtype it was given
for columna, tipo in df_salud.dtypes.items():
    print(columna, tipo)

                                NUM_IDE  FECHA_NACI SEXO  MUNI_2010  \
0  00000194-D058-45CA-8FFA-104DC5EEFFFB  1990-02-22    F          1   
1  0000044A-6B61-4EBF-BF51-CCFE020AF4C7  1935-07-15    M          1   
2  00000495-E581-4053-A710-6275911EF9AB  1964-05-23    M          1   
3  000007D9-9087-4F36-9B26-25D205C46EE1  1974-10-06    M        430   
4  00000CF1-F4ED-4330-8739-C82FCF1BCAC7  2000-12-28    M        172   

   MUNI_2011  DPTO_2010  DPTO_2011 EPS_2010 EPS_2011  DIAS_COMP_2010  \
0          1         11         11   EPS008   EPS008             150   
1          1          8          8   EPS037   EPS037             360   
2          1         54         54   EPS037   EPS037             360   
3        430         25         25   EPS013   EPS013             191   
4         88          5          5   EPS013   EPS013             180   

     ...      DIAG_1_W_2011  DIAG_1_X_2011  DIAG_1_Y_2011  DIAG_1_Z_2011  \
0    ...              False          False          False       

In [51]:
# Second pass: reload the CSV, this time choosing the index column and
# spelling out the dtypes that should not be left to inference.
# Sex, municipality, department and insurer (EPS) codes are categorical.
dtype = dict.fromkeys(
    [
        'SEXO',
        'MUNI_2010',
        'MUNI_2011',
        'DPTO_2010',
        'DPTO_2011',
        'EPS_2010',
        'EPS_2011',
    ],
    'category',
)
# The sex indicator columns are plain booleans
dtype.update(SEXO_F=bool, SEXO_M=bool)

# index_col=0 promotes the first CSV column to the DataFrame index
df_salud = pd.read_csv('../suficiencia.csv', dtype=dtype, index_col=0)

# The index is now set appropriately
print(df_salud.head())

# Show the dtype each column ends up with after the explicit overrides
# NOTE(review): FECHA_NACI is not in the dtype map, so it is presumably still
# read as text -- confirm whether it should be parsed as a date.
for columna, tipo in df_salud.dtypes.items():
    print(columna, tipo)

                                      FECHA_NACI SEXO MUNI_2010 MUNI_2011  \
NUM_IDE                                                                     
00000194-D058-45CA-8FFA-104DC5EEFFFB  1990-02-22    F         1         1   
0000044A-6B61-4EBF-BF51-CCFE020AF4C7  1935-07-15    M         1         1   
00000495-E581-4053-A710-6275911EF9AB  1964-05-23    M         1         1   
000007D9-9087-4F36-9B26-25D205C46EE1  1974-10-06    M       430       430   
00000CF1-F4ED-4330-8739-C82FCF1BCAC7  2000-12-28    M       172        88   

                                     DPTO_2010 DPTO_2011 EPS_2010 EPS_2011  \
NUM_IDE                                                                      
00000194-D058-45CA-8FFA-104DC5EEFFFB        11        11   EPS008   EPS008   
0000044A-6B61-4EBF-BF51-CCFE020AF4C7         8         8   EPS037   EPS037   
00000495-E581-4053-A710-6275911EF9AB        54        54   EPS037   EPS037   
000007D9-9087-4F36-9B26-25D205C46EE1        25        25   EPS013   EP

### Exploring health data
We are interested in exploring the risk profiles of individuals. Let's estimate the expenditure and enrollee density distributions over different expenditure intervals. We will consider intervals of \$10,000 COP between \$0 and \$3,000,000 COP. 

In [52]:
# We will be using plotly to graph the distributions.
# `go` gives access to the trace and layout classes (Histogram, Scatter,
# Layout, Figure) used by the plotting cells of this notebook.
import plotly 
import plotly.graph_objs as go 
# Switch plotly to offline notebook mode so figures passed to
# plotly.offline.iplot are rendered inside the notebook output.
# NOTE(review): connected=True presumably loads plotly.js from the network
# instead of embedding it in the notebook -- confirm against the plotly docs.
plotly.offline.init_notebook_mode(connected=True)

# Width of each expenditure interval: $10,000 COP
step_size = 10 ** 4
# Upper bound of the studied range: 300 intervals, i.e. $3,000,000 COP
tamanho = 300 * step_size

In [58]:
# Enrollee density: plotly bins the raw per-enrollee expenditure itself, so a
# probability-normalized histogram per year is all that is needed.
trace2010, trace2011 = (
    go.Histogram(
        x=df_salud['VALOR_TOT_' + anho],
        name=anho,
        legendgroup=anho,
        histnorm='probability',
        # Same bins for both years: [0, tamanho) in steps of step_size
        xbins={'start': 0.0, 'end': tamanho, 'size': step_size},
    )
    for anho in ('2010', '2011')
)

# NOTE(review): the title reads 'Enrolle density'; it probably should say
# 'Enrollee density'. Left unchanged here because it is user-visible output.
layout = go.Layout(
    title='Enrolle density',
    xaxis={'title': 'Expenditure'},
    yaxis={'title': 'Density', 'rangemode': 'tozero'},
    # Horizontal legend, centred and placed below the plotting area
    legend={
        'orientation': 'h',
        'x': 0.5,
        'xanchor': 'center',
        'y': -0.25,
        'yanchor': 'top',
    },
)

fig = go.Figure(data=[trace2010, trace2011], layout=layout)
plotly.offline.iplot(fig)

The expenditure distribution needs extra work, since we are accumulating expenditure rather than enrollees. To obtain it, we first sort the enrollees by expenditure, then calculate the accumulated expenditure up to each interval boundary, normalize it by total expenditure, and finally differentiate the resulting series.

In [62]:
# We will be using numpy to differentiate a series
import numpy as np

In [64]:
def calculate_expenditure_density(year, data=None, upper=None, step=None):
    """Share of total expenditure that falls in each expenditure interval.

    Enrollees are grouped by their own expenditure into consecutive
    half-open intervals ``[k * step, (k + 1) * step)`` starting at 0, and
    the expenditure inside every interval is divided by the total
    expenditure of the year (including enrollees above ``upper``).

    Parameters
    ----------
    year : str
        Year suffix of the expenditure column; ``'VALOR_TOT_' + year`` is read.
    data : pandas.DataFrame, optional
        Frame holding the expenditure column. Defaults to the notebook-level
        ``df_salud``.
    upper : int, optional
        Upper bound of the studied range. Defaults to the notebook-level
        ``tamanho``.
    step : int, optional
        Interval width. Defaults to the notebook-level ``step_size``.

    Returns
    -------
    numpy.ndarray
        One value per interval, ``len(range(0, upper, step))`` values in
        total, so the result lines up with ``range(0, upper, step)`` used as
        the x axis (each x being the left edge of its interval).
    """
    frame = df_salud if data is None else data
    upper = tamanho if upper is None else upper
    step = step_size if step is None else step

    # n intervals need n + 1 edges. Building the edges from range(0, upper, step)
    # alone drops the last interval after differencing (off-by-one).
    n_intervals = len(range(0, upper, step))
    edges = step * np.arange(n_intervals + 1)

    # Missing values carry no expenditure; drop them before sorting
    values = np.sort(np.asarray(frame['VALOR_TOT_' + year].dropna(), dtype=float))
    total_expenditure = values.sum()

    # accumulated[k] is the summed expenditure of the k cheapest enrollees
    accumulated = np.concatenate(([0.0], np.cumsum(values)))
    # side='left' counts the enrollees strictly below each edge, so a value
    # sitting exactly on an edge belongs to the interval that starts there
    n_below = np.searchsorted(values, edges, side='left')
    cumulative_share = accumulated[n_below] / total_expenditure

    # Differencing the cumulative share yields the share inside each interval
    return np.diff(cumulative_share)

# Evaluate the expenditure density once for each year of data
density_2010, density_2011 = (
    calculate_expenditure_density(anho) for anho in ('2010', '2011')
)

In [67]:
# Expenditure density drawn as bars, one trace per year.
# go.Bar is used directly: the traces are bar charts, and forcing type='bar'
# onto go.Scatter asks for a trace type that conflicts with the class and is
# not portable across plotly versions.
trace_2010, trace_2011 = (
    go.Bar(
        # Left edge of every interval. Sized from the density itself so that
        # x and y always have the same number of points.
        x=[posicion * step_size for posicion in range(len(densidad))],
        y=densidad,
        name=anho,
        legendgroup=anho,
        marker={'color': color},
    )
    for anho, densidad, color in (
        ('2010', density_2010, '#1f77b4'),
        ('2011', density_2011, '#ff7f0e'),
    )
)

layout = go.Layout(
    title='Expenditure density',
    xaxis={'title': 'Expenditure'},
    yaxis={'title': 'Density', 'rangemode': 'tozero'},
    # Horizontal legend, centred and placed below the plotting area
    legend={
        'orientation': 'h',
        'x': 0.5,
        'xanchor': 'center',
        'y': -0.25,
        'yanchor': 'top',
    },
)

fig = go.Figure(data=[trace_2010, trace_2011], layout=layout)
plotly.offline.iplot(fig)