## 0. Setup

In [7]:
#standard packages
import pandas as pd
import numpy as np
#import warnings
import warnings
warnings.filterwarnings('ignore')
import os
import collections

#time packages
import datetime as dt


In [8]:
from setup import main as setup
setup(env='vscode', project='cc_cid_calculator')

Project directory: /Users/luiz.superti/Documents/GitHub/alice-coding/cc_cid_calculator


In [9]:
from src.model import StaggDiD

***

## 1. extract

In [10]:
df = pd.read_csv('src/data/cid_care_cost_did_calc_df.csv')
df.columns

Index(['member_internal_code', 'calendar_date', 'mth_order', 'total_n_mth',
       'current_sex', 'current_gender', 'member_age', 'current_city',
       'current_state', 'is_b2b', 'is_pitaya', 'diseases_sev', 'tds_cost',
       'internacao_proced_cost', 'exam_cost', 'consulta_cost', 'er_cost',
       'alice_therapy_cost', 'other_costs', 'total_care_cost'],
      dtype='object')

In [11]:
df['diseases_'] = df['diseases_sev'].str.split(';')
df2 = df.explode('diseases_')
df2['cid_'] = df2['diseases_'].apply(lambda x: str(x).strip()[0:3])
df3 = df2[['member_internal_code','calendar_date', 'cid_']].drop_duplicates()

In [12]:
df3

Unnamed: 0,member_internal_code,calendar_date,cid_
0,NC1000Z,2022-05-31,
1,NC1000Z,2022-06-30,
2,NC1000Z,2022-07-31,
3,NC1002P,2021-12-31,
4,NC1002P,2022-01-31,
...,...,...,...
114409,NC1ZZWN,2022-05-31,R74
114410,NC1ZZWN,2022-06-30,R74
114411,NC1ZZWN,2022-07-31,R74
114412,NC1ZZWN,2022-08-31,N17


In [13]:
df_row = pd.get_dummies(df3['cid_'], prefix = 'cid')
df_row_2 = df3[['member_internal_code', 'calendar_date']].merge(df_row, how = 'inner', left_index=True, right_index=True)
df_row_3 = df_row_2.groupby(['member_internal_code', 'calendar_date']).max().reset_index()
df_row_3.sort_values(by=['member_internal_code', 'calendar_date'], inplace=True)
#create a cid filter
df_row_3.drop(columns = 'cid_nan', inplace = True)
cids_cols = [col for col in df_row_3 if col.startswith('cid_')]
df_row_3[cids_cols] = df_row_3[cids_cols].apply(pd.to_numeric).fillna(0)

***

In [14]:
df_filled = df.merge(
                    pd.get_dummies(df.current_state, prefix = 'uf'),how = 'inner', left_index=True, right_index=True).merge(
                        pd.get_dummies(df.current_city, prefix = 'city'),how = 'inner', left_index=True, right_index=True).drop(columns=['current_state', 'current_city'])
df_filled['is_male'] = np.where(df_filled['current_sex']=='MALE',1,0)
df_filled['is_trans'] = np.where(df_filled['current_sex']!=df_filled['current_gender'],1,0)
#create the base-dummies
df_filled.drop(columns=['current_gender','current_sex','city_São Paulo', 'uf_SP',
                        'diseases_sev','tds_cost', 'exam_cost', 'consulta_cost',
                        'internacao_proced_cost', 'er_cost',
                        'alice_therapy_cost', 'other_costs'], inplace=True)
df_filled.drop(columns=['diseases_'], inplace=True)

base_covariates_list = df_filled.drop(columns = ['member_internal_code', 'calendar_date', 'mth_order','total_n_mth', 'total_care_cost']).columns.to_list()
core_covariates = ['member_age', 'is_pitaya', 'is_male']
y = ['total_care_cost']

### Merge bases

In [15]:
ml_df = df_filled.merge(df_row_3, how = 'inner', left_on=['member_internal_code', 'calendar_date'], right_on = ['member_internal_code', 'calendar_date'])

#adjust variables
ml_df['n_total_cids'] = ml_df[cids_cols].sum(axis=1)
ml_df['ref_mth'] = (pd.to_datetime(ml_df['calendar_date']).dt.to_period('M') -   pd.to_datetime(['1970-01-31']).to_period('M')).apply(lambda x: x.n)

#entry-calendar date
ml_df['v1'] = np.where(ml_df['mth_order']==1, ml_df['ref_mth'], np.nan)
ml_df = ml_df.merge(ml_df.groupby(['member_internal_code'])['v1'].min().rename('ref_mth_entrance_group'), how = 'inner', left_on = 'member_internal_code', right_index = True)
ml_df['ref_mth_entrance_group'] = ml_df['ref_mth_entrance_group'].astype('Int64')
ml_df.drop(columns = ['v1'], inplace = True)


In [16]:
ml_df_calendar_base = ml_df[['member_internal_code','calendar_date','mth_order','ref_mth', 'ref_mth_entrance_group']]

***

In [17]:
accountable_cids = ml_df.groupby('member_internal_code')[cids_cols].min().drop_duplicates().sum(axis=0).sort_values(ascending=False)
accountable_cids = accountable_cids[accountable_cids>=10].index.to_list()

In [None]:
entry_list_cids = []
diagnosed_list  = []

for cid in accountable_cids:
    print('current cid:', cid)
    #cid_list not including the cid
    cid_list = [c for c in cids_cols if c!=cid]

    temp_df = pd.concat([ml_df_calendar_base, ml_df[cid]], axis = 1)
    temp_df.sort_values(by = ['member_internal_code', 'calendar_date'], inplace = True)
    #
    lag_name = cid + '_lag_1'
    temp_df = pd.concat([temp_df,temp_df.groupby(['member_internal_code'])[cid].shift(1).rename(lag_name)], axis = 1)
    temp_df.sort_values(by = ['member_internal_code', 'calendar_date'], inplace = True)
    #
    entry_name = 'entry_' + cid
    diag_name = 'diag_' + cid
    #
    temp_df['v1'] = np.where((temp_df[cid]==1) & (temp_df['mth_order']==1), 1, 0)
    temp_df['ind_diag_mth'] = np.where((temp_df[cid]==1) & (temp_df[lag_name]!=1) & (temp_df['mth_order']>=2), 1, 0)
    temp_df = temp_df.merge(temp_df.groupby(['member_internal_code'])['v1'].max().rename(entry_name), how = 'inner', left_on = 'member_internal_code', right_index = True)
    temp_df = temp_df.merge(temp_df.groupby(['member_internal_code'])['ind_diag_mth'].max().rename(diag_name), how = 'inner', left_on = 'member_internal_code', right_index = True)
    temp_df.drop(columns = ['v1',lag_name], inplace = True)

    if temp_df[entry_name].mean()>0:
        
        #DF to estimate variables
        #treat_group_mths = temp_df[temp_df[diag_name]==1]['ref_mth_entrance_group'].unique()
        #control_group = temp_df[(temp_df['ref_mth_entrance_group'].isin(treat_group_mths)) & (temp_df[diag_name]==0) & (temp_df[diag_name]==0)]
        #total_df = pd.concat([temp_df[temp_df[diag_name]==1], control_group], axis = 0)
        #df = ml_df[[base_covariates_list + cid_list + y + ['mth_order', 'ref_mth_entrance_group']].merge(total_df[[diag_name] + ['diag_mth']], how = 'inner', left_index = True, right_index = True)

        entry_list_cids.append(entry_name)

    if temp_df[diag_name].mean()>0:
        diagnosed_list.append(diag_name)

 
    

#### Test for  entry

In [19]:
treat_group_mths = temp_df[temp_df[diag_name]==1]['ref_mth_entrance_group'].unique()
control_group = temp_df[(temp_df['ref_mth_entrance_group'].isin(treat_group_mths)) & (temp_df[diag_name]==0) & (temp_df[diag_name]==0)]
total_df = pd.concat([temp_df[temp_df[diag_name]==1], control_group], axis = 0)
df = ml_df[base_covariates_list + cid_list + y + ['member_internal_code','mth_order', 'ref_mth_entrance_group']].merge(total_df[[diag_name] + ['ind_diag_mth']], how = 'inner', left_index = True, right_index = True)
df = df.merge(df[df['ind_diag_mth']==1].groupby('member_internal_code')['mth_order'].max().rename('diag_mth_order'), how = 'left', left_on = 'member_internal_code', right_index = True)

In [20]:
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

logit = LogisticRegression()

# define pipeline
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [21]:
X = df[df['mth_order']==1][base_covariates_list + cid_list]
y = df[df['mth_order']==1]['diag_cid_X06']

# transform the dataset
X, y = pipeline.fit_resample(X, y)

clf = logit.fit(X,y)

pre_treat = clf.predict(df[df['mth_order']==1][base_covariates_list + cid_list])
df.loc[df['mth_order']==1, 'pre_treat'] = pre_treat

In [22]:
df = df.merge(df.groupby(['member_internal_code'])['pre_treat'].max().rename('pre_treat_max'), how = 'inner', left_on = 'member_internal_code', right_index = True)

In [23]:
df['treat'] = np.where(df['diag_cid_X06']==1, 1, 0)
df.loc[(df['treat']==0) & (df['pre_treat_max']==0), 'treat'] = np.nan

month_list = df[df['diag_cid_X06']==1]['ref_mth_entrance_group'].unique()
ml_df = df[df['ref_mth_entrance_group'].isin(month_list) & (df['treat'].notna())][['member_internal_code','mth_order','ref_mth_entrance_group','ind_diag_mth','diag_mth_order','treat', 'total_care_cost'] + core_covariates]

***

In [24]:
ml_df

Unnamed: 0,member_internal_code,mth_order,ref_mth_entrance_group,ind_diag_mth,diag_mth_order,treat,total_care_cost,member_age,is_pitaya,is_male
98,NC1015C,1,615,0,,0.0,0.000000,31.0,0,0
99,NC1015C,2,615,0,,0.0,755.871193,31.0,0,0
100,NC1015C,3,615,0,,0.0,1208.838007,31.0,0,0
101,NC1015C,4,615,0,,0.0,701.092030,31.0,0,0
102,NC1015C,5,615,0,,0.0,20708.225841,31.0,0,0
...,...,...,...,...,...,...,...,...,...,...
114310,NC1ZYWB,6,624,0,,0.0,1954.909917,37.0,0,0
114311,NC1ZYWB,7,624,0,,0.0,1635.984687,37.0,0,0
114312,NC1ZYWB,8,624,0,,0.0,1021.145917,37.0,0,0
114313,NC1ZYY3,1,626,0,,0.0,982.902339,25.0,0,0


In [25]:
df = pd.DataFrame()
for pr_mth in ml_df['ref_mth_entrance_group'].unique():
    diag_months = ml_df[(ml_df['ind_diag_mth']==1) & (ml_df['ref_mth_entrance_group']==pr_mth)]['diag_mth_order'].unique()
    print(pr_mth, diag_months)
    for mth in diag_months:
        treat_df = ml_df[(ml_df['ref_mth_entrance_group']==pr_mth) & (ml_df['treat']==1) & (ml_df['diag_mth_order']==mth)]
        treat_df['ev_an_mth'] = treat_df['mth_order'] - mth
        control_df = ml_df[(ml_df['ref_mth_entrance_group']==pr_mth) & (ml_df['treat']==0)]
        control_df['ev_an_mth'] = control_df['mth_order'] - mth

        df = pd.concat([df,treat_df, control_df], axis = 0)


615 [13.]
617 [10.  4.]
626 [3.]
624 [5.]
620 [ 3. 10.]
622 [2.]
628 [4. 2. 3.]
625 [4.]
621 [2.]
627 [4.]
623 [5. 6.]


In [26]:
final_ml_df = df.sort_values(['member_internal_code','ev_an_mth'])
final_ml_df.drop(columns = ['mth_order', 'diag_mth_order', 'ind_diag_mth'], inplace = True)

In [27]:
final_ml_df.groupby(['treat', 'ev_an_mth'])['total_care_cost'].mean()

treat  ev_an_mth
0.0    -12.0         293.838886
       -11.0         854.511592
       -10.0         806.327718
       -9.0          549.331923
       -8.0         1152.482129
       -7.0         1025.497922
       -6.0         1097.968562
       -5.0          728.235091
       -4.0          661.307316
       -3.0          656.513795
       -2.0          757.525778
       -1.0          858.149970
        0.0          887.517453
        1.0          889.319819
        2.0          857.456351
        3.0          891.810462
        4.0          986.719749
        5.0          976.078197
        6.0         1202.362195
        7.0          928.995572
        8.0          931.675598
        9.0          891.615147
        10.0        1249.719932
        11.0        1297.973187
1.0    -12.0         420.417922
       -11.0         688.006881
       -10.0        1525.963141
       -9.0          350.125671
       -8.0         1766.932417
       -7.0         1178.170393
       -6.0         112

***

In [29]:
dd = StaggDiD

In [32]:

dd.att(final_ml_df, 'treat', 'total_care_cost', 'ev_an_mth', -3, 6)

Unnamed: 0,time,lower_bound,mean,upper_bound
0,-3,,,
1,-2,,,
2,-1,0.0,0.0,0.0
3,0,,,
4,1,,,
5,2,,,
6,3,,,
7,4,,,
8,5,,,


In [39]:
final_ml_df[(final_ml_df['ev_an_mth'] == -3) & (final_ml_df['treat'] == 1)][['member_internal_code','total_care_cost']].merge(final_ml_df[(final_ml_df['ev_an_mth'] == -3) & (final_ml_df['treat'] == 0)][['member_internal_code','total_care_cost']], how = 'inner', on = 'member_internal_code')

Unnamed: 0,member_internal_code,total_care_cost
7902,NC12HN0,1633.92366
27789,NC18S56,4.5
41735,NC1D44R,0.0
64021,NC1K6ZY,920.527713
65431,NC1KMNQ,50.449324
70842,NC1M7ON,1000.020072
71274,NC1MBXV,349.0
89557,NC1S67S,2443.989252
91303,NC1SP6D,959.661502
92187,NC1SZB0,1471.867987


In [36]:
final_ml_df[(final_ml_df['ev_an_mth'] == -1) & (final_ml_df['treat'] == 1)]['total_care_cost']

2122        306.669977
7904        364.041400
20134       837.458153
27791       615.420390
36143       133.642065
41737       243.318064
64023        26.180000
65433      1683.925345
70558         0.000000
70844        18.443681
71276       270.436721
71775       119.610244
89559      1085.796427
91305      1226.726858
92189       900.523210
92568       908.382267
94230        82.853015
98662       154.986667
101909     8215.388300
103013    93272.341003
107944      988.417183
Name: total_care_cost, dtype: float64

In [35]:
final_ml_df[(final_ml_df['ev_an_mth'] == -3) & (final_ml_df['treat'] == 1)]['total_care_cost'] - final_ml_df[(final_ml_df['ev_an_mth'] == -1) & (final_ml_df['treat'] == 1)]['total_care_cost']

2122     NaN
7902     NaN
7904     NaN
20134    NaN
27789    NaN
27791    NaN
36143    NaN
41735    NaN
41737    NaN
64021    NaN
64023    NaN
65431    NaN
65433    NaN
70558    NaN
70842    NaN
70844    NaN
71274    NaN
71276    NaN
71775    NaN
89557    NaN
89559    NaN
91303    NaN
91305    NaN
92187    NaN
92189    NaN
92566    NaN
92568    NaN
94230    NaN
98660    NaN
98662    NaN
101907   NaN
101909   NaN
103011   NaN
103013   NaN
107944   NaN
Name: total_care_cost, dtype: float64

In [None]:
temp_mean_treat = np.mean(final_ml_df[(final_ml_df[time] == -3) & (final_ml_df['treat'] == 1)]['total_care_cost'] - final_ml_df[(df[time] == -1) & (df[treatment] == 1)][outcome]);
len_treat = len(df[(df[time] == t) & (df[treatment] == 1)][outcome] - df[(df[time] == -1) & (df[treatment] == 1)][outcome]);
            temp_var_treat = np.var(df[(df[time] == t) & (df[treatment] == 1)][outcome] - df[(df[time] == -1) & (df[treatment] == 1)][outcome]);
            temp_mean_control = np.mean(df[(df[time] == t) & (df[treatment] == 0)][outcome] - df[(df[time] == -1) & (df[treatment] == 0)][outcome]);
            len_control = len(df[(df[time] == t) & (df[treatment] == 1)][outcome] - df[(df[time] == -1) & (df[treatment] == 1)][outcome]);
            temp_var_control = np.var(df[(df[time] == t) & (df[treatment] == 0)][outcome] - df[(df[time] == -1) & (df[treatment] == 0)][outcome]);   
            
            temp_att_dist = [(temp_var_treat - temp_mean_control), (temp_var_treat/len_treat + temp_var_control/len_control)];

In [None]:
ml_df.to_csv('src/data/panel_cc_cids_did_df.csv', index = False)