## 0. Setup

In [1]:
#standard packages
import pandas as pd
import numpy as np
#import warnings
import warnings
warnings.filterwarnings('ignore')
import os
import collections

#time packages
import datetime as dt


In [2]:
from setup import main as setup
setup(env='vscode', project='cc_cid_calculator')

Project directory: /Users/luiz.superti/Documents/GitHub/alice-coding/cc_cid_calculator


In [3]:
from src.model import StaggDiD

***

## 1. extract

In [4]:
df = pd.read_csv('src/data/cid_care_cost_did_calc_df.csv')
df.columns

Index(['member_internal_code', 'calendar_date', 'mth_order', 'total_n_mth',
       'current_sex', 'current_gender', 'member_age', 'current_city',
       'current_state', 'is_b2b', 'is_pitaya', 'diseases_sev', 'tds_cost',
       'internacao_proced_cost', 'exam_cost', 'consulta_cost', 'er_cost',
       'alice_therapy_cost', 'other_costs', 'total_care_cost'],
      dtype='object')

In [5]:
df['diseases_'] = df['diseases_sev'].str.split(';')
df2 = df.explode('diseases_')
df2['cid_'] = df2['diseases_'].apply(lambda x: str(x).strip()[0:3])
df3 = df2[['member_internal_code','calendar_date', 'cid_']].drop_duplicates()

In [6]:
df3

Unnamed: 0,member_internal_code,calendar_date,cid_
0,NC1000Z,2022-05-31,
1,NC1000Z,2022-06-30,
2,NC1000Z,2022-07-31,
3,NC1002P,2021-12-31,
4,NC1002P,2022-01-31,
...,...,...,...
114409,NC1ZZWN,2022-05-31,R74
114410,NC1ZZWN,2022-06-30,R74
114411,NC1ZZWN,2022-07-31,R74
114412,NC1ZZWN,2022-08-31,N17


In [7]:
df_row = pd.get_dummies(df3['cid_'], prefix = 'cid')
df_row_2 = df3[['member_internal_code', 'calendar_date']].merge(df_row, how = 'inner', left_index=True, right_index=True)
df_row_3 = df_row_2.groupby(['member_internal_code', 'calendar_date']).max().reset_index()
df_row_3.sort_values(by=['member_internal_code', 'calendar_date'], inplace=True)
#create a cid filter
df_row_3.drop(columns = 'cid_nan', inplace = True)
cids_cols = [col for col in df_row_3 if col.startswith('cid_')]
df_row_3[cids_cols] = df_row_3[cids_cols].apply(pd.to_numeric).fillna(0)

***

In [8]:
df_filled = df.merge(
                    pd.get_dummies(df.current_state, prefix = 'uf'),how = 'inner', left_index=True, right_index=True).merge(
                        pd.get_dummies(df.current_city, prefix = 'city'),how = 'inner', left_index=True, right_index=True).drop(columns=['current_state', 'current_city'])
df_filled['is_male'] = np.where(df_filled['current_sex']=='MALE',1,0)
df_filled['is_trans'] = np.where(df_filled['current_sex']!=df_filled['current_gender'],1,0)
#create the base-dummies
df_filled.drop(columns=['current_gender','current_sex','city_São Paulo', 'uf_SP',
                        'diseases_sev','tds_cost', 'exam_cost', 'consulta_cost',
                        'internacao_proced_cost', 'er_cost',
                        'alice_therapy_cost', 'other_costs'], inplace=True)
df_filled.drop(columns=['diseases_'], inplace=True)

base_covariates_list = df_filled.drop(columns = ['member_internal_code', 'calendar_date', 'mth_order','total_n_mth', 'total_care_cost']).columns.to_list()
core_covariates = ['member_age', 'is_pitaya', 'is_male']
#y = ['total_care_cost']

### Merge bases

In [9]:
ml_df = df_filled.merge(df_row_3, how = 'inner', left_on=['member_internal_code', 'calendar_date'], right_on = ['member_internal_code', 'calendar_date'])

#adjust variables
ml_df['n_total_cids'] = ml_df[cids_cols].sum(axis=1)
ml_df['ref_mth'] = (pd.to_datetime(ml_df['calendar_date']).dt.to_period('M') -   pd.to_datetime(['1970-01-31']).to_period('M')).apply(lambda x: x.n)

#entry-calendar date
ml_df['v1'] = np.where(ml_df['mth_order']==1, ml_df['ref_mth'], np.nan)
ml_df = ml_df.merge(ml_df.groupby(['member_internal_code'])['v1'].min().rename('ref_mth_entrance_group'), how = 'inner', left_on = 'member_internal_code', right_index = True)
ml_df['ref_mth_entrance_group'] = ml_df['ref_mth_entrance_group'].astype('Int64')
ml_df.drop(columns = ['v1'], inplace = True)
ml_df.head(10)

Unnamed: 0,member_internal_code,calendar_date,mth_order,total_n_mth,member_age,is_b2b,is_pitaya,total_care_cost,uf_AC,uf_AL,...,cid_Z91,cid_Z92,cid_Z93,cid_Z95,cid_Z96,cid_Z97,cid_Z98,n_total_cids,ref_mth,ref_mth_entrance_group
0,NC1000Z,2022-05-31,1,3,33.0,0,0,0.0,0,0,...,0,0,0,0,0,0,0,0,628,628
1,NC1000Z,2022-06-30,2,3,33.0,0,0,0.0,0,0,...,0,0,0,0,0,0,0,0,629,628
2,NC1000Z,2022-07-31,3,3,34.0,0,0,16.01912,0,0,...,0,0,0,0,0,0,0,0,630,628
3,NC1002P,2021-12-31,1,9,26.0,1,0,0.0,0,0,...,0,0,0,0,0,0,0,0,623,623
4,NC1002P,2022-01-31,2,9,26.0,1,0,107.373738,0,0,...,0,0,0,0,0,0,0,0,624,623
5,NC1002P,2022-02-28,3,9,26.0,1,0,0.0,0,0,...,0,0,0,0,0,0,0,0,625,623
6,NC1002P,2022-03-31,4,9,26.0,1,0,18.990966,0,0,...,0,0,0,0,0,0,0,0,626,623
7,NC1002P,2022-04-30,5,9,26.0,1,0,0.0,0,0,...,0,0,0,0,0,0,0,0,627,623
8,NC1002P,2022-05-31,6,9,26.0,1,0,677.768498,0,0,...,0,0,0,0,0,0,0,1,628,623
9,NC1002P,2022-06-30,7,9,26.0,1,0,140.079314,0,0,...,0,0,0,0,0,0,0,2,629,623


In [10]:
ml_df_calendar_base = ml_df[['member_internal_code','calendar_date','mth_order','ref_mth', 'ref_mth_entrance_group']]

***

In [11]:
accountable_cids = ml_df.groupby('member_internal_code')[cids_cols].min().drop_duplicates().sum(axis=0).sort_values(ascending=False)
accountable_cids = accountable_cids[accountable_cids>=10].index.to_list()

In [12]:
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

logit = LogisticRegression()

# define pipeline 1
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)


# define pipeline 1
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)


dd = StaggDiD

In [14]:
entry_list_cids = []
diagnosed_list  = []
results_df = pd.DataFrame()
for cid in accountable_cids:
    #cid_list not including the cid
    cid_list = [c for c in cids_cols if c!=cid]

    temp_df = pd.concat([ml_df_calendar_base, ml_df[cid]], axis = 1)
    temp_df.sort_values(by = ['member_internal_code', 'calendar_date'], inplace = True)
    #
    lag_name = cid + '_lag_1'
    temp_df = pd.concat([temp_df,temp_df.groupby(['member_internal_code'])[cid].shift(1).rename(lag_name)], axis = 1)
    temp_df.sort_values(by = ['member_internal_code', 'calendar_date'], inplace = True)
    #
    entry_name = 'entry_' + cid
    diag_name = 'diag_' + cid
    #
    temp_df['v1'] = np.where((temp_df[cid]==1) & (temp_df['mth_order']==1), 1, 0)
    temp_df['ind_diag_mth'] = np.where((temp_df[cid]==1) & (temp_df[lag_name]!=1) & (temp_df['mth_order']>=2), 1, 0)
    temp_df = temp_df.merge(temp_df.groupby(['member_internal_code'])['v1'].max().rename(entry_name), how = 'inner', left_on = 'member_internal_code', right_index = True)
    temp_df = temp_df.merge(temp_df.groupby(['member_internal_code'])['ind_diag_mth'].max().rename(diag_name), how = 'inner', left_on = 'member_internal_code', right_index = True)
    temp_df.drop(columns = ['v1',lag_name], inplace = True)

    if temp_df[entry_name].mean()>0:
        print('current cid:', entry_name)
        #DF to estimate variables
        print('dataframe creation pt1')

        treat_group_mths = temp_df[temp_df[entry_name]==1]['ref_mth_entrance_group'].unique()
        control_group = temp_df[(temp_df['ref_mth_entrance_group'].isin(treat_group_mths)) & (temp_df[entry_name]==0) & (temp_df[entry_name]==0)]
        total_df = pd.concat([temp_df[temp_df[entry_name]==1], control_group], axis = 0)
        df = ml_df[base_covariates_list + cid_list + ['total_care_cost'] + ['member_internal_code','mth_order', 'ref_mth_entrance_group']].merge(total_df[[entry_name] + ['ind_diag_mth']], how = 'inner', left_index = True, right_index = True)
        df = df.merge(df[df['ind_diag_mth']==1].groupby('member_internal_code')['mth_order'].max().rename('diag_mth_order'), how = 'left', left_on = 'member_internal_code', right_index = True)

        X = df[df['mth_order']==1][base_covariates_list + cid_list]
        y = df[df['mth_order']==1][entry_name]

        print('Propensity score matching w/ SMOTE and resampling')
        X, y = pipeline.fit_resample(X, y)

        clf = logit.fit(X,y)
        pre_treat = clf.predict(df[df['mth_order']==1][base_covariates_list + cid_list])

        print('dataframe creation pt2')
        df.loc[df['mth_order']==1, 'pre_treat'] = pre_treat
        df = df.merge(df.groupby(['member_internal_code'])['pre_treat'].max().rename('pre_treat_max'), how = 'inner', left_on = 'member_internal_code', right_index = True)
        df['treat'] = np.where(df[entry_name]==1, 1, 0)
        df.loc[(df['treat']==0) & (df['pre_treat_max']==0), 'treat'] = np.nan

        month_list = df[df[entry_name]==1]['ref_mth_entrance_group'].unique()
        ml_df = df[df['ref_mth_entrance_group'].isin(month_list) & (df['treat'].notna())][['member_internal_code','mth_order','ref_mth_entrance_group','ind_diag_mth','diag_mth_order','treat', 'total_care_cost'] + core_covariates]
        print('dataframe creation pt3: event analysis')
        df = pd.DataFrame()
        for pr_mth in ml_df['ref_mth_entrance_group'].unique():
            diag_months = ml_df[(ml_df['ind_diag_mth']==1) & (ml_df['ref_mth_entrance_group']==pr_mth)]['diag_mth_order'].unique()
            print(pr_mth, diag_months)
            for mth in diag_months:
                treat_df = ml_df[(ml_df['ref_mth_entrance_group']==pr_mth) & (ml_df['treat']==1) & (ml_df['diag_mth_order']==mth)]
                treat_df['ev_an_mth'] = treat_df['mth_order'] - mth
                treat_df = treat_df.merge(treat_df.groupby('member_internal_code')['mth_order'].min().rename('min_mth_order'),how = 'inner', left_on='member_internal_code', right_index = True)
                treat_df = treat_df.merge(treat_df.groupby('member_internal_code')['ev_an_mth'].min().rename('min_ev_an_mth'),how = 'inner', left_on='member_internal_code', right_index = True)
                treat_df['id_group'] = treat_df['member_internal_code'].astype(str) + '_' + treat_df['min_ev_an_mth'].astype(str) + '_' + treat_df['min_mth_order'].astype(str)        
                treat_df.drop(columns = ['min_mth_order', 'min_ev_an_mth'], inplace = True)
                #
                control_df = ml_df[(ml_df['ref_mth_entrance_group']==pr_mth) & (ml_df['treat']==0)]
                control_df['ev_an_mth'] = control_df['mth_order'] - mth
                control_df = control_df.merge(control_df.groupby('member_internal_code')['mth_order'].min().rename('min_mth_order'),how = 'inner', left_on='member_internal_code', right_index = True)
                control_df = control_df.merge(control_df.groupby('member_internal_code')['ev_an_mth'].min().rename('min_ev_an_mth'),how = 'inner', left_on='member_internal_code', right_index = True)
                control_df['id_group'] = control_df['member_internal_code'].astype(str) + '_' + control_df['min_ev_an_mth'].astype(str) + '_' + control_df['min_mth_order'].astype(str)
                control_df.drop(columns = ['min_mth_order', 'min_ev_an_mth'], inplace = True)

                df = pd.concat([df,treat_df, control_df], axis = 0)

        final_ml_df = df.sort_values(['id_group','ev_an_mth'])
        final_ml_df.drop(columns = ['mth_order', 'diag_mth_order', 'ind_diag_mth', 'member_internal_code'], inplace = True)    
        print('att_calculus')
        att_df = dd.att(final_ml_df, 'id_group', 'treat', 'total_care_cost', 'ev_an_mth', -3, 7)

        att_df['cid'] = cid
        att_df['analysis_type'] = 'at_entry' 
        results_df = pd.concat([results_df, att_df], axis = 0)
        entry_list_cids.append(entry_name)

    if temp_df[diag_name].mean()>0:
        print('current cid:', diag_name)
        diagnosed_list.append(diag_name)

    

current cid: entry_cid_H52
dataframe creation pt1
Propensity score matching w/ SMOTE and resampling


ValueError: The specified ratio required to remove samples from the minority class while trying to generate new samples. Please increase the ratio.

In [16]:
y.mean()

0.11139401654996817

#### Test for  entry

In [None]:
treat_group_mths = temp_df[temp_df[diag_name]==1]['ref_mth_entrance_group'].unique()
control_group = temp_df[(temp_df['ref_mth_entrance_group'].isin(treat_group_mths)) & (temp_df[diag_name]==0) & (temp_df[diag_name]==0)]
total_df = pd.concat([temp_df[temp_df[diag_name]==1], control_group], axis = 0)
df = ml_df[base_covariates_list + cid_list + y + ['member_internal_code','mth_order', 'ref_mth_entrance_group']].merge(total_df[[diag_name] + ['ind_diag_mth']], how = 'inner', left_index = True, right_index = True)
df = df.merge(df[df['ind_diag_mth']==1].groupby('member_internal_code')['mth_order'].max().rename('diag_mth_order'), how = 'left', left_on = 'member_internal_code', right_index = True)

***

In [None]:
ml_df

In [None]:
final_ml_df.head(20)

***

In [None]:
dd = StaggDiD

In [None]:
final_ml_df[(final_ml_df['ev_an_mth'] == -1) & (final_ml_df['treat'] == 1)][['member_internal_code','total_care_cost']]

In [None]:
final_ml_df[(final_ml_df['ev_an_mth'] == -3) & (final_ml_df['treat'] == 1)][['member_internal_code','total_care_cost']]

In [None]:
a = final_ml_df[(final_ml_df['ev_an_mth'] == -1) & (final_ml_df['treat'] == 0)][['member_internal_code','total_care_cost']].merge(final_ml_df[(final_ml_df['ev_an_mth'] == -1) & (final_ml_df['treat'] == 1)][['member_internal_code','total_care_cost']], how = 'inner', on = 'member_internal_code')


In [None]:
final_ml_df.drop_duplicates()

In [None]:
final_ml_df[(final_ml_df['ev_an_mth'] == -1) & (final_ml_df['treat'] == 0)]

In [None]:
final_ml_df[(final_ml_df['ev_an_mth'] == -1) & (final_ml_df['treat'] == 1)][['member_internal_code','total_care_cost']]

In [None]:
temp_mean_treat = np.mean(final_ml_df[(final_ml_df[time] == -3) & (final_ml_df['treat'] == 1)]['total_care_cost'] - final_ml_df[(df[time] == -1) & (df[treatment] == 1)][outcome]);
len_treat = len(df[(df[time] == t) & (df[treatment] == 1)][outcome] - df[(df[time] == -1) & (df[treatment] == 1)][outcome]);
            temp_var_treat = np.var(df[(df[time] == t) & (df[treatment] == 1)][outcome] - df[(df[time] == -1) & (df[treatment] == 1)][outcome]);
            temp_mean_control = np.mean(df[(df[time] == t) & (df[treatment] == 0)][outcome] - df[(df[time] == -1) & (df[treatment] == 0)][outcome]);
            len_control = len(df[(df[time] == t) & (df[treatment] == 1)][outcome] - df[(df[time] == -1) & (df[treatment] == 1)][outcome]);
            temp_var_control = np.var(df[(df[time] == t) & (df[treatment] == 0)][outcome] - df[(df[time] == -1) & (df[treatment] == 0)][outcome]);   
            
            temp_att_dist = [(temp_var_treat - temp_mean_control), (temp_var_treat/len_treat + temp_var_control/len_control)];

In [None]:
ml_df.to_csv('src/data/panel_cc_cids_did_df.csv', index = False)