In [None]:
'''This notebook is used to create a new training dataset for  regression and classification
of the synthetic CMS dataset. This dataset will treat each observation (row) corresponding to a beneficiary ID,
year, and claim ID combo as independent observations, while preserving aggregate information about the beneficiary 
in each year. Specifically, this notebook provides code to create a training dataset from the years 2015 - 2022 
inclusive, and a test set from the year 2023. This notebook will be used to create a function that can be used to 
customize which years are used to create the training and test sets. The final output of this notebook is a CSV
file that contains a dataset of the aggregated features, WITHOUT any encoding (one-hot or otherwise).'''

import numpy as np
import pandas as pd
from sklearn.linear_model import (LinearRegression
)
from sklearn.preprocessing import OneHotEncoder
from utils import (
    describe_dataframe, df_train_test, graph_results, reg_train_eval
)

In [2]:
main = pd.read_csv("../data/combo_data.csv")
main

Unnamed: 0,BENE_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,YR,LOS,DRG,PRNCPAL_DGNS_CD,PTNT_DSCHRG_STUS_CD,CLM_IP_ADMSN_TYPE_CD,...,NUM_DIAG,ICD_Description,DRG_TITLE,STATE_CODE,COUNTY_CD,SEX_IDENT_CD,BENE_RACE_CD,ESRD_IND,Age,TOT_RX_CST_AMT
0,-10000010254618,-10000930037831,2015-03-25,2015-03-25,2015,0,551,S134XX,1,1,...,9,Sprain of ligaments of cervical spine,MEDICAL BACK PROBLEMS WITH MCC,1,1500,1,1,0,16,275.19
1,-10000010254653,-10000930038030,2015-09-24,2015-09-24,2015,0,951,Z3480,1,1,...,4,Encounter for supervision of other normal pre...,OTHER FACTORS INFLUENCING HEALTH STATUS,1,1400,2,1,0,33,0.00
2,-10000010254653,-10000930038031,2017-05-09,2017-05-10,2017,1,923,T7432X,1,3,...,3,"Child psychological abuse, confirmed","OTHER INJURY, POISONING AND TOXIC EFFECT DIAGN...",1,1400,2,1,0,35,0.00
3,-10000010254656,-10000930038162,2017-01-14,2017-01-14,2017,0,564,S8290X,1,1,...,4,Unspecified fracture of unspecified lower leg,OTHER MUSCULOSKELETAL SYSTEM AND CONNECTIVE TI...,1,1360,2,1,0,18,302.04
4,-10000010254656,-10000930038163,2018-03-17,2018-03-17,2018,0,951,Z3480,1,1,...,6,Encounter for supervision of other normal pre...,OTHER FACTORS INFLUENCING HEALTH STATUS,1,1360,2,1,0,19,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20862,-10000010288008,-10000931485965,2020-01-08,2020-01-14,2020,6,0,T50901A,1,3,...,12,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,75,0.00
20863,-10000010288008,-10000931485967,2020-10-23,2020-10-23,2020,0,0,T50901A,1,1,...,12,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,75,0.00
20864,-10000010288008,-10000931485969,2021-08-22,2021-08-23,2021,1,0,T50901A,1,1,...,13,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,76,0.00
20865,-10000010288008,-10000931485971,2021-09-13,2021-09-16,2021,3,0,T50901A,1,2,...,13,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,76,0.00


In [None]:
# Get Unique ID claims for each year, for each beneficiary ID
# Use groupby and nunique
# Use the agg function to rename the column

aggregated = main.groupby(['BENE_ID', 'YR']).agg({'CLM_ID': 'nunique'}).reset_index()
aggregated.rename(columns={'CLM_ID': 'Unique_Claim_Count'}, inplace=True)
aggregated

Unnamed: 0,BENE_ID,YR,Unique_Claim_Count
0,-10000010288010,2017,1
1,-10000010288008,2017,2
2,-10000010288008,2018,2
3,-10000010288008,2020,2
4,-10000010288008,2021,2
...,...,...,...
11540,-10000010254656,2018,1
11541,-10000010254656,2022,1
11542,-10000010254653,2015,1
11543,-10000010254653,2017,1


In [None]:
# Merge the aggregated claim counts back into the main dataframe

main_with_claim_counts = main.merge(aggregated, on=['BENE_ID', 'YR'], how='left')
main_with_claim_counts

Unnamed: 0,BENE_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,YR,LOS,DRG,PRNCPAL_DGNS_CD,PTNT_DSCHRG_STUS_CD,CLM_IP_ADMSN_TYPE_CD,...,ICD_Description,DRG_TITLE,STATE_CODE,COUNTY_CD,SEX_IDENT_CD,BENE_RACE_CD,ESRD_IND,Age,TOT_RX_CST_AMT,Unique_Claim_Count
0,-10000010254618,-10000930037831,2015-03-25,2015-03-25,2015,0,551,S134XX,1,1,...,Sprain of ligaments of cervical spine,MEDICAL BACK PROBLEMS WITH MCC,1,1500,1,1,0,16,275.19,1
1,-10000010254653,-10000930038030,2015-09-24,2015-09-24,2015,0,951,Z3480,1,1,...,Encounter for supervision of other normal pre...,OTHER FACTORS INFLUENCING HEALTH STATUS,1,1400,2,1,0,33,0.00,1
2,-10000010254653,-10000930038031,2017-05-09,2017-05-10,2017,1,923,T7432X,1,3,...,"Child psychological abuse, confirmed","OTHER INJURY, POISONING AND TOXIC EFFECT DIAGN...",1,1400,2,1,0,35,0.00,1
3,-10000010254656,-10000930038162,2017-01-14,2017-01-14,2017,0,564,S8290X,1,1,...,Unspecified fracture of unspecified lower leg,OTHER MUSCULOSKELETAL SYSTEM AND CONNECTIVE TI...,1,1360,2,1,0,18,302.04,1
4,-10000010254656,-10000930038163,2018-03-17,2018-03-17,2018,0,951,Z3480,1,1,...,Encounter for supervision of other normal pre...,OTHER FACTORS INFLUENCING HEALTH STATUS,1,1360,2,1,0,19,0.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20862,-10000010288008,-10000931485965,2020-01-08,2020-01-14,2020,6,0,T50901A,1,3,...,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,75,0.00,2
20863,-10000010288008,-10000931485967,2020-10-23,2020-10-23,2020,0,0,T50901A,1,1,...,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,75,0.00,2
20864,-10000010288008,-10000931485969,2021-08-22,2021-08-23,2021,1,0,T50901A,1,1,...,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,76,0.00,2
20865,-10000010288008,-10000931485971,2021-09-13,2021-09-16,2021,3,0,T50901A,1,2,...,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,76,0.00,2


In [None]:
# Calculate the average number of claims per year for each beneficiary
average_claims_per_year = main_with_claim_counts.groupby('BENE_ID').agg({'Unique_Claim_Count': 'mean'}).reset_index()
average_claims_per_year.rename(columns={'Unique_Claim_Count': 'Average_Claim_Count'}, inplace=True)

# Merge the average claims per year back with the main dataset
main_with_avg_claims = main_with_claim_counts.merge(average_claims_per_year, on='BENE_ID', how='left')

# Compute the correlation between Average_Claim_Count and LOS
# Interestingly, the average claim count is negatively correlated with LOS
correlation_avg_claims = main_with_avg_claims[['Average_Claim_Count', 'LOS']].corr()
print(correlation_avg_claims)

                     Average_Claim_Count       LOS
Average_Claim_Count             1.000000 -0.130778
LOS                            -0.130778  1.000000


In [None]:
# Drop the Unique_Claim_Count column and observe the resulting dataframe

main_with_avg_claims.drop(columns=['Unique_Claim_Count'], inplace=True)
main_with_avg_claims

Unnamed: 0,BENE_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,YR,LOS,DRG,PRNCPAL_DGNS_CD,PTNT_DSCHRG_STUS_CD,CLM_IP_ADMSN_TYPE_CD,...,ICD_Description,DRG_TITLE,STATE_CODE,COUNTY_CD,SEX_IDENT_CD,BENE_RACE_CD,ESRD_IND,Age,TOT_RX_CST_AMT,Average_Claim_Count
0,-10000010254618,-10000930037831,2015-03-25,2015-03-25,2015,0,551,S134XX,1,1,...,Sprain of ligaments of cervical spine,MEDICAL BACK PROBLEMS WITH MCC,1,1500,1,1,0,16,275.19,1.0
1,-10000010254653,-10000930038030,2015-09-24,2015-09-24,2015,0,951,Z3480,1,1,...,Encounter for supervision of other normal pre...,OTHER FACTORS INFLUENCING HEALTH STATUS,1,1400,2,1,0,33,0.00,1.0
2,-10000010254653,-10000930038031,2017-05-09,2017-05-10,2017,1,923,T7432X,1,3,...,"Child psychological abuse, confirmed","OTHER INJURY, POISONING AND TOXIC EFFECT DIAGN...",1,1400,2,1,0,35,0.00,1.0
3,-10000010254656,-10000930038162,2017-01-14,2017-01-14,2017,0,564,S8290X,1,1,...,Unspecified fracture of unspecified lower leg,OTHER MUSCULOSKELETAL SYSTEM AND CONNECTIVE TI...,1,1360,2,1,0,18,302.04,1.0
4,-10000010254656,-10000930038163,2018-03-17,2018-03-17,2018,0,951,Z3480,1,1,...,Encounter for supervision of other normal pre...,OTHER FACTORS INFLUENCING HEALTH STATUS,1,1360,2,1,0,19,0.00,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20862,-10000010288008,-10000931485965,2020-01-08,2020-01-14,2020,6,0,T50901A,1,3,...,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,75,0.00,2.0
20863,-10000010288008,-10000931485967,2020-10-23,2020-10-23,2020,0,0,T50901A,1,1,...,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,75,0.00,2.0
20864,-10000010288008,-10000931485969,2021-08-22,2021-08-23,2021,1,0,T50901A,1,1,...,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,76,0.00,2.0
20865,-10000010288008,-10000931485971,2021-09-13,2021-09-16,2021,3,0,T50901A,1,2,...,"Poisoning by unspecified drugs, medicaments an...",,5,5430,1,5,0,76,0.00,2.0


In [None]:
# Calculate the average CLM_TOT_CHRG_AMT (claim total charge amount) per year per BENE_ID
avg_chrg_per_year = main_with_avg_claims.groupby(['BENE_ID', 'YR']).agg({'CLM_TOT_CHRG_AMT': 'mean'}).reset_index()
avg_chrg_per_year.rename(columns={'CLM_TOT_CHRG_AMT': 'Avg_Clm_Tot_Chrg_Amt'}, inplace=True)

# Merge the calculated average charges back into the main_with_avg_claims dataframe
main_with_avg_claims = main_with_avg_claims.merge(avg_chrg_per_year, on=['BENE_ID', 'YR'], how='left')
main_with_avg_claims

Unnamed: 0,BENE_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,YR,LOS,DRG,PRNCPAL_DGNS_CD,PTNT_DSCHRG_STUS_CD,CLM_IP_ADMSN_TYPE_CD,...,DRG_TITLE,STATE_CODE,COUNTY_CD,SEX_IDENT_CD,BENE_RACE_CD,ESRD_IND,Age,TOT_RX_CST_AMT,Average_Claim_Count,Avg_Clm_Tot_Chrg_Amt
0,-10000010254618,-10000930037831,2015-03-25,2015-03-25,2015,0,551,S134XX,1,1,...,MEDICAL BACK PROBLEMS WITH MCC,1,1500,1,1,0,16,275.19,1.0,96.65
1,-10000010254653,-10000930038030,2015-09-24,2015-09-24,2015,0,951,Z3480,1,1,...,OTHER FACTORS INFLUENCING HEALTH STATUS,1,1400,2,1,0,33,0.00,1.0,6311.88
2,-10000010254653,-10000930038031,2017-05-09,2017-05-10,2017,1,923,T7432X,1,3,...,"OTHER INJURY, POISONING AND TOXIC EFFECT DIAGN...",1,1400,2,1,0,35,0.00,1.0,8545.72
3,-10000010254656,-10000930038162,2017-01-14,2017-01-14,2017,0,564,S8290X,1,1,...,OTHER MUSCULOSKELETAL SYSTEM AND CONNECTIVE TI...,1,1360,2,1,0,18,302.04,1.0,1014.85
4,-10000010254656,-10000930038163,2018-03-17,2018-03-17,2018,0,951,Z3480,1,1,...,OTHER FACTORS INFLUENCING HEALTH STATUS,1,1360,2,1,0,19,0.00,1.0,9911.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20862,-10000010288008,-10000931485965,2020-01-08,2020-01-14,2020,6,0,T50901A,1,3,...,,5,5430,1,5,0,75,0.00,2.0,166.97
20863,-10000010288008,-10000931485967,2020-10-23,2020-10-23,2020,0,0,T50901A,1,1,...,,5,5430,1,5,0,75,0.00,2.0,166.97
20864,-10000010288008,-10000931485969,2021-08-22,2021-08-23,2021,1,0,T50901A,1,1,...,,5,5430,1,5,0,76,0.00,2.0,166.97
20865,-10000010288008,-10000931485971,2021-09-13,2021-09-16,2021,3,0,T50901A,1,2,...,,5,5430,1,5,0,76,0.00,2.0,166.97


In [None]:
# Select relevant columns for the final dataset

select_cols = ["LOS" ,
               "YR",  
               "PRNCPAL_DGNS_CD", 
               "CLM_IP_ADMSN_TYPE_CD", 
               "ER_flag", 
               "STATE_CODE", 
               "BENE_RACE_CD", 
               "ESRD_IND",
               "Age", 
               "TOT_RX_CST_AMT", 
               "NUM_DIAG",
               "SEX_IDENT_CD",
               "Average_Claim_Count",
               "Avg_Clm_Tot_Chrg_Amt"
               ]

workingdf = main_with_avg_claims[select_cols]

severe_codes = workingdf.loc[workingdf["LOS"] > 10, "PRNCPAL_DGNS_CD"].unique()
workingdf = workingdf.assign(SEV_FLAG=np.where(workingdf["PRNCPAL_DGNS_CD"].isin(severe_codes), 1, 0))
#Have to use severity flag (which could be determined in many ways) instead of PRCNPAL_DGNS_CD because PRNCP 
# as is can't be used with regression
#I also want to try out target encoding
workingdf = workingdf.assign(ESRD_IND = workingdf["ESRD_IND"].map({"Y": 1, "0" : 0}))

workingdf = workingdf.drop(columns=["PRNCPAL_DGNS_CD"])

describe_dataframe(workingdf)

Unnamed: 0,column_name,dtype,non_null_count,null_count,unique_count,sample_values,min,max,mean
0,LOS,int64,20867,0,45,"[0, 1, 10, 11, 2]",0.0,104.0,1.054919
1,YR,int64,20867,0,9,"[2015, 2017, 2018, 2022, 2021]",2015.0,2023.0,2019.100014
2,CLM_IP_ADMSN_TYPE_CD,int64,20867,0,3,"[1, 3, 2]",1.0,3.0,1.399195
3,ER_flag,int64,20867,0,2,"[0, 1]",0.0,1.0,0.048498
4,STATE_CODE,int64,20867,0,51,"[1, 2, 3, 4, 6]",1.0,53.0,24.974553
5,BENE_RACE_CD,int64,20867,0,6,"[1, 4, 2, 5, 3]",1.0,6.0,2.049456
6,ESRD_IND,int64,20867,0,2,"[0, 1]",0.0,1.0,0.163943
7,Age,int64,20867,0,112,"[16, 33, 35, 18, 19]",0.0,111.0,66.288015
8,TOT_RX_CST_AMT,float64,20867,0,7231,"[275.19, 0.0, 302.04, 444.23, 408.33]",0.0,197011.98,11439.025583
9,NUM_DIAG,int64,20867,0,56,"[9, 4, 3, 6, 8]",1.0,58.0,21.899554


In [None]:
#Need to divide up the data into training and testing sets by year
#I will use 2023 as the test set and the rest of the years as the training set
wdf_rest = workingdf[workingdf["YR"] != 2023]
wdf_2023 = workingdf[workingdf["YR"] == 2023]

#one hot encode state, race, admission type, sex, 
ohe = OneHotEncoder(sparse_output=False)

ohe.fit(wdf_rest[['CLM_IP_ADMSN_TYPE_CD', 
                  'STATE_CODE', 
                  'BENE_RACE_CD', 
                  "SEX_IDENT_CD"]])
ohe_df_rest = pd.DataFrame(data = ohe.transform(wdf_rest[['CLM_IP_ADMSN_TYPE_CD', 
                                                          'STATE_CODE', 
                                                          'BENE_RACE_CD',
                                                          "SEX_IDENT_CD"]]), 
             columns=ohe.get_feature_names_out(wdf_rest[['CLM_IP_ADMSN_TYPE_CD', 
                                                         'STATE_CODE', 
                                                         'BENE_RACE_CD',
                                                         "SEX_IDENT_CD"]].columns))

ohe.fit(wdf_2023[['CLM_IP_ADMSN_TYPE_CD', 
                  'STATE_CODE', 
                  'BENE_RACE_CD', 
                  "SEX_IDENT_CD"]])
ohe_df_2023 = pd.DataFrame(data = ohe.transform(wdf_2023[['CLM_IP_ADMSN_TYPE_CD', 
                                                          'STATE_CODE', 
                                                          'BENE_RACE_CD',
                                                          "SEX_IDENT_CD"]]), 
             columns=ohe.get_feature_names_out(wdf_2023[['CLM_IP_ADMSN_TYPE_CD', 
                                                         'STATE_CODE', 
                                                         'BENE_RACE_CD',
                                                         "SEX_IDENT_CD"]].columns))

#drop year, county code, all one hot encoded vars 
wdf_rest = wdf_rest.drop(columns=["YR", 
                                  'CLM_IP_ADMSN_TYPE_CD', 
                                  'STATE_CODE', 
                                  'BENE_RACE_CD',
                                  "SEX_IDENT_CD"])
wdf_2023 = wdf_2023.drop(columns=["YR", 
                                  'CLM_IP_ADMSN_TYPE_CD', 
                                  'STATE_CODE', 
                                  'BENE_RACE_CD',
                                  "SEX_IDENT_CD"])

wdf_rest = pd.concat([wdf_rest.reset_index(drop=True), ohe_df_rest.reset_index(drop=True)], axis=1)
wdf_2023 = pd.concat([wdf_2023.reset_index(drop=True), ohe_df_2023.reset_index(drop=True)], axis=1)

In [34]:
describe_dataframe(wdf_rest)

Unnamed: 0,column_name,dtype,non_null_count,null_count,unique_count,sample_values,min,max,mean
0,LOS,int64,20263,0,45,"[0, 1, 10, 11, 2]",0.0,104.00,1.056704
1,ER_flag,int64,20263,0,2,"[0, 1]",0.0,1.00,0.048216
2,ESRD_IND,int64,20263,0,2,"[0, 1]",0.0,1.00,0.163500
3,Age,int64,20263,0,111,"[16, 33, 35, 18, 19]",0.0,110.00,66.201599
4,TOT_RX_CST_AMT,float64,20263,0,7025,"[275.19, 0.0, 302.04, 444.23, 408.33]",0.0,197011.98,11715.990461
...,...,...,...,...,...,...,...,...,...
66,BENE_RACE_CD_4,float64,20263,0,2,"[0.0, 1.0]",0.0,1.00,0.034447
67,BENE_RACE_CD_5,float64,20263,0,2,"[0.0, 1.0]",0.0,1.00,0.183339
68,BENE_RACE_CD_6,float64,20263,0,2,"[0.0, 1.0]",0.0,1.00,0.011104
69,SEX_IDENT_CD_1,float64,20263,0,2,"[1.0, 0.0]",0.0,1.00,0.547106


In [35]:
main_with_avg_claims.to_csv("../data/ts2_data.csv", index = False)