# Create Patient Level File
# 03_create_patient_level_file

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 28/09/2025 | Adrienne | Created   | Created dataset for learning models | 
| |  |  | |

# Content

* [Introduction](#introduction)

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime
import json_lines
import seaborn as sns

In [35]:
# Read in the cleaned claim data.
# Other inputs that have been used here: 'claim.pkl' and 'claim_sample.pkl'.
path = "../data/clean/"
claim_file = 'claim_mini_sample.pkl'
claim_df = pd.read_pickle(path + claim_file)

- Need to limit the data to claims that have HCPCS codes, and then apply a minimum number of claims.
- Pick a percentile of the number of claims and use that as the limit.
- XXX: Perhaps create a single function for all of the filtering that is done, so that the limits can be changed quickly.


In [36]:
# Keep only the claims whose HCPCS list is not empty.
no_hcpcs = claim_df['hcpcs_ls'].map(lambda codes: len(codes) == 0)
claim_fil_df = claim_df.loc[~no_hcpcs]

## Create Columns

In [None]:
# Create a number of claims column: how many rows of claim_fil_df each
# patient_medicare_number has, repeated on every row of that patient.
#
# The counts are looked up with `map` instead of being merged in:
#   * `value_counts()` only names its index after the column from pandas 2.0
#     on, so merging `on='patient_medicare_number'` raises a KeyError on
#     older versions;
#   * `assign` overwrites `number_of_claims` when the cell is re-run, whereas
#     a repeated merge would add `number_of_claims_x` / `_y` columns.
claims_per_patient = claim_fil_df['patient_medicare_number'].value_counts()
claim_fil_df = (
    claim_fil_df
    .assign(number_of_claims=lambda df: df['patient_medicare_number'].map(claims_per_patient))
    # The merge this replaces returned a fresh 0..n-1 index; keep that.
    .reset_index(drop=True)
)

In [None]:
# Combined DRG column
def _format_drg(code):
    """Return `code` as a string left-padded with zeros to 3 characters.

    Whole-number floats are converted to int first, so 1.0 becomes '001'
    instead of '1.0' (a numeric column containing NaN is stored as float).
    NaN is returned as the string 'nan', which is the marker that is
    filtered out of the per-patient lists below.
    """
    if isinstance(code, (float, np.floating)) and float(code).is_integer():
        code = int(code)
    return '{0:0>3}'.format(code)


# Overwrite drg_code with its formatted string form. Formatting an already
# formatted value returns it unchanged, so this is safe to re-run.
claim_fil_df = claim_fil_df.assign(drg_code=claim_fil_df['drg_code'].apply(_format_drg))

# One list of DRG codes per patient, without the 'nan' markers. Patients with
# no DRG code at all get an empty list.
drg_by_patient = (
    claim_fil_df
    .groupby('patient_medicare_number')['drg_code']
    .apply(list)
    .map(lambda codes: [code for code in codes if code != 'nan'])
)

# Attach the list to every row of the patient. `assign` + `map` replaces
# drg_ls on a re-run, where a second merge would create drg_ls_x / drg_ls_y.
claim_fil_df = (
    claim_fil_df
    .assign(drg_ls=claim_fil_df['patient_medicare_number'].map(drg_by_patient))
    # The merge this replaces returned a fresh 0..n-1 index; keep that.
    .reset_index(drop=True)
)

In [71]:
# Combined HCPCS column: for each patient, the hcpcs_ls lists of all of their
# claims concatenated into one flat list (duplicates are kept).
hcpcs_by_patient = (
    claim_fil_df
    .groupby('patient_medicare_number')['hcpcs_ls']
    .apply(list)
    # Flatten the list of per-claim lists. The loop variables are named so
    # that they no longer shadow the builtin `list`.
    .map(lambda claim_lists: [code for codes in claim_lists for code in codes])
)

# Attach the list to every row of the patient. `assign` + `map` replaces the
# column on a re-run, where a second merge would create
# combined_hcpcs_ls_x / combined_hcpcs_ls_y.
claim_fil_df = (
    claim_fil_df
    .assign(combined_hcpcs_ls=claim_fil_df['patient_medicare_number'].map(hcpcs_by_patient))
    # The merge this replaces returned a fresh 0..n-1 index; keep that.
    .reset_index(drop=True)
)

In [70]:
# Combined ICD10 column: for each patient, the diagnosis_ls lists of all of
# their claims concatenated into one flat list (duplicates are kept).
diagnoses_by_patient = (
    claim_fil_df
    .groupby('patient_medicare_number')['diagnosis_ls']
    .apply(list)
    # Flatten the list of per-claim lists. The loop variables are named so
    # that they no longer shadow the builtin `list`.
    .map(lambda claim_lists: [code for codes in claim_lists for code in codes])
)

# Attach the list to every row of the patient. `assign` + `map` replaces the
# column on a re-run, where a second merge would create
# combined_diagnosis_ls_x / combined_diagnosis_ls_y.
claim_fil_df = (
    claim_fil_df
    .assign(combined_diagnosis_ls=claim_fil_df['patient_medicare_number'].map(diagnoses_by_patient))
    # The merge this replaces returned a fresh 0..n-1 index; keep that.
    .reset_index(drop=True)
)


In [86]:
# Combined unique_claim_ID
# Overwrite unique_claim_ID with its string form (left-padded with zeros to at
# least 3 characters). NaN becomes the string 'nan', which is filtered below.
claim_fil_df = claim_fil_df.assign(
    unique_claim_ID=claim_fil_df['unique_claim_ID'].apply('{0:0>3}'.format)
)

# One list of claim IDs per patient, without the 'nan' markers.
claim_ids_by_patient = (
    claim_fil_df
    .groupby('patient_medicare_number')['unique_claim_ID']
    .apply(list)
    .map(lambda claim_ids: [claim_id for claim_id in claim_ids if claim_id != 'nan'])
)

# Attach the list to every row of the patient. `assign` + `map` replaces the
# column on a re-run, where a second merge would create
# unique_claim_ls_x / unique_claim_ls_y.
claim_fil_df = (
    claim_fil_df
    .assign(unique_claim_ls=claim_fil_df['patient_medicare_number'].map(claim_ids_by_patient))
    # The merge this replaces returned a fresh 0..n-1 index; keep that.
    .reset_index(drop=True)
)

In [90]:
# Combined billablePeriod_start
# Overwrite billablePeriod_start with its string form (left-padded with zeros
# to at least 3 characters). NaN becomes the string 'nan', which is filtered
# below.
claim_fil_df = claim_fil_df.assign(
    billablePeriod_start=claim_fil_df['billablePeriod_start'].apply('{0:0>3}'.format)
)

# One list of billing-period start values per patient, without 'nan' markers.
starts_by_patient = (
    claim_fil_df
    .groupby('patient_medicare_number')['billablePeriod_start']
    .apply(list)
    .map(lambda starts: [start for start in starts if start != 'nan'])
)

# Attach the list to every row of the patient. `assign` + `map` replaces the
# column on a re-run, where a second merge would create
# billablePeriod_start_ls_x / billablePeriod_start_ls_y.
claim_fil_df = (
    claim_fil_df
    .assign(billablePeriod_start_ls=claim_fil_df['patient_medicare_number'].map(starts_by_patient))
    # The merge this replaces returned a fresh 0..n-1 index; keep that.
    .reset_index(drop=True)
)

In [88]:
# Combined billablePeriod_end
# Overwrite billablePeriod_end with its string form (left-padded with zeros to
# at least 3 characters). NaN becomes the string 'nan', which is filtered
# below.
claim_fil_df = claim_fil_df.assign(
    billablePeriod_end=claim_fil_df['billablePeriod_end'].apply('{0:0>3}'.format)
)

# One list of billing-period end values per patient, without 'nan' markers.
ends_by_patient = (
    claim_fil_df
    .groupby('patient_medicare_number')['billablePeriod_end']
    .apply(list)
    .map(lambda ends: [end for end in ends if end != 'nan'])
)

# Attach the list to every row of the patient. `assign` + `map` replaces the
# column on a re-run, where a second merge would create
# billablePeriod_end_ls_x / billablePeriod_end_ls_y.
claim_fil_df = (
    claim_fil_df
    .assign(billablePeriod_end_ls=claim_fil_df['patient_medicare_number'].map(ends_by_patient))
    # The merge this replaces returned a fresh 0..n-1 index; keep that.
    .reset_index(drop=True)
)

In [104]:
# Combined location_of_bill
# Overwrite location_of_bill with its string form, left-padded with zeros to
# at least 3 characters. NaN becomes the string 'nan', which is filtered below.
claim_fil_df = claim_fil_df.assign(
    location_of_bill=claim_fil_df['location_of_bill'].apply('{0:0>3}'.format)
)

# One list of bill locations per patient, without the 'nan' markers.
locations_by_patient = (
    claim_fil_df
    .groupby('patient_medicare_number')['location_of_bill']
    .apply(list)
    .map(lambda locations: [location for location in locations if location != 'nan'])
)

# Attach the list to every row of the patient. `assign` + `map` replaces the
# column on a re-run, where a second merge would create
# location_of_bill_ls_x / location_of_bill_ls_y.
claim_fil_df = (
    claim_fil_df
    .assign(location_of_bill_ls=claim_fil_df['patient_medicare_number'].map(locations_by_patient))
    # The merge this replaces returned a fresh 0..n-1 index; keep that.
    .reset_index(drop=True)
)

In [94]:
# birthdate column: a copy of contained_0_birthDate under a shorter name.
# (No age is computed here; only the birth date is carried over.)
claim_fil_df = claim_fil_df.assign(birthdate=claim_fil_df['contained_0_birthDate'])

In [95]:
# gender column: a copy of contained_0_gender under a shorter name
# (the original column is kept as well).
claim_fil_df = claim_fil_df.assign(gender=claim_fil_df['contained_0_gender'])

## Drop columns so there is one row per patient

In [105]:
# Columns carried into the patient-level frame.
# NOTE(review): 'unique_claim_ls' is built earlier in this notebook but is not
# listed here -- confirm whether that is intentional.
keep_cols = [
    # patient identity and demographics
    'patient_medicare_number',
    'patient_first_name',
    'patient_last_name',
    'gender',
    'birthdate',
    # per-patient aggregates
    'number_of_claims',
    'drg_ls',
    'combined_diagnosis_ls',
    'combined_hcpcs_ls',
    'billablePeriod_start_ls',
    'billablePeriod_end_ls',
    'location_of_bill_ls',
]

In [106]:
# Build the patient-level frame: keep the selected columns and collapse the
# claim-level rows to one row per patient. Selecting columns alone still
# leaves one row per claim.
#
# De-duplication is keyed on patient_medicare_number only, because the list
# columns are unhashable and cannot take part in drop_duplicates. The first
# row of each patient is kept.
# NOTE(review): this assumes the name, gender and birthdate columns do not
# vary between a patient's claims -- confirm.
claim_final_df = (
    claim_fil_df[keep_cols]
    .drop_duplicates(subset='patient_medicare_number')
    .reset_index(drop=True)
)

In [107]:
# Preview the first five rows of the final frame.
claim_final_df.head(n=5)

Unnamed: 0,patient_medicare_number,patient_first_name,patient_last_name,gender,birthdate,number_of_claims,drg_ls,combined_diagnosis_ls,combined_hcpcs_ls,billablePeriod_start_ls,billablePeriod_end_ls,location_of_bill_ls
0,1S00E00AD09,Oralia106,Ward668,female,1948-06-14,7,"[948, 003, 001]","[K011, G43719, R52, E669, E785, K3520, K37, Z9...","[99241, 00777310502, G0444, 00777310502, 99221...","[2019-05-17, 2019-10-14, 1991-01-11, 2014-01-1...","[2019-05-17, 2019-10-14, 1991-01-12, 2014-01-1...","[002, 002, 002, 002, 002]"
1,1S00E00AG56,Filomena21,Rice937,female,1947-05-04,3,[],"[E669, J329, E034, D649, E785, B349, E669, J32...","[99241, 00777310502, G0444, 00777310502, 99241...","[2019-07-07, 2017-08-27, 2020-07-06]","[2019-07-07, 2017-08-27, 2020-07-06]","[002, 002]"
2,1S00E00MP51,Errol226,Hilpert278,male,1948-01-01,2,[],"[C189, K621, D649, M810, I639, P292, K635, C18...","[G0152, 99241]","[2019-07-03, 2019-10-23]","[2019-07-03, 2019-10-23]","[004, 002]"
3,1S00E00JD02,Jacquetta1,Koepp521,female,1952-04-16,1,[],"[R739, E781, E8881, D649, M179, J329, E669, E119]",[99241],[2012-03-01],[2012-03-01],[002]
4,1S00E00HF72,Darlene91,Carter549,female,1947-08-12,3,[],"[O039, O039, B9789, O039]","[Q5001, 99241, G0444]","[2012-09-19, 2017-08-11, 2017-12-05]","[2012-09-19, 2017-08-11, 2017-12-05]","[004, 002]"


In [100]:
# Save dataset
# NOTE(review): this writes claim_fil_df (one row per claim, every column),
# not claim_final_df -- confirm this is what readers of patient_level.pkl
# expect.
patient_level_file = "../data/clean/patient_level.pkl"
claim_fil_df.to_pickle(patient_level_file)

Follow up needed

In [None]:
# TODO: Investigate the entries in the combined HCPCS lists that do not look like valid HCPCS codes (e.g. 00777310502 for patient 1S00E00AA32, inspected below).

In [108]:
# Print every raw '...productOrService_coding_0_code' column for one patient,
# to trace where the entries of their HCPCS lists come from.
hcpcs_cols = list(
    filter(lambda name: 'productOrService_coding_0_code' in name, claim_df.columns)
)
is_follow_up_patient = claim_fil_df['patient_medicare_number'] == '1S00E00AA32'
temp_pat = claim_fil_df.loc[is_follow_up_patient]
for col in hcpcs_cols:
    code_column = temp_pat[col]
    print(code_column)

1154    99241
1220    99221
3230    99241
4077    99241
Name: item_0_productOrService_coding_0_code, dtype: object
1154    NaN
1220    NaN
3230    NaN
4077    NaN
Name: item_1_productOrService_coding_0_code, dtype: object
1154    NaN
1220    NaN
3230    NaN
4077    NaN
Name: item_2_productOrService_coding_0_code, dtype: object
1154    NaN
1220    NaN
3230    NaN
4077    NaN
Name: item_3_productOrService_coding_0_code, dtype: object
1154    NaN
1220    NaN
3230    NaN
4077    NaN
Name: item_4_productOrService_coding_0_code, dtype: object
1154    NaN
1220    NaN
3230    NaN
4077    NaN
Name: item_5_productOrService_coding_0_code, dtype: object
1154    NaN
1220    NaN
3230    NaN
4077    NaN
Name: item_6_productOrService_coding_0_code, dtype: object
1154    NaN
1220    NaN
3230    NaN
4077    NaN
Name: item_7_productOrService_coding_0_code, dtype: object
1154    NaN
1220    NaN
3230    NaN
4077    NaN
Name: item_8_productOrService_coding_0_code, dtype: object
1154    NaN
1220    NaN
3230 

In [110]:
# Show the per-claim code columns next to their per-patient combined lists
# for the patient under investigation.
follow_up_cols = [
    'patient_medicare_number',
    'patient_first_name',
    'patient_last_name',
    'drg_code',
    'drg_ls',
    'diagnosis_ls',
    'combined_diagnosis_ls',
    'hcpcs_ls',
    'combined_hcpcs_ls',
]
claim_fil_df.loc[claim_fil_df['patient_medicare_number'] == '1S00E00AA32', follow_up_cols]

Unnamed: 0,patient_medicare_number,patient_first_name,patient_last_name,drg_code,drg_ls,diagnosis_ls,combined_diagnosis_ls,hcpcs_ls,combined_hcpcs_ls
1154,1S00E00AA32,Denny560,Watsica258,,[001],"[I10, E669, I2510, I219, I252, J209]","[I10, E669, I2510, I219, I252, J209, P292, E66...","[99241, 00777310502]","[99241, 00777310502, 99221, 00777310502, 99241..."
1220,1S00E00AA32,Denny560,Watsica258,1.0,[001],"[P292, E669]","[I10, E669, I2510, I219, I252, J209, P292, E66...","[99221, 00777310502]","[99241, 00777310502, 99221, 00777310502, 99241..."
3230,1S00E00AA32,Denny560,Watsica258,,[001],"[I10, E669, I2510, I219, I252, J209]","[I10, E669, I2510, I219, I252, J209, P292, E66...","[99241, 00777310502]","[99241, 00777310502, 99221, 00777310502, 99241..."
4077,1S00E00AA32,Denny560,Watsica258,,[001],"[P292, E669, I2510, B349, J329]","[I10, E669, I2510, I219, I252, J209, P292, E66...","[99241, 00777310502]","[99241, 00777310502, 99221, 00777310502, 99241..."
