# Create Patient Level File
# 03_create_patient_level_file

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 28/09/2025 | Adrienne | Created   | Created dataset for learning models | 
| |  |  | |

# Content

* [Introduction](#introduction)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime
import json_lines
import seaborn as sns

In [5]:
# readin clean datafiles
path = "../data/clean/"
#claim_df =  pd.read_pickle(path + 'claim.pkl')
#claim_df = pd.read_pickle(path + 'claim_sample.pkl')
claim_df = pd.read_pickle(path + 'claim_mini_sample.pkl')

- need to limit data to those that have HCPCS and then pick a min number of claims
- Pick a percentile of number of claims and use that as limit
- XXX perhaps create a function for all filtering that is done, so we can quickly change the limits


In [3]:
# drop claims with no hcpcs
claim_fil_df = claim_df[~claim_df['hcpcs_ls'].apply(lambda x: len(x) == 0)]

## Create Columns

In [79]:
# Create a number of claims column
vals = claim_fil_df['patient_medicare_number'].value_counts()
vals_df = pd.DataFrame({
  'number_of_claims': vals
})
claim_fil_df = pd.merge(claim_fil_df, vals_df, how = 'left', on = 'patient_medicare_number')
claim_fil_df.head()

Unnamed: 0,billablePeriod_end,billablePeriod_start,contained_0_birthDate,contained_0_gender,contained_0_id,contained_0_identifier_0_type_coding_0_code,contained_0_identifier_0_type_coding_0_display,contained_0_identifier_0_value,contained_0_name_0_family,contained_0_name_0_given,...,drg_code,provider_number,national_provider_identifier,type_of_bill,claim_type,location_of_bill,diagnosis_ls,hcpcs_ls,procedure_ls,number_of_claims
0,2019-05-17,2019-05-17,1948-06-14,female,patient,MC,Patient's Medicare Number,1S00E00AD09,Ward668,"[""Oralia106""]",...,,220020,8886688687,1,institutional,2,"[K011, G43719, R52, E669, E785, K3520, K37, Z9...","[99241, 00777310502]",[],7
1,2019-07-07,2019-07-07,1947-05-04,female,patient,MC,Patient's Medicare Number,1S00E00AG56,Rice937,"[""Filomena21""]",...,,220070,8886688182,1,institutional,2,"[E669, J329, E034, D649, E785, B349]","[99241, 00777310502]",[],3
2,2019-07-03,2019-07-03,1948-01-01,male,patient,MC,Patient's Medicare Number,1S00E00MP51,Hilpert278,"[""Errol226""]",...,,58189,8888306999,1,institutional,4,"[C189, K621, D649, M810, I639, P292, K635]",[G0152],[],2
3,2012-03-01,2012-03-01,1952-04-16,female,patient,MC,Patient's Medicare Number,1S00E00JD02,Koepp521,"[""Jacquetta1""]",...,,260180,8886287084,1,institutional,2,"[R739, E781, E8881, D649, M179, J329, E669, E119]",[99241],[],1
4,2012-09-19,2012-09-19,1947-08-12,female,patient,MC,Patient's Medicare Number,1S00E00HF72,Carter549,"[""Darlene91""]",...,,157651,8887312378,B,institutional,4,[O039],[Q5001],[],3


In [81]:
# Combined DRG column
#claim_df['drg_pad'] = claim_df['drg_code'].apply( lambda x : '{0:0>3}'.format(x)) 
claim_fil_df['drg_code'] = claim_fil_df['drg_code'].apply( lambda x : '{0:0>3}'.format(x)) 
grp_ser = claim_fil_df.groupby('patient_medicare_number')['drg_code'].apply(list)
grp_df = pd.DataFrame( {'drg_ls': grp_ser})
grp_df['drg_ls'] = grp_df['drg_ls'].apply(lambda x: [code for code in x if code != 'nan' ])
claim_fil_df = pd.concat([claim_fil_df, grp_df], axis = 1)
claim_fil_df.head()



Unnamed: 0,billablePeriod_end,billablePeriod_start,contained_0_birthDate,contained_0_gender,contained_0_id,contained_0_identifier_0_type_coding_0_code,contained_0_identifier_0_type_coding_0_display,contained_0_identifier_0_value,contained_0_name_0_family,contained_0_name_0_given,...,provider_number,national_provider_identifier,type_of_bill,claim_type,location_of_bill,diagnosis_ls,hcpcs_ls,procedure_ls,number_of_claims,drg_ls
0,2019-05-17,2019-05-17,1948-06-14,female,patient,MC,Patient's Medicare Number,1S00E00AD09,Ward668,"[""Oralia106""]",...,220020,8886688687,1,institutional,2,"[K011, G43719, R52, E669, E785, K3520, K37, Z9...","[99241, 00777310502]",[],7.0,
1,2019-07-07,2019-07-07,1947-05-04,female,patient,MC,Patient's Medicare Number,1S00E00AG56,Rice937,"[""Filomena21""]",...,220070,8886688182,1,institutional,2,"[E669, J329, E034, D649, E785, B349]","[99241, 00777310502]",[],3.0,
2,2019-07-03,2019-07-03,1948-01-01,male,patient,MC,Patient's Medicare Number,1S00E00MP51,Hilpert278,"[""Errol226""]",...,58189,8888306999,1,institutional,4,"[C189, K621, D649, M810, I639, P292, K635]",[G0152],[],2.0,
3,2012-03-01,2012-03-01,1952-04-16,female,patient,MC,Patient's Medicare Number,1S00E00JD02,Koepp521,"[""Jacquetta1""]",...,260180,8886287084,1,institutional,2,"[R739, E781, E8881, D649, M179, J329, E669, E119]",[99241],[],1.0,
4,2012-09-19,2012-09-19,1947-08-12,female,patient,MC,Patient's Medicare Number,1S00E00HF72,Carter549,"[""Darlene91""]",...,157651,8887312378,B,institutional,4,[O039],[Q5001],[],3.0,


In [None]:
# Combined HCPCS column

In [None]:
# Combined ICD10 column
grp_ser = df.groupby('patient_medicare_number')['diagnosis_ls'].transform(lambda x: [ [','.join(code)] for code in x])
grp_df = pd.DataFrame( {'combined_diagnosis_ls': grp_ser})
grp_df.head()
comb_df = pd.concat([df, grp_df], axis = 1)
comb_df.head()


Unnamed: 0,billablePeriod_end,billablePeriod_start,contained_0_birthDate,contained_0_gender,contained_0_id,contained_0_identifier_0_type_coding_0_code,contained_0_identifier_0_type_coding_0_display,contained_0_identifier_0_value,contained_0_name_0_family,contained_0_name_0_given,...,provider_number,national_provider_identifier,type_of_bill,claim_type,location_of_bill,diagnosis_ls,hcpcs_ls,procedure_ls,number_of_claims,combined_diagnosis_ls
0,2019-05-17,2019-05-17,1948-06-14,female,patient,MC,Patient's Medicare Number,1S00E00AD09,Ward668,"[""Oralia106""]",...,220020,8886688687,1,institutional,2,"[K011, G43719, R52, E669, E785, K3520, K37, Z9...","[99241, 00777310502]",[],7,"[K011,G43719,R52,E669,E785,K3520,K37,Z9049,T50..."
1,2019-07-07,2019-07-07,1947-05-04,female,patient,MC,Patient's Medicare Number,1S00E00AG56,Rice937,"[""Filomena21""]",...,220070,8886688182,1,institutional,2,"[E669, J329, E034, D649, E785, B349]","[99241, 00777310502]",[],3,"[E669,J329,E034,D649,E785,B349]"
2,2019-07-03,2019-07-03,1948-01-01,male,patient,MC,Patient's Medicare Number,1S00E00MP51,Hilpert278,"[""Errol226""]",...,58189,8888306999,1,institutional,4,"[C189, K621, D649, M810, I639, P292, K635]",[G0152],[],2,"[C189,K621,D649,M810,I639,P292,K635]"
3,2012-03-01,2012-03-01,1952-04-16,female,patient,MC,Patient's Medicare Number,1S00E00JD02,Koepp521,"[""Jacquetta1""]",...,260180,8886287084,1,institutional,2,"[R739, E781, E8881, D649, M179, J329, E669, E119]",[99241],[],1,"[R739,E781,E8881,D649,M179,J329,E669,E119]"
4,2012-09-19,2012-09-19,1947-08-12,female,patient,MC,Patient's Medicare Number,1S00E00HF72,Carter549,"[""Darlene91""]",...,157651,8887312378,B,institutional,4,[O039],[Q5001],[],3,[O039]


In [None]:
# Preventative care indicator

In [None]:
# age column

In [None]:
# base model just a set of procedures - bare services to treat condition
# model with all procedures (so if you have 3 blood draws it it in there three times)


In [None]:
flat_claim_df['hcpcs_ls'] = flat_claim_df[hcpcs_cols].apply(lambda row: [x for x in row if pd.notnull(x)] , axis = 1)