# Create Patient Level File
# 03_create_patient_level_file

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 28/09/2025 | Adrienne | Created   | Created dataset for learning models | 
| 29/09/2025 | Martin | New   | Added function to filter by number of entries through percentile or absolute value | 

# Content

* [Loading Data](#loading-data)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime
import json_lines
import seaborn as sns

# Loading Data

In [2]:
# Read in the clean claim-level data file.
# Alternative input files are left commented out so the input can be switched quickly.
path = "../data/clean/"
#claim_df =  pd.read_pickle(path + 'claim.pkl')
#claim_df = pd.read_pickle(path + 'claim_sample.pkl')
claim_df = pd.read_pickle(f"{path}claim_mini_sample.pkl")

# Filtering Data

Added configurations to filter the data based on HCPCS and number of claims

1. `filter_by`: "percentile" | "entries"
2. `threshold`: Percentile (0–100) or number of entries; patients whose claim count is __GREATER THAN OR EQUAL TO__ the resulting cutoff are kept

NOTE: Currently the minimum number of claims is counted per `patient_medicare_number`.

Notes from Adrienne:
- need to limit data to those that have HCPCS and then pick a min number of claims
- Pick a percentile of number of claims and use that as limit
- XXX perhaps create a function for all filtering that is done, so we can quickly change the limits

In [3]:
def filter_claims(df: pd.DataFrame, filter_by: str, threshold: int):
  """Filter claims and keep only patients with enough remaining claims.

  Row-level filters, applied in this order:
    1. drop claims whose `hcpcs_ls` is empty,
    2. drop claims whose `unique_claim_ID` is not nine characters long,
    3. drop claims whose `billablePeriod_start` year is before 2012
       (rows whose year is missing or not numeric are dropped as well),
    4. remove HCPCS codes that are not five characters long from `hcpcs_ls`.

  Patient-level filter: claims are counted per `patient_medicare_number`
  (non-null `billablePeriod_end` values) and only patients whose count is
  greater than or equal to the cutoff are kept.

  The input frame is never modified. The helper columns `diagnosis_ls_len`,
  `hcpcs_ls_len`, `procedure_ls_len`, `len_claim` and `year` are present on
  the returned frame.

  Args:
    df: claim-level data with the columns `diagnosis_ls`, `hcpcs_ls`,
      `procedure_ls`, `unique_claim_ID`, `billablePeriod_start`,
      `billablePeriod_end` and `patient_medicare_number`.
    filter_by: "percentile" to use the `threshold`-th percentile of the
      per-patient claim counts as cutoff, or "entries" to use `threshold`
      itself as the cutoff.
    threshold: percentile (0-100) or minimum number of claims.

  Returns:
    The filtered claims as a new DataFrame, or None (after printing a
    message) when `filter_by` is not one of the supported values.
  """
  if filter_by not in ("percentile", "entries"):
    print("Please check that your entries are valid")
    return None

  n_before = df.shape[0]

  # `assign` returns a new frame, so the caller's DataFrame is left untouched
  # and no SettingWithCopyWarning is raised further down.
  work = df.assign(
    diagnosis_ls_len=df['diagnosis_ls'].str.len(),
    hcpcs_ls_len=df['hcpcs_ls'].str.len(),
    procedure_ls_len=df['procedure_ls'].str.len(),
  )

  # Remove entries with no HCPCS
  work = work[work['hcpcs_ls_len'] != 0]

  # Remove claims where unique_claim_ID is not nine in length
  work = work.assign(len_claim=work['unique_claim_ID'].str.len())
  work = work[work['len_claim'] == 9]

  # Remove claims prior to 2012: most of the data is from then onwards and
  # medical care differs greatly between decades, so a recent period is wanted.
  # The year stays a string column; the comparison is done on a numeric view so
  # that a missing or malformed start date drops the row instead of crashing.
  work = work.assign(year=work['billablePeriod_start'].str[:4])
  work = work[pd.to_numeric(work['year'], errors='coerce') >= 2012]

  # Drop HCPCS codes that are not five in length.
  # NOTE(review): this runs after the empty-HCPCS filter, so a claim whose codes
  # are all dropped here is kept with an empty list, and hcpcs_ls_len still
  # holds the length from before this clean-up - confirm this is intended.
  work = work.assign(
    hcpcs_ls=work['hcpcs_ls'].apply(lambda codes: [code for code in codes if len(code) == 5])
  )

  # Claims per patient (non-null billablePeriod_end values per medicare number)
  claim_count_by_medicare = (
    work.groupby('patient_medicare_number')['billablePeriod_end'].count().reset_index()
  )
  counts = claim_count_by_medicare['billablePeriod_end']

  if filter_by == "percentile":
    # np.percentile cannot handle an empty array; with no patients left the
    # cutoff value is irrelevant because nothing can be selected anyway.
    cutoff = np.percentile(counts, threshold) if len(counts) else 0
  else:
    cutoff = threshold

  selected_medicare_numbers = claim_count_by_medicare.loc[counts >= cutoff, 'patient_medicare_number']
  print(f"Number of patient medicare numbers selected: {len(selected_medicare_numbers)}")

  # Get only the entries with selected medicare numbers
  result = work[work['patient_medicare_number'].isin(selected_medicare_numbers)].copy()
  print()
  print(f"Previous number of entries: {n_before}")
  print(f"Current number of entries: {result.shape[0]}")
  return result

In [4]:
# Filter settings: select patients by percentile of claims per patient
filter_by = "percentile"
threshold = 0

claim_fil_df = filter_claims(df=claim_df, filter_by=filter_by, threshold=threshold)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['len_claim'] = df['unique_claim_ID'].str.len()


Number of patient medicare numbers selected: 1975

Previous number of entries: 10000
Current number of entries: 4960


## Create Columns

In [5]:
# The list columns built below follow row order, so order the claims by patient
# and then by claim start date first
sort_keys = ['patient_medicare_number', 'billablePeriod_start']
claim_fil_df = claim_fil_df.sort_values(by=sort_keys)

In [6]:
# Create a number of claims column: the number of rows per patient, repeated on
# every row of that patient.
# Computed with a group-wise transform instead of value_counts() + merge: the
# merge only works when value_counts() names its index after the column
# (pandas >= 2.0) and creates number_of_claims_x / number_of_claims_y duplicates
# when the cell is run a second time.
claim_fil_df = claim_fil_df.reset_index(drop=True)  # same 0..n-1 index the merge produced
claim_fil_df['number_of_claims'] = (
    claim_fil_df.groupby('patient_medicare_number')['patient_medicare_number'].transform('size')
)

In [7]:
# Combined DRG column: all DRG codes of a patient, in row order, as one list.
# Codes are rendered as text, left-padded with zeros to three characters; a NaN
# renders as the text 'nan' and is left out of the list.
claim_fil_df['drg_code'] = claim_fil_df['drg_code'].apply(lambda x: '{0:0>3}'.format(x))
grp_ser = claim_fil_df.groupby('patient_medicare_number')['drg_code'].apply(list)
grp_df = pd.DataFrame({'drg_ls': grp_ser})
grp_df['drg_ls'] = grp_df['drg_ls'].apply(lambda x: [code for code in x if code != 'nan'])
# Look the list up by patient instead of merging, so that re-running this cell
# overwrites drg_ls rather than creating drg_ls_x / drg_ls_y duplicates.
claim_fil_df['drg_ls'] = claim_fil_df['patient_medicare_number'].map(grp_df['drg_ls'])

In [8]:
# Combined HCPCS column: the per-claim HCPCS lists of a patient flattened, in
# row order, into a single list.
grp_ser = claim_fil_df.groupby('patient_medicare_number')['hcpcs_ls'].apply(list)
grp_df = pd.DataFrame({'combined_hcpcs_ls': grp_ser})
# (loop variable renamed so it no longer shadows the builtin `list`)
grp_df['combined_hcpcs_ls'] = grp_df['combined_hcpcs_ls'].apply(
    lambda claim_lists: [code for codes in claim_lists for code in codes]
)
# Look the list up by patient instead of merging, so that re-running this cell
# overwrites the column rather than creating _x / _y duplicates.
claim_fil_df['combined_hcpcs_ls'] = claim_fil_df['patient_medicare_number'].map(grp_df['combined_hcpcs_ls'])

In [9]:
# Combined ICD10 column: the per-claim diagnosis lists of a patient flattened,
# in row order, into a single list.
grp_ser = claim_fil_df.groupby('patient_medicare_number')['diagnosis_ls'].apply(list)
grp_df = pd.DataFrame({'combined_diagnosis_ls': grp_ser})
# (loop variable renamed so it no longer shadows the builtin `list`)
grp_df['combined_diagnosis_ls'] = grp_df['combined_diagnosis_ls'].apply(
    lambda claim_lists: [code for codes in claim_lists for code in codes]
)
# Look the list up by patient instead of merging, so that re-running this cell
# overwrites the column rather than creating _x / _y duplicates.
claim_fil_df['combined_diagnosis_ls'] = claim_fil_df['patient_medicare_number'].map(grp_df['combined_diagnosis_ls'])


In [10]:
# Combined unique_claim_ID: all claim IDs of a patient, in row order, as one list.
# The format call renders each value as text (zero-padded only if shorter than
# three characters); a NaN renders as the text 'nan' and is left out of the list.
claim_fil_df['unique_claim_ID'] = claim_fil_df['unique_claim_ID'].apply(lambda x: '{0:0>3}'.format(x))
grp_ser = claim_fil_df.groupby('patient_medicare_number')['unique_claim_ID'].apply(list)
grp_df = pd.DataFrame({'unique_claim_ls': grp_ser})
grp_df['unique_claim_ls'] = grp_df['unique_claim_ls'].apply(lambda x: [code for code in x if code != 'nan'])
# Look the list up by patient instead of merging, so that re-running this cell
# overwrites the column rather than creating _x / _y duplicates.
claim_fil_df['unique_claim_ls'] = claim_fil_df['patient_medicare_number'].map(grp_df['unique_claim_ls'])

In [11]:
# Combined billablePeriod_start: all claim start dates of a patient, in row
# order, as one list.
# The format call renders each value as text (zero-padded only if shorter than
# three characters); a NaN renders as the text 'nan' and is left out of the list.
claim_fil_df['billablePeriod_start'] = claim_fil_df['billablePeriod_start'].apply(lambda x: '{0:0>3}'.format(x))
grp_ser = claim_fil_df.groupby('patient_medicare_number')['billablePeriod_start'].apply(list)
grp_df = pd.DataFrame({'billablePeriod_start_ls': grp_ser})
grp_df['billablePeriod_start_ls'] = grp_df['billablePeriod_start_ls'].apply(lambda x: [code for code in x if code != 'nan'])
# Look the list up by patient instead of merging, so that re-running this cell
# overwrites the column rather than creating _x / _y duplicates.
claim_fil_df['billablePeriod_start_ls'] = claim_fil_df['patient_medicare_number'].map(grp_df['billablePeriod_start_ls'])

In [12]:
# Combined billablePeriod_end: all claim end dates of a patient, in row order,
# as one list.
# The format call renders each value as text (zero-padded only if shorter than
# three characters); a NaN renders as the text 'nan' and is left out of the list.
claim_fil_df['billablePeriod_end'] = claim_fil_df['billablePeriod_end'].apply(lambda x: '{0:0>3}'.format(x))
grp_ser = claim_fil_df.groupby('patient_medicare_number')['billablePeriod_end'].apply(list)
grp_df = pd.DataFrame({'billablePeriod_end_ls': grp_ser})
grp_df['billablePeriod_end_ls'] = grp_df['billablePeriod_end_ls'].apply(lambda x: [code for code in x if code != 'nan'])
# Look the list up by patient instead of merging, so that re-running this cell
# overwrites the column rather than creating _x / _y duplicates.
claim_fil_df['billablePeriod_end_ls'] = claim_fil_df['patient_medicare_number'].map(grp_df['billablePeriod_end_ls'])

In [13]:
# Combined location_of_bill: all bill locations of a patient, in row order, as
# one list.
# Values are rendered as text, left-padded with zeros to three characters; a NaN
# renders as the text 'nan' and is left out of the list, so this list can be
# shorter than the patient's number of claims.
claim_fil_df['location_of_bill'] = claim_fil_df['location_of_bill'].apply(lambda x: '{0:0>3}'.format(x))
grp_ser = claim_fil_df.groupby('patient_medicare_number')['location_of_bill'].apply(list)
grp_df = pd.DataFrame({'location_of_bill_ls': grp_ser})
grp_df['location_of_bill_ls'] = grp_df['location_of_bill_ls'].apply(lambda x: [code for code in x if code != 'nan'])
# Look the list up by patient instead of merging, so that re-running this cell
# overwrites the column rather than creating _x / _y duplicates.
claim_fil_df['location_of_bill_ls'] = claim_fil_df['patient_medicare_number'].map(grp_df['location_of_bill_ls'])

In [14]:
# birthdate column - for whatever reason birthdates are not on every claim for a patient
grp_ser = claim_fil_df.groupby('patient_medicare_number')['contained_0_birthDate'].apply(list)
# birthdate_ls: the distinct birthdate values seen for a patient (may include a missing value)
grp_df = pd.DataFrame({
  'birthdate_ls': grp_ser.apply(set)
})
# birthdate: the first non-missing birthdate in row order (NaN when a patient has none).
# It is taken from the ordered list rather than from the set, because set
# iteration order is arbitrary and would make the pick unpredictable for a
# patient with more than one distinct birthdate.
grp_df['birthdate'] = grp_ser.apply(lambda dates: [d for d in dates if pd.notnull(d)]).str[0]
# Look the values up by patient instead of merging, so that re-running this cell
# overwrites the columns rather than creating _x / _y duplicates.
claim_fil_df['birthdate_ls'] = claim_fil_df['patient_medicare_number'].map(grp_df['birthdate_ls'])
claim_fil_df['birthdate'] = claim_fil_df['patient_medicare_number'].map(grp_df['birthdate'])


In [15]:
# Expose gender under a shorter column name (the contained_0_gender column is kept as well)
claim_fil_df = claim_fil_df.assign(gender=claim_fil_df['contained_0_gender'])

## Drop columns so there is one row per patient

In [19]:
# Columns that make up the patient-level data set
keep_cols = [
    # patient identity and demographics
    'patient_medicare_number',
    'patient_first_name',
    'patient_last_name',
    'gender',
    'birthdate',
    # per-patient aggregates built above
    'number_of_claims',
    'drg_ls',
    'combined_diagnosis_ls',
    'combined_hcpcs_ls',
    'billablePeriod_start_ls',
    'billablePeriod_end_ls',
    'location_of_bill_ls',
    # claim-level value column
    'total_value',
]

In [20]:
# Restrict the frame to the patient-level columns (still one row per claim at this point)
claim_final_df = claim_fil_df.loc[:, keep_cols]

In [21]:
# Order rows by patient, with first names in descending order inside each patient.
# In the sample output this places a full first name such as 'Brandon214' ahead
# of an abbreviated one such as 'B.'.
claim_final_df = claim_final_df.sort_values(
    by=['patient_medicare_number', 'patient_first_name'],
    ascending=[True, False],
)
claim_final_df.head(20)

Unnamed: 0,patient_medicare_number,patient_first_name,patient_last_name,gender,birthdate,number_of_claims,drg_ls,combined_diagnosis_ls,combined_hcpcs_ls,billablePeriod_start_ls,billablePeriod_end_ls,location_of_bill_ls,total_value
1,1S00E00AA10,Brandon214,Roob72,female,1946-01-15,3,[],"[O039, O039, B085, B002, O039, J029]","[G0444, 99241, G0444, G9572]","[2013-04-23, 2016-01-15, 2020-06-02]","[2013-04-23, 2016-01-15, 2020-06-02]",[002],15458.12
0,1S00E00AA10,B.,Roob72,female,1946-01-15,3,[],"[O039, O039, B085, B002, O039, J029]","[G0444, 99241, G0444, G9572]","[2013-04-23, 2016-01-15, 2020-06-02]","[2013-04-23, 2016-01-15, 2020-06-02]",[002],778.78
2,1S00E00AA10,B.,Roob72,female,1946-01-15,3,[],"[O039, O039, B085, B002, O039, J029]","[G0444, 99241, G0444, G9572]","[2013-04-23, 2016-01-15, 2020-06-02]","[2013-04-23, 2016-01-15, 2020-06-02]",[002],778.78
3,1S00E00AA23,B.,Hagene,female,,1,[],"[J329, E785, P292]","[G0444, G9572]",[2014-04-13],[2014-04-13],[],840.21
5,1S00E00AA25,Carlota980,Gamez720,female,1947-04-15,2,[],"[E669, D649, K635, O039, M810, J329, E669, D64...","[G0444, 99241]","[2012-07-18, 2021-11-23]","[2012-07-18, 2021-11-23]",[002],85.55
4,1S00E00AA25,C.,Gamez7,female,1947-04-15,2,[],"[E669, D649, K635, O039, M810, J329, E669, D64...","[G0444, 99241]","[2012-07-18, 2021-11-23]","[2012-07-18, 2021-11-23]",[002],996.16
6,1S00E00AA32,Denny560,Watsica258,male,1945-06-09,3,[],"[P292, E669, I2510, B349, J329, I10, E669, I25...","[99241, 99241, 99241]","[2015-05-12, 2021-02-20, 2021-03-20]","[2015-05-12, 2021-02-20, 2021-03-20]","[002, 002, 002]",85.55
7,1S00E00AA32,Denny560,Watsica258,male,1945-06-09,3,[],"[P292, E669, I2510, B349, J329, I10, E669, I25...","[99241, 99241, 99241]","[2015-05-12, 2021-02-20, 2021-03-20]","[2015-05-12, 2021-02-20, 2021-03-20]","[002, 002, 002]",278.58
8,1S00E00AA32,Denny560,Watsica258,male,1945-06-09,3,[],"[P292, E669, I2510, B349, J329, I10, E669, I25...","[99241, 99241, 99241]","[2015-05-12, 2021-02-20, 2021-03-20]","[2015-05-12, 2021-02-20, 2021-03-20]","[002, 002, 002]",278.58
10,1S00E00AA54,Lashawnda5,Greenfelder433,female,1950-12-23,11,[],"[E119, R739, E781, E8881, D649, E11319, P292, ...","[G0444, 99241, 99241, 99241, 99241, G0444, 992...","[2012-10-27, 2013-01-26, 2014-06-21, 2014-07-2...","[2012-10-27, 2013-01-26, 2014-06-21, 2014-07-2...","[002, 002, 002, 002, 002, 002, 002, 002, 002, ...",142.58


In [22]:
# Save dataset: keep the first row of every patient (in the current row order)
# and write the result to disk.
# NOTE(review): total_value differs between the rows of a patient, so the saved
# value is the one from whichever row comes first, not a per-patient total -
# confirm this is intended.
rows_by_patient = claim_final_df.groupby('patient_medicare_number', as_index=False)
claim_final_df = rows_by_patient.nth(0)
claim_final_df.to_pickle("../data/clean/patient_level_mini_sample.pkl")
#claim_final_df.to_pickle("../data/clean/patient_level.pkl")
