# Create Patient Level File
## 03_create_patient_level_file

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 28/09/2025 | Adrienne | Created   | Created dataset for learning models | 
| 29/09/2025 | Martin | New   | Added function to filter by number of entries through percentile or absolute value | 
| 08/10/2025 | Adrienne | New | Added code to split datasets evenly |
| 15/10/2025 | Adrienne | Update | Code Cleanup |

## Content

* [Introduction](#introduction)
* [Loading Data](#loading-data)
* [Filtering Data](#filtering-data)
* [Create Columns](#create-columns)
* [EDA](#eda)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime
import seaborn as sns
from datetime import date
from sklearn.model_selection import train_test_split

## Introduction

This notebook reshapes the claim data from long to wide by summarizing all claims at the patient level, producing one row per patient.

## Loading Data

In [None]:
# Read the cleaned claim-level data from the clean-data directory
path = "../data/clean/"
claim_path = path + 'claim.pkl'
claim_df = pd.read_pickle(claim_path)

## Filtering Data

Added configurations to filter the data based on HCPCS and number of claims

1. `filter_by`: "percentile" | "entries"
2. `threshold`: Selected percentile or number of entries; patients are kept when their claim count is __GREATER THAN OR EQUAL TO__ the resulting value

NOTE: Currently min number of claims is grouped on `patient_medicare_number`

In [None]:
def filter_claims(df: pd.DataFrame, filter_by: str, threshold: int):
  """Filter claims and keep only patients with enough remaining claims.

  Row-level filters are applied in this order:
    1. drop claims whose `hcpcs_ls` has length 0,
    2. keep claims whose `unique_claim_ID` is exactly nine characters long,
    3. keep claims whose `billablePeriod_start` year (first four characters)
       is 2012 or later,
    4. within each remaining claim, keep only HCPCS codes of length five.

  Claims are then counted per `patient_medicare_number` (non-null
  `billablePeriod_end` values) and only patients whose count is greater than
  or equal to the cut-off are kept.

  The input frame is NOT modified; all helper columns (`diagnosis_ls_len`,
  `hcpcs_ls_len`, `procedure_ls_len`, `len_claim`, `year`) are added to the
  returned frame only.

  Parameters
  ----------
  df : claim-level frame with the columns `diagnosis_ls`, `hcpcs_ls`,
       `procedure_ls`, `unique_claim_ID`, `billablePeriod_start`,
       `billablePeriod_end` and `patient_medicare_number`.
  filter_by : "percentile" to derive the cut-off from the distribution of
       claims per patient, or "entries" to use `threshold` directly.
  threshold : percentile (0-100) when `filter_by == "percentile"`, otherwise
       the minimum number of claims per patient.

  Returns
  -------
  The filtered frame, or None (after printing a message) when `filter_by`
  is not one of the supported values.
  """
  # Validate the mode before doing any work
  if filter_by not in ("percentile", "entries"):
    print("Please check that your entries are valid")
    return

  n_before = df.shape[0]

  # `assign` returns a new frame, so the caller's data is left untouched
  # and no write ever lands on a filtered slice.
  df = df.assign(
    diagnosis_ls_len=df['diagnosis_ls'].str.len(),
    hcpcs_ls_len=df['hcpcs_ls'].str.len(),
    procedure_ls_len=df['procedure_ls'].str.len(),
  )

  # Remove entries with no HCPCS
  df = df[df['hcpcs_ls_len'] != 0]

  # Remove claims where unique_claim_ID is not nine in length
  df = df.assign(len_claim=df['unique_claim_ID'].str.len())
  df = df[df['len_claim'] == 9]

  # Remove claims prior to 2012
  # 95% of data is from then onwards.  As there are great differences in medical care
  # through the decades want a more recent time period
  df = df.assign(year=df['billablePeriod_start'].apply(lambda start: start[:4]))
  df = df[df['year'].astype(int) >= 2012]

  # Drop HCPCS codes that are not five in length
  df = df.assign(
    hcpcs_ls=df['hcpcs_ls'].apply(lambda codes: [code for code in codes if len(code) == 5])
  )

  # Claims per patient; select the column before counting so only it is aggregated
  claim_counts = df.groupby('patient_medicare_number')['billablePeriod_end'].count()
  if filter_by == "percentile":
    min_claims = np.percentile(claim_counts, threshold)
    print(f"Min number of claims: {min_claims}")
  else:
    min_claims = threshold

  selected_medicare_numbers = claim_counts.index[claim_counts >= min_claims]
  print(f"Number of patient medicare numbers selected: {len(selected_medicare_numbers)}")

  # Get only the entries with selected medicare numbers
  df = df[df['patient_medicare_number'].isin(selected_medicare_numbers)]
  print()
  print(f"Previous number of entries: {n_before}")
  print(f"Current number of entries: {df.shape[0]}")
  return df

In [None]:
# Filter configuration: keep patients whose claim count is at or above
# the 10th percentile of claims per patient.
filter_by, threshold = "percentile", 10
claim_fil_df = filter_claims(claim_df, filter_by, threshold)

## Create Columns

In [None]:
# Order claims by patient and then by start date, so the per-patient lists
# built in the following cells follow claim start order.
claim_fil_df = claim_fil_df.sort_values(
  ['patient_medicare_number', 'billablePeriod_start']
)

In [None]:
# Attach the number of (filtered) claims each patient has to every claim row
vals = claim_fil_df['patient_medicare_number'].value_counts()
vals_df = vals.to_frame(name='number_of_claims')
claim_fil_df = claim_fil_df.merge(vals_df, how='left', on='patient_medicare_number')

In [None]:
# Per-patient list of DRG codes.
# Codes are rendered as text, left-padded with zeros to at least three
# characters; missing values render as the text 'nan' and are dropped below.
claim_fil_df['drg_code'] = claim_fil_df['drg_code'].apply('{0:0>3}'.format)
grp_ser = claim_fil_df.groupby('patient_medicare_number')['drg_code'].apply(list)
grp_df = grp_ser.to_frame(name='drg_ls')
grp_df['drg_ls'] = grp_df['drg_ls'].apply(
  lambda codes: [drg for drg in codes if drg != 'nan']
)
claim_fil_df = claim_fil_df.merge(grp_df, how='left', on='patient_medicare_number')

In [None]:
# Per-patient HCPCS codes: collect each claim's code list, then flatten the
# list of lists into a single list per patient.
grp_ser = claim_fil_df.groupby('patient_medicare_number')['hcpcs_ls'].apply(list)
grp_df = grp_ser.to_frame(name='combined_hcpcs_ls')
grp_df['combined_hcpcs_ls'] = grp_df['combined_hcpcs_ls'].apply(
  lambda claim_lists: [code for codes in claim_lists for code in codes]
)
claim_fil_df = claim_fil_df.merge(grp_df, how='left', on='patient_medicare_number')

In [None]:
# Per-patient ICD10 diagnosis codes: collect each claim's diagnosis list,
# then flatten the list of lists into a single list per patient.
grp_ser = claim_fil_df.groupby('patient_medicare_number')['diagnosis_ls'].apply(list)
grp_df = grp_ser.to_frame(name='combined_diagnosis_ls')
grp_df['combined_diagnosis_ls'] = grp_df['combined_diagnosis_ls'].apply(
  lambda claim_lists: [code for codes in claim_lists for code in codes]
)
claim_fil_df = claim_fil_df.merge(grp_df, how='left', on='patient_medicare_number')

In [None]:
# Per-patient list of admitting diagnoses, one entry per claim.
# NOTE(review): only the text 'nan' is removed; a real NaN/None value would
# stay in the list. Presumably the column already holds strings - confirm.
grp_ser = claim_fil_df.groupby('patient_medicare_number')['admitting_diagnosis'].apply(list)
grp_df = grp_ser.to_frame(name='combined_admitting_diagnosis_ls')
grp_df['combined_admitting_diagnosis_ls'] = grp_df['combined_admitting_diagnosis_ls'].apply(
  lambda diagnoses: [dx for dx in diagnoses if dx != 'nan']
)
claim_fil_df = claim_fil_df.merge(grp_df, how='left', on='patient_medicare_number')

In [None]:
# Per-patient list of principal diagnoses, one entry per claim.
# NOTE(review): only the text 'nan' is removed; a real NaN/None value would
# stay in the list. Presumably the column already holds strings - confirm.
grp_ser = claim_fil_df.groupby('patient_medicare_number')['principal_diagnosis'].apply(list)
grp_df = grp_ser.to_frame(name='combined_principal_diagnosis_ls')
grp_df['combined_principal_diagnosis_ls'] = grp_df['combined_principal_diagnosis_ls'].apply(
  lambda diagnoses: [dx for dx in diagnoses if dx != 'nan']
)
claim_fil_df = claim_fil_df.merge(grp_df, how='left', on='patient_medicare_number')

In [None]:
# Per-patient list of claim IDs.
# IDs are rendered as text (left-padded with zeros to at least three
# characters); entries equal to the text 'nan' are dropped.
claim_fil_df['unique_claim_ID'] = claim_fil_df['unique_claim_ID'].apply('{0:0>3}'.format)
grp_ser = claim_fil_df.groupby('patient_medicare_number')['unique_claim_ID'].apply(list)
grp_df = grp_ser.to_frame(name='unique_claim_ls')
grp_df['unique_claim_ls'] = grp_df['unique_claim_ls'].apply(
  lambda claim_ids: [claim_id for claim_id in claim_ids if claim_id != 'nan']
)
claim_fil_df = claim_fil_df.merge(grp_df, how='left', on='patient_medicare_number')

In [None]:
# Per-patient list of billing-period start values, one entry per claim.
# Values are rendered as text (left-padded with zeros to at least three
# characters); entries equal to the text 'nan' are dropped.
claim_fil_df['billablePeriod_start'] = claim_fil_df['billablePeriod_start'].apply('{0:0>3}'.format)
grp_ser = claim_fil_df.groupby('patient_medicare_number')['billablePeriod_start'].apply(list)
grp_df = grp_ser.to_frame(name='billablePeriod_start_ls')
grp_df['billablePeriod_start_ls'] = grp_df['billablePeriod_start_ls'].apply(
  lambda starts: [start for start in starts if start != 'nan']
)
claim_fil_df = claim_fil_df.merge(grp_df, how='left', on='patient_medicare_number')

In [None]:
# Per-patient list of billing-period end values, one entry per claim.
# Values are rendered as text (left-padded with zeros to at least three
# characters); entries equal to the text 'nan' are dropped.
claim_fil_df['billablePeriod_end'] = claim_fil_df['billablePeriod_end'].apply('{0:0>3}'.format)
grp_ser = claim_fil_df.groupby('patient_medicare_number')['billablePeriod_end'].apply(list)
grp_df = grp_ser.to_frame(name='billablePeriod_end_ls')
grp_df['billablePeriod_end_ls'] = grp_df['billablePeriod_end_ls'].apply(
  lambda ends: [end for end in ends if end != 'nan']
)
claim_fil_df = claim_fil_df.merge(grp_df, how='left', on='patient_medicare_number')

In [None]:
# Per-patient list of bill locations, one entry per claim.
# Values are rendered as text, left-padded with zeros to at least three
# characters; entries equal to the text 'nan' are dropped.
claim_fil_df['location_of_bill'] = claim_fil_df['location_of_bill'].apply('{0:0>3}'.format)
grp_ser = claim_fil_df.groupby('patient_medicare_number')['location_of_bill'].apply(list)
grp_df = grp_ser.to_frame(name='location_of_bill_ls')
grp_df['location_of_bill_ls'] = grp_df['location_of_bill_ls'].apply(
  lambda locations: [loc for loc in locations if loc != 'nan']
)
claim_fil_df = claim_fil_df.merge(grp_df, how='left', on='patient_medicare_number')

In [None]:
# birthdate column - birthdates are not present on every claim for a patient,
# so one value per patient is derived and merged back onto every claim row.
grp_ser = claim_fil_df.groupby('patient_medicare_number')['contained_0_birthDate'].apply(list)
grp_df = pd.DataFrame({
  # distinct recorded values per patient (may include a missing value)
  'birthdate_ls': grp_ser.apply(set)
})
# GroupBy.first() skips missing values, so this is the first non-null birthdate
# in the current row order (NaT when a patient has none). Taking element 0 of
# the set instead would pick an arbitrary value whenever a patient has more
# than one distinct birthdate, because set iteration order is not defined.
grp_df['birthdate'] = pd.to_datetime(
  claim_fil_df.groupby('patient_medicare_number')['contained_0_birthDate'].first()
)
claim_fil_df = pd.merge(claim_fil_df, grp_df, how = 'left', on = 'patient_medicare_number')

# create age column

def calculate_age(birthdate, today=None):
    """Return the age in completed years on a reference date.

    Parameters
    ----------
    birthdate : object exposing `.year`, `.month` and `.day`
        (e.g. `datetime.date` or `pandas.Timestamp`).
    today : optional reference date with the same attributes. Defaults to
        the current date; pass a fixed date to get reproducible ages.

    Returns
    -------
    Difference in calendar years, minus one when the birthday has not yet
    occurred in the reference year.
    """
    if today is None:
        today = date.today()
    # True (counts as 1) when the reference date falls before this year's birthday
    before_birthday = (today.month, today.day) < (birthdate.month, birthdate.day)
    return today.year - birthdate.year - before_birthday

# Age of each patient, derived from the birthdate column via calculate_age
claim_fil_df['age'] = claim_fil_df['birthdate'].map(calculate_age)

Drop the claim-level columns so that there is one row per patient

In [None]:
# Columns retained in the patient-level file
keep_cols = [
  # patient identity and demographics
  'patient_medicare_number',
  'patient_first_name',
  'patient_last_name',
  'gender',
  'birthdate',
  'age',
  # per-patient aggregates built in the cells above
  'number_of_claims',
  'drg_ls',
  'combined_diagnosis_ls',
  'combined_admitting_diagnosis_ls',
  'combined_principal_diagnosis_ls',
  'combined_hcpcs_ls',
  'billablePeriod_start_ls',
  'billablePeriod_end_ls',
  'location_of_bill_ls',
  'total_value',
]

In [None]:
# Restrict the claim-level frame to the columns kept in the patient-level file
claim_final_df = claim_fil_df.loc[:, keep_cols]

In [None]:
# Sort by patient, with first names in descending order inside each patient.
# NOTE(review): presumably this puts a row with a non-null first name at the
# top of each patient group before one row per patient is kept - confirm.
claim_final_df = claim_final_df.sort_values(
  by=['patient_medicare_number', 'patient_first_name'],
  ascending=[True, False],
)
claim_final_df.head()

In [None]:
# keep first row of group
# Collapses the frame to one row per patient_medicare_number: nth(0) selects
# the first row of each group in the current row order, so the sort applied
# before this cell decides which row survives.
# NOTE(review): the list/aggregate columns are presumably identical on every
# row of a patient (they were merged on patient_medicare_number), so only the
# claim-level columns in keep_cols depend on which row is kept - confirm.
claim_final_df = claim_final_df.groupby('patient_medicare_number', as_index=False).nth(0)

In [None]:
# Persist the patient-level dataset as a pickle in the clean-data directory
claim_final_df.to_pickle(path="../data/clean/patient_level.pkl")

## EDA

Investigating Class Imbalance

In [None]:
# Share of patients (in percent) at each age, ordered by age; first 22 rows shown
vals = claim_final_df['age'].value_counts(normalize=True).sort_index(ascending=True).mul(100)
vals.to_frame(name='age_breakdown').head(22)

In [None]:
# Share of patients (in percent) for each gender value
vals = claim_final_df['gender'].value_counts(normalize=True).mul(100)
vals.to_frame(name='gender_breakdown').head(22)

Looking at length of combined_hcpcs_ls

In [None]:
# Number of HCPCS codes in each patient's combined list
claim_final_df = claim_final_df.assign(
  ls_len=claim_final_df['combined_hcpcs_ls'].str.len()
)

In [None]:
# Share of patients (in percent) for each combined-HCPCS list length,
# ordered by length; first 50 rows shown
vals = claim_final_df['ls_len'].value_counts(normalize=True).sort_index(ascending=True).mul(100)
vals.to_frame(name='ls_breakdown').head(50)

In [None]:
# Count plot of list lengths, restricted to patients with more than 500 codes
fig, ax = plt.subplots(figsize=(5, 3))
long_lists = claim_final_df[claim_final_df['ls_len'] > 500]
sns.countplot(y='ls_len', data=long_lists, ax=ax)

This is the patient that has 1219 claims (once filtered) resulting in a combined_hcpcs_ls length of 1592

1S00E00GT33

In [None]:
# Last few raw (unfiltered) claims for a single patient.
# NOTE(review): the medicare number queried here differs from the one quoted
# in the markdown above this cell - confirm which patient is intended.
claim_df.loc[claim_df['patient_medicare_number'].eq('1S00E00KH30')].tail()