In [None]:
# prompt: load my google drive
# Setup cell: import the analysis libraries, configure pandas display, and mount Google Drive.
import pandas as pd
import numpy as np
from datetime import datetime
# Show every column when a DataFrame is displayed instead of truncating wide frames
pd.set_option('display.max_columns', None)

# Mount Google Drive at /content/drive (Colab only); the file paths used later in this
# notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# Location of the pipe-delimited outpatient claims extract
outpatient_file_path = '/content/drive/MyDrive/Data/All FFS Claims/outpatient.csv'
# LU = Look Up file for health codes
LU_drg_file_path = '/content/drive/MyDrive/Data/HealthCodes/DRG.csv'

# Downloaded 2025 ICD Codes but could not find about 90 codes so looked up the remaining using Perplexity.AI
# Ideal way:  Download 2015-2025 ICD Codes, create a database and then do look up

LU_ICD_file_path = '/content/drive/MyDrive/Data/HealthCodes/ICD10Diagnosis.csv'
LU_ICD2_file_path = '/content/drive/MyDrive/Data/HealthCodes/ICD_DIAG_CD_RemainingCodes.csv'

# Reading DRG and Diagnosis Look Ups
LU_drg = (
    pd.read_csv(LU_drg_file_path)[['DRG', 'Description']]
    .rename(columns={'Description': 'DRG_Description'})
)

# Normalise DRG to a zero-padded 3-character string; missing values become '000'
LU_drg['DRG'] = LU_drg['DRG'].fillna(0).astype(int).astype(str).str.zfill(3)

# Main ICD look-up (all columns of the file are kept)
LU_ICD = pd.read_csv(LU_ICD_file_path).rename(columns={'Description': 'ICD_Description'})

# Supplementary look-up for the codes missing from the main file
LU_ICD2 = (
    pd.read_csv(LU_ICD2_file_path)[['ICD_DIAG_CD', 'Description']]
    .rename(columns={'Description': 'ICD_Description'})
)

# Combine both look-ups into one table with at most one row per code.
# Without de-duplication, a code present in both files would duplicate rows in every
# left merge against this table (and inflate any counts computed afterwards).
# keep='first' prefers the main look-up over the supplementary one.
# NOTE(review): assumes the main ICD file also exposes the code in a column named
# 'ICD_DIAG_CD' (the later merges on that column rely on the same assumption) - confirm.
LU_ICD_final = (
    pd.concat([LU_ICD, LU_ICD2], ignore_index=True)
    .drop_duplicates(subset='ICD_DIAG_CD', keep='first')
    .reset_index(drop=True)
)



In [None]:
# Read the CSV file
outpatient = pd.read_csv(outpatient_file_path, sep="|")
# How were columns read in?
col_outpatient = pd.DataFrame(outpatient.dtypes, columns=['type'])
col_outpatient



  outpatient = pd.read_csv(outpatient_file_path, sep="|")


Unnamed: 0,type
BENE_ID,int64
CLM_ID,int64
NCH_NEAR_LINE_REC_IDENT_CD,object
NCH_CLM_TYPE_CD,int64
CLM_FROM_DT,object
...,...
REV_CNTR_STUS_IND_CD,int64
REV_CNTR_NDC_QTY,float64
REV_CNTR_NDC_QTY_QLFR_CD,float64
RNDRNG_PHYSN_UPIN,object


One concept that we have not talked about is HCPCS modifier codes.  These add information to a code without changing its definition.  For example, LT = Left Side and RT = Right Side.

How is outpatient data different from inpatient data?  
1.  Anything related to admission does not exist. For example, present on admission flags, DRG, Source of Admission
2.  LOS is typically 1 (most outpatient claims start and end on the same day)
3.  ER data is not included.  ER visits that do not result in an inpatient admission are generally billed as outpatient visits.  


In [None]:
# Convert columns to appropriate data types
outpatient['BENE_ID'] = outpatient['BENE_ID'].astype(str)
outpatient['CLM_ID'] = outpatient['CLM_ID'].astype(str)
outpatient['PTNT_DSCHRG_STUS_CD'] = outpatient['PTNT_DSCHRG_STUS_CD'].astype(str)
# Convert date columns
outpatient['CLM_FROM_DT'] = pd.to_datetime(outpatient['CLM_FROM_DT'], format='%d-%b-%Y')
outpatient['CLM_THRU_DT'] = pd.to_datetime(outpatient['CLM_THRU_DT'], format='%d-%b-%Y')

outpatient['YR'] = outpatient['CLM_THRU_DT'].dt.year

In [None]:
# Drop and select columns
columns_to_drop = [col for col in outpatient.columns if col.endswith('UPIN') or col.startswith('ICD_DGNS_E_CD') or col.startswith('PRCDR_DT')]
# columns_to_drop
outpatient = outpatient.drop(columns=columns_to_drop)



In [None]:
# Count how many rows carry each REV_CNTR value (most frequent first)
outpatient['REV_CNTR'].value_counts()

Unnamed: 0_level_0,count
REV_CNTR,Unnamed: 1_level_1
1,574480
780,317
789,295


REV_CNTR 450 is not present, so there are no ER claims (generally, ER visits that do not result in an inpatient admission are billed under outpatient).
REV_CNTR codes 780 and 789 are related to telemedicine, so we are going to ignore REV_CNTR and drop it and all detail-related data from the outpatient dataset.

In [None]:
# keep only header information
print(len(outpatient))
outpatient = outpatient[outpatient['CLM_LINE_NUM'] == 1]
print(len(outpatient))



575092
402653


In [None]:
# LOS is not a concept for outpatient, although some outpatient patients can spend more than 1 day in the hospital under observation or recovery
# LOS = inclusive number of days between claim start and end (a same-day claim counts as 1).
# assign() returns a new DataFrame rather than writing into the existing one, so this works
# without a SettingWithCopyWarning even when `outpatient` was produced by row filtering.
outpatient = outpatient.assign(
    LOS=(outpatient['CLM_THRU_DT'] - outpatient['CLM_FROM_DT']).dt.days + 1
)
outpatient['LOS'].value_counts()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outpatient['LOS'] = (outpatient['CLM_THRU_DT'] - outpatient['CLM_FROM_DT']).dt.days + 1


Unnamed: 0_level_0,count
LOS,Unnamed: 1_level_1
1,371460
2,31087
3,81
8,22
4,2
48,1


In [None]:
# Build a long table of distinct (BENE_ID, YR, ICD_DIAG_CD) combinations:
#   1. select the principal diagnosis plus every ICD_DGNS_CD* column,
#   2. reshape wide -> long,
#   3. discard rows whose diagnosis code is missing or empty,
#   4. drop the helper column naming the source field,
#   5. remove duplicate rows.
diagnosis = (
    outpatient[
        ['BENE_ID', 'YR', 'PRNCPAL_DGNS_CD',
         *(name for name in outpatient.columns if name.startswith('ICD_DGNS_CD'))]
    ]
    .melt(id_vars=['BENE_ID', 'YR'],
          var_name='ICD_DIAG_COL',
          value_name='ICD_DIAG_CD')
    .loc[lambda long_df: long_df['ICD_DIAG_CD'].notna() & long_df['ICD_DIAG_CD'].ne('')]
    .drop(columns='ICD_DIAG_COL')
    .drop_duplicates()
)
print(len(diagnosis))



779185


In [None]:

# Attach the look-up columns to every diagnosis row; the left join keeps codes with no match
diagnosis = diagnosis.merge(LU_ICD_final, how='left', on='ICD_DIAG_CD')


In [None]:
# Number of diagnosis rows per beneficiary and year, as a flat table with a NUM_DIAG column
num_diagnosis = (
    diagnosis
    .groupby(['BENE_ID', 'YR'])
    .size()
    .rename('NUM_DIAG')
    .reset_index()
)

In [None]:
# Encounter-level table: claim header fields, the beneficiary-year diagnosis count, and the
# look-up columns for the principal diagnosis. Both joins are left joins, so every selected
# claim row is kept even when no match exists.
outpatient_encounters = (
    outpatient[['BENE_ID', 'CLM_ID', 'CLM_FROM_DT',
                'CLM_THRU_DT', 'YR', 'PRNCPAL_DGNS_CD',
                'PTNT_DSCHRG_STUS_CD', 'CLM_TOT_CHRG_AMT']]
    .merge(num_diagnosis, how='left', on=['BENE_ID', 'YR'])
    .merge(LU_ICD_final,
           how='left',
           left_on='PRNCPAL_DGNS_CD',
           right_on='ICD_DIAG_CD')
    # The look-up key duplicates PRNCPAL_DGNS_CD after the join, so remove it
    .drop(columns=['ICD_DIAG_CD'])
)


In [None]:

# Write both result tables to Google Drive as CSV files, without the DataFrame index
for frame, destination in (
    (outpatient_encounters, '/content/drive/MyDrive/Data/Output Data/outpatient_encounters.csv'),
    (diagnosis, '/content/drive/MyDrive/Data/Output Data/outpatient_diagnosis.csv'),
):
    frame.to_csv(destination, index=False)