In [6]:
from data_loaders import DataLoaders as dl
import polars as pl
from polars import col as c
import polars.selectors as cs
import analysis
import importlib

### Product Overview Report
- unique drug names is based on GPI-10 generic name from Medispan
- A single ndc was mapped to each HCPCS code based on the ASP NDC crosswalk file from CMS
    - the NDC was then mapped to MEDISPAN GPI-10 generic name
- All NDCs were mapped to MEDISPAN GPI-10 generic name
- The record (line) coalesced the HCPCS GPI-10 mapping and NDC GPI-10 mapping to get a drug name
    - HCPCS mapping was preferred over NDC mapping

In [5]:


(
    (
    dl()
    .load_hospital_price_table_with_drug_names()
    .select(
        pl.len().alias('total_lines'),
        c.description.filter(c.drug_name.is_not_null()).len().alias('lines_w_drug_names'),
        c.description.filter(c.hcpcs.is_not_null()).len().alias('hcpcs_lines'),
        c.description.filter(c.ndc.is_not_null()).len().alias('ndc_lines'),
        c.drug_name.n_unique().alias('unique_drug_names'),
        c.hcpcs.n_unique().alias('unique_hcpcs'),
        c.hcpcs.filter(c.drug_name.is_not_null()).n_unique().alias('hcpcs_with_drug_name'),
        c.ndc.n_unique().alias('unique_ndcs'),
    )
    .collect(engine="streaming")
    .to_pandas()
    )
)

Unnamed: 0,total_lines,lines_w_drug_names,hcpcs_lines,ndc_lines,unique_drug_names,unique_hcpcs,hcpcs_with_drug_name,unique_ndcs
0,31103488,27701156,30734391,21001799,2402,1327,1094,70482


### Top 20 Drug Names by Frequency

In [3]:
# Top 20 drugs by number of lines
(
dl()
    .load_hospital_price_table_with_drug_names()
    .filter(c.drug_name.is_not_null())
    .group_by('drug_name')
    .agg(pl.len().alias('num_lines'), c.hospital_id.n_unique().alias('num_hospitals'))
    .sort('num_lines', descending=True)
    .head(20)
    .collect(engine="streaming")
    .to_pandas()
)

Unnamed: 0,drug_name,num_lines,num_hospitals
0,Enoxaparin Sodium,461376,1086
1,Vancomycin HCl,442115,1108
2,Heparin Sodium (Porcine),428010,1093
3,Sodium Chloride,356677,1036
4,Ceftriaxone Sodium,304483,1136
5,Bupivacaine HCl,297263,841
6,Potassium Chloride,289857,1071
7,Midazolam HCl,286896,1085
8,Magnesium Sulfate,266519,1081
9,Hydromorphone HCl,264068,1048


### Top 20 HCPCS by Frequency

In [4]:
# Top 20 HCPCS by number of lines with drug names
(
dl()
    .load_hospital_price_table_with_drug_names()
    .filter(c.drug_name.is_not_null())
    .filter(c.hcpcs.is_not_null())
    .group_by('hcpcs', 'drug_name')
    .agg(pl.len().alias('num_lines'), c.hospital_id.n_unique().alias('num_hospitals'))
    .sort('num_lines', descending=True)
    .head(20)
    .collect(engine="streaming")
    .to_pandas()
)

Unnamed: 0,hcpcs,drug_name,num_lines,num_hospitals
0,J1650,Enoxaparin Sodium,461280,1086
1,J1644,Heparin Sodium (Porcine),404492,1087
2,J3370,Vancomycin HCl,363356,1091
3,J0696,Ceftriaxone Sodium,296249,1136
4,J2250,Midazolam HCl,284768,1084
5,J3480,Potassium Chloride,283855,1071
6,J0665,Bupivacaine HCl,272751,805
7,J3475,Magnesium Sulfate in Dextrose,266406,1081
8,J3010,Fentanyl Citrate,256025,1076
9,J2543,Piperacillin Sodium-Tazobactam Sodium,249975,1069


In [17]:
def average_unique_prices_per_hospital_by_product_classification(product_classification):    
    return (
    dl
    .load_hospital_price_table_with_drug_names()
    # filter out rows with null product classification
    .filter(pl.col(f'{product_classification}').is_not_null())
    # group by hospital and product classification (ndc, hcpcs, drug_name)
    .group_by(c.hospital_id, product_classification)
    # count unique prices for each product and price type at each hospital
    .agg(cs.matches('(?i)gross|dollar$|cash').n_unique().name.suffix(f'_{product_classification}'))
    # then average those unique price counts across all products at each hospital
    .group_by(c.hospital_id)
    .agg(
        cs.numeric().mean().round(1))
    # then average those unique price counts across all hospitals
    .select(
        cs.numeric().mean().round(1).name.prefix('avg_')
    )
    )


### Average Unique Prices for HCPCS by Hospital

In [18]:
# Average unique prices per hospital by product classification
average_unique_prices_per_hospital_by_product_classification('hcpcs').collect(engine='streaming').to_pandas()

Unnamed: 0,avg_standard_charge_gross_hcpcs,avg_standard_charge_discounted_cash_hcpcs,avg_standard_charge_negotiated_dollar_hcpcs
0,2.3,2.3,19.6


### Average Unique Prices for Drug Name (GPI-10) by Hospital

In [19]:
average_unique_prices_per_hospital_by_product_classification('drug_name').collect(engine='streaming').to_pandas()

Unnamed: 0,avg_standard_charge_gross_drug_name,avg_standard_charge_discounted_cash_drug_name,avg_standard_charge_negotiated_dollar_drug_name
0,2.1,2.0,19.1


### Average Unique Prices for NDC by Hospital


In [20]:
average_unique_prices_per_hospital_by_product_classification('ndc').collect(engine='streaming').to_pandas()

Unnamed: 0,avg_standard_charge_gross_ndc,avg_standard_charge_discounted_cash_ndc,avg_standard_charge_negotiated_dollar_ndc
0,1.1,1.1,10.9


### How does Average Inpatient vs Outpatient Prices Vary by Hospital
1. Identify all hospital with prices for inpatient and outpatient services at a ndc, drug_name (gpi-10), and hcpcs level
2. Calculate average price for each observation at each level (ndc, drug_name (gpi-10), and hcpcs)
3. Calculate the difference between inpatient and outpatient prices for each observation at each level (ndc, drug_name (gpi-10), and hcpcs)


In [24]:
def inpatient_outpatient_price_differences(product_selection) -> pl.DataFrame:
    return (
    dl
    .load_hospital_price_table_with_drug_names()
    .filter(c.setting.str.contains('(?i)in|out'))
    .group_by(c.hospital_id, c.setting, product_selection)
    .agg(
        c.standard_charge_negotiated_dollar.mean().round(2),
    )
    .collect(engine='streaming')
    .pivot(
        on='setting',
        index=['hospital_id', product_selection],
    )
    .filter(c.outpatient.is_not_null() & c.inpatient.is_not_null())
    .with_columns(c.outpatient.sub(c.inpatient).round(2).alias('outpatient_diff'))
    .select(
        pl.lit(product_selection).alias('product_type'),
        pl.len().alias('row_ct'),
        c.hospital_id.n_unique().alias('unique_hospitals'),
        c.outpatient.mean().round(2).alias('avg_outpatient_price'),
        c.inpatient.mean().round(2).alias('avg_inpatient_price'),
        c.outpatient_diff.mean().round(2).alias('avg_outpatient_diff'),
    )
    )

pl.concat([inpatient_outpatient_price_differences(selection) for selection in ['ndc', 'hcpcs', 'drug_name']]).to_pandas()

Unnamed: 0,product_type,row_ct,unique_hospitals,avg_outpatient_price,avg_inpatient_price,avg_outpatient_diff
0,ndc,343993,423,2971.47,3798.95,-827.48
1,hcpcs,94132,423,2910.85,4013.43,-1102.57
2,drug_name,94886,422,2118.67,3193.2,-1074.53
