In [8]:
from data_loaders import DataLoaders as dl
import polars as pl
from polars import col as c
import polars.selectors as cs
import analysis
import importlib

### Overview of products, class, and ndc table

In [5]:
# cancer_and_ms_ndcs (defined in analysis.py) returns the product / gpi_4_class / ndc
# table limited to cancer and multiple sclerosis drugs
importlib.reload(analysis)
from analysis import cancer_and_ms_ndcs

# example usage: preview the first 5 rows.
# head() is applied to the LazyFrame *before* collect(), so the row limit is part of
# the lazy query and the full table is never materialized just to display 5 rows.
cancer_and_ms_ndcs().head().collect(engine='streaming').to_pandas()

Unnamed: 0,product,gpi_4_class,ndc
0,HYDROXYprogesterone Caproate Intramuscular Sol...,Antineoplastic - Hormonal and Related Agents,[67457088605]
1,Lumakras Oral Tablet 240 MG,Antineoplastic Enzyme Inhibitors,[55513051260]
2,Kyprolis Intravenous Solution Reconstituted 30 MG,Antineoplastic Enzyme Inhibitors,[76075010201]
3,Elrexfio Subcutaneous Solution 44 MG/1.1ML,Antineoplastic - Antibodies,"[00069252201, 00069252202]"
4,Glatopa Subcutaneous Solution Prefilled Syring...,Multiple Sclerosis Agents,"[00781323434, 00781323471, 63629881501]"


In [10]:
importlib.reload(analysis)
from analysis import load_base_data
# load_base_data loads hospital price table and joins with ndcs from cancer_and_ms_ndcs function to limit to cancer and ms drugs

# example usage: preview the first 5 rows.
# The row limit is placed before collect() so polars can apply it inside the lazy
# query instead of materializing the whole joined price table and slicing afterwards.
(
    load_base_data()
    .head()
    .collect(engine='streaming')
    .to_pandas()
)


Unnamed: 0,hospital_id,description,setting,drug_unit_of_measurement,drug_type_of_measurement,standard_charge_gross,standard_charge_discounted_cash,standard_charge_negotiated_dollar,plan_name,payer_name,standard_charge_methodology,standard_charge_negotiated_percentage,hcpcs,ndc,calculated_negotiated_dollars,mapped_plan_name,mapped_lob_name,product,gpi_4_class
0,483ce101-d0eb-434f-8f40-db066c58aa76,BCG LIVE 50 MG IS SUSR,outpatient,1.0,ea,154.99,154.99,4.99,AETNA MANAGED CARE,AETNA,FEE SCHEDULE,,J9030,52060202,False,Aetna,,Tice BCG Intravesical Suspension Reconstituted...,Antineoplastics Misc.
1,483ce101-d0eb-434f-8f40-db066c58aa76,BCG LIVE 50 MG IS SUSR,outpatient,1.0,ea,154.99,154.99,86.47,HEALTH NEW ENGLAND MANAGED CARE,HEALTH NEW ENGLAND,PERCENT OF TOTAL BILLED CHARGES,55.79,J9030,52060202,True,,,Tice BCG Intravesical Suspension Reconstituted...,Antineoplastics Misc.
2,483ce101-d0eb-434f-8f40-db066c58aa76,BCG LIVE 50 MG IS SUSR,outpatient,1.0,ea,154.99,154.99,4.22,CIGNA MANAGED CARE,CIGNA,FEE SCHEDULE,,J9030,52060202,False,Cigna,,Tice BCG Intravesical Suspension Reconstituted...,Antineoplastics Misc.
3,483ce101-d0eb-434f-8f40-db066c58aa76,BCG LIVE 50 MG IS SUSR,outpatient,1.0,ea,154.99,154.99,4.35,ANTHEM MANAGED CARE,ANTHEM,FEE SCHEDULE,,J9030,52060202,False,Anthem,,Tice BCG Intravesical Suspension Reconstituted...,Antineoplastics Misc.
4,483ce101-d0eb-434f-8f40-db066c58aa76,BCG LIVE 50 MG IS SUSR,outpatient,1.0,ea,154.99,154.99,4.74,UNITED MANAGED CARE,UNITED,FEE SCHEDULE,,J9030,52060202,False,United Health Care,,Tice BCG Intravesical Suspension Reconstituted...,Antineoplastics Misc.


In [12]:
importlib.reload(analysis)
# determine how many unique drugs (products) each gpi_4_class contains

(
    # Call through the module: importlib.reload() does not rebind names that an
    # earlier cell imported with `from analysis import ...`, so a bare
    # load_base_data() here would still execute the pre-reload function.
    analysis.load_base_data()
    # one output row per gpi_4_class
    .group_by(c.gpi_4_class)
    .agg(
        # count distinct values of every column whose name matches product,
        # hospital_id, or drug_type_of_measure(ment); results get a `_unique` suffix
        cs.matches('(?i)product|hospital_id|drug_type_of_measure').n_unique().name.suffix('_unique')
    )
    .collect(engine='streaming')
    # classes with the most distinct products first
    .sort('product_unique', descending=True)
    .to_pandas()
)

Unnamed: 0,gpi_4_class,hospital_id_unique,drug_type_of_measurement_unique,product_unique
0,Antineoplastic Enzyme Inhibitors,410,5,177
1,Antineoplastic - Antibodies,462,5,70
2,Antineoplastic - Hormonal and Related Agents,540,5,50
3,Antineoplastic Antibiotics,428,6,37
4,Antineoplastics Misc.,382,6,26
5,Multiple Sclerosis Agents,388,6,24
6,Antineoplastic - Angiogenesis Inhibitors,368,4,19
7,Antineoplastic - EGFR Inhibitors,318,4,16
8,Antineoplastic Combinations,315,4,13
9,Antineoplastic - Anti-HER2 Agents,310,4,7


### Summary statistics for the number of unique prices per product

In [22]:
(
    # load base data
    load_base_data()
    # one group per product
    .group_by(c.product)
    # number of distinct gross, discounted-cash, and negotiated-dollar prices per product.
    # The pattern is anchored and the alternatives are grouped: the previous
    # '^standard.*gross|cash|negotiated_dollar' parsed as three independent alternatives
    # and therefore also picked up `calculated_negotiated_dollars`, which then had to be
    # filtered out again after the aggregation.
    .agg(
        cs.matches('^standard_charge_(?:gross|discounted_cash|negotiated_dollar)$')
        .n_unique()
        .name.suffix('_unique_count'),
    )
    # min, max, and mean of those per-product counts across all products
    # (cs.numeric() skips the string `product` key column)
    .select(
        cs.numeric().min().round(2).name.suffix('_min'),
        cs.numeric().max().round(2).name.suffix('_max'),
        cs.numeric().mean().round(2).name.suffix('_avg'),
    )
    .collect(engine="streaming")
    # one row per statistic for easier reading
    .transpose(include_header=True)
    .to_pandas()
)

Unnamed: 0,column,column_0
0,standard_charge_gross_unique_count_min,1.0
1,standard_charge_discounted_cash_unique_count_min,1.0
2,standard_charge_negotiated_dollar_unique_count...,1.0
3,standard_charge_gross_unique_count_max,684.0
4,standard_charge_discounted_cash_unique_count_max,753.0
5,standard_charge_negotiated_dollar_unique_count...,7492.0
6,standard_charge_gross_unique_count_avg,37.82
7,standard_charge_discounted_cash_unique_count_avg,44.44
8,standard_charge_negotiated_dollar_unique_count...,534.65


### The problem with price
- no standard price at the unit or drug-type level
- non-standardized units or types of pricing for the same product increase complexity and confusion

In [15]:
importlib.reload(analysis)

# determine the unique products in the data set and summarize statistics on price descriptions

(
    # Call through the module: importlib.reload() leaves the previously from-imported
    # `load_base_data` name bound to the old function object, so the bare name would
    # ignore whatever the reload just picked up.
    analysis.load_base_data()
    .group_by(c.product)
    .agg(
        # distinct drug_unit_of_measurement / drug_type_of_measurement values per product
        cs.matches('(?i)of_measure').n_unique().name.suffix('_unique')
    )
    .select(
        # after the group_by there is one row per product, so this is the product count
        c.product.n_unique().alias('num_products'),
        # mean / min / max of the per-product distinct counts
        cs.matches('(?i)of_measure').mean().round(1).name.prefix('avg_'),
        cs.matches('(?i)of_measure').min().round(1).name.prefix('min_'),
        cs.matches('(?i)of_measure').max().round(1).name.prefix('max_')
    )
    .collect(engine="streaming")
    # one row per statistic for easier reading
    .transpose(include_header=True)
    .to_pandas()
)

Unnamed: 0,column,column_0
0,num_products,472.0
1,avg_drug_unit_of_measurement_unique,3.4
2,avg_drug_type_of_measurement_unique,2.4
3,min_drug_unit_of_measurement_unique,1.0
4,min_drug_type_of_measurement_unique,1.0
5,max_drug_unit_of_measurement_unique,23.0
6,max_drug_type_of_measurement_unique,5.0


### We attempted to quantify the magnitude of price variability
1. determine the min, max, mean, and standard deviation for each product without normalization of data for
    - gross price
    - cash discount price
    - negotiated price
2. aggregate the results to determine the average min, max, mean, and standard deviation across all products

In [20]:
def product_stats(col_regex: str) -> list[pl.Expr]:
    """
    Build min / mean / max / std aggregation expressions for matching columns.

    Columns are selected by a case-insensitive match of ``col_regex`` against
    the column name. The returned list holds four expressions, in this order:
    minimum (``_min``), mean rounded to 1 decimal (``_mean``), maximum
    (``_max``), and standard deviation rounded to 1 decimal (``_std``); each
    output column is the source column name plus the given suffix.
    """
    # build the selector once; every statistic below is derived from it
    matched = cs.matches(f'(?i){col_regex}')

    lowest = matched.min().name.suffix('_min')
    average = matched.mean().round(1).name.suffix('_mean')
    highest = matched.max().name.suffix('_max')
    spread = matched.std().round(1).name.suffix('_std')

    return [lowest, average, highest, spread]

(
    load_base_data()
    # per-product min / mean / max / std of each price column
    .group_by(c.product)
    .agg(
        # Anchored, grouped pattern for exactly the gross, discounted-cash, and
        # negotiated-dollar price columns. The previous
        # '^standard.*gross|cash|negotiated_dollar' parsed as three independent
        # alternatives and so also aggregated `calculated_negotiated_dollars`, which
        # then had to be removed again in a separate select.
        *product_stats('^standard_charge_(?:gross|discounted_cash|negotiated_dollar)$'),
    )
    .select(
        # average every per-product statistic across all products
        # (cs.numeric() skips the string `product` key column)
        cs.numeric().mean().round(2),
    )
    .collect(engine="streaming")
    # one row per statistic for easier reading
    .transpose(include_header=True)
    .to_pandas()
)

Unnamed: 0,column,column_0
0,standard_charge_gross_min,4688.45
1,standard_charge_discounted_cash_min,2897.16
2,standard_charge_negotiated_dollar_min,1590.96
3,standard_charge_gross_mean,12431.08
4,standard_charge_discounted_cash_mean,62794.33
5,standard_charge_negotiated_dollar_mean,13877.38
6,standard_charge_gross_max,39571.89
7,standard_charge_discounted_cash_max,25667304.34
8,standard_charge_negotiated_dollar_max,42588624.09
9,standard_charge_gross_std,6623.55
