In [10]:
from data_loaders import DataLoaders as dl
import polars as pl
from polars import col as c
import polars.selectors as cs
import analysis
import importlib
from analysis import standardize_single_unit
import seaborn as sns
import plotly.express as px

In [35]:
# Rank hospitals by how far their negotiated prices sit from the peer average.
# Every price is z-scored inside its (product, setting, unit-of-measure) group,
# averaged per hospital and product, then averaged once more per hospital.
product_selection = 'hcpcs'

# Partition used for the peer mean / std. 'hospital_id' is not part of the
# window, so a price is compared against all hospitals in the same group.
over_columns = [product_selection, 'setting', 'drug_type_of_measurement']

negotiated_price = pl.col('standard_charge_negotiated_dollar')
z_score = pl.col('z_score_negotiated_dollar')

q = (
    dl.load_hospital_price_table_with_drug_names()
    .filter(pl.col(product_selection).is_not_null())
    .with_columns(standardize_single_unit())
    .filter(negotiated_price > .01)
    .with_columns(pl.col('setting').fill_null('Unknown'))
    # Peer statistics per window; both depend only on the raw price column.
    # NOTE(review): mean and std are rounded to 2 dp *before* the division, so a
    # group whose std rounds to 0.00 presumably yields inf/NaN z-scores that the
    # filter below removes -- confirm this is intended.
    .with_columns(
        negotiated_price.mean().over(over_columns).round(2).alias('avg_negotiated_dollar'),
        negotiated_price.std().over(over_columns).round(2).alias('std_negotiated_dollar'),
    )
    .with_columns(
        ((negotiated_price - pl.col('avg_negotiated_dollar')) / pl.col('std_negotiated_dollar'))
        .round(4)
        .alias('z_score_negotiated_dollar')
    )
    # Keep only usable z-scores: not infinite, not null, not NaN.
    .filter(~z_score.is_infinite() & z_score.is_not_null() & ~z_score.is_nan())
    # First collapse to one value per (hospital, product) ...
    .group_by('hospital_id', product_selection)
    .agg(z_score.mean().round(4))
    # ... then to one value per hospital.
    .group_by('hospital_id')
    .agg(z_score.mean().round(4).alias('avg_z_score_negotiated_dollar'))
    .sort('avg_z_score_negotiated_dollar', descending=True)
    .collect(engine="streaming")
)
q

hospital_id,avg_z_score_negotiated_dollar
str,f64
"""c1f4f40b-3e02-4671-b267-57c905…",5.7625
"""a89f8b4c-a4fc-462a-86e8-2dd095…",5.7333
"""bffa8e3d-dcb6-45f0-a2c6-cf546c…",5.5065
"""d2a564cc-e8e8-4dfe-8426-70d8ab…",3.3188
"""f93dae2b-7365-4d5e-b355-762209…",3.0434
…,…
"""ecbb2f5b-3e09-47c6-bb19-8f2e6e…",-0.4597
"""6812f4f8-a091-49b8-ac11-503cef…",-0.461
"""fdffca15-58a1-468d-b572-495521…",-0.465
"""b405d856-7421-4bfc-b827-46d861…",-0.4739


In [36]:
# Violin plot of the per-hospital average z-scores held in `q`.
z_col = 'avg_z_score_negotiated_dollar'


def _z_sign_label(value):
    """Return 'Positive' for value > 0, 'Zero' for value == 0, else 'Negative'."""
    if value > 0:
        return 'Positive'
    if value == 0:
        return 'Zero'
    return 'Negative'


df = q.to_pandas()
# Sign bucket per hospital; available for x= / color= if the violin should be split.
df['z_sign'] = df[z_col].apply(_z_sign_label)

axis_labels = {
    'z_sign': 'Z score sign',
    z_col: 'Avg z-score (negotiated $)',
}

fig = px.violin(
    df,
    y=z_col,
    box=True,        # overlay a box plot inside the violin
    points='all',    # draw every hospital as an individual point
    hover_data=['hospital_id', z_col],
    labels=axis_labels,
    title=f'Distribution of avg_z_score_negotiated_dollar (n={len(df)})',
    template='plotly_white',
)

# Dashed reference line at z = 0.
fig.add_hline(
    y=0,
    line_dash='dash',
    line_color='black',
    annotation_text='Zero',
    annotation_position='top left',
)
fig.update_layout(width=900, height=500, showlegend=False)
fig.show()