In [1]:
import duckdb
import polars as pl
from polars import col as c
import polars.selectors as cs
from config import BASE_PARQUET_PATH, DATABASE_URL


## Hospital Analysis

Using the hospital data stored in the `hospital_directory` table of the duckdb database, perform the following analyses:

1. Determine the total number of hospitals processed (`processed`).
2. Determine the number of hospitals with files compliant with CMS requirements (`compliant`).
3. Determine the number of hospitals with drugs listed in the file (`has_drugs`).
4. Determine the number of hospitals with drugs and corresponding prices (`has_pricing`).

In [2]:
def run_query(query: str) -> pl.LazyFrame:
    """Run ``query`` against the DuckDB database and return the rows lazily.

    A connection to ``DATABASE_URL`` is opened for this single call and is
    scoped to the ``with`` block. The result set is fetched as an Arrow table
    and wrapped in a Polars ``DataFrame`` before the block exits, then
    returned as a ``LazyFrame``.

    Args:
        query: SQL text to execute.

    Returns:
        A ``pl.LazyFrame`` over the fully fetched query result.
    """
    with duckdb.connect(DATABASE_URL) as connection:
        arrow_table = connection.execute(query=query).fetch_arrow_table()
        frame = pl.DataFrame(arrow_table)
        return frame.lazy()

def base_query():
    """Return every row of the ``hospital_directory`` table via ``run_query``.

    Returns:
        Whatever ``run_query`` returns for ``select * from hospital_directory``
        (a ``pl.LazyFrame`` per its annotation).
    """
    # Plain string: the statement has no placeholders, so no f-string is needed.
    sql = """select * from hospital_directory"""
    return run_query(sql)


In [3]:
# Overall totals: count the True values in every boolean flag column
# except `converted`, giving a single summary row.
flag_counts = cs.boolean().exclude('converted').sum()
base_query().select(flag_counts).collect(engine='streaming').to_pandas()

Unnamed: 0,processed,compliant,has_drugs,has_pricing
0,2992,2617,2122,998


In [4]:
# Overall rates: each flag total divided by the processed total,
# rounded to 4 decimals and stored under a `_pct` suffix.
flag_totals = base_query().select(cs.boolean().exclude('converted').sum())
flag_shares = flag_totals.with_columns(
    (pl.all() / c.processed).round(4).name.suffix('_pct')
)
# `processed_pct` is just processed / processed, so it is left out of the display.
(
    flag_shares
    .select(cs.matches('.*_pct').exclude('processed_pct'))
    .collect(engine='streaming')
    .to_pandas()
)

Unnamed: 0,compliant_pct,has_drugs_pct,has_pricing_pct
0,0.8747,0.7092,0.3336


### Overall Summary Statistics
- 2,992 files were processed, of which 87% were compliant with CMS requirements. 71% of the files had drugs listed, and 33% had both drugs and corresponding prices.

## State-based Summary Statistics

In [5]:
# Per-state totals: the same boolean flag sums as the overall summary,
# grouped by `state` and ordered alphabetically by state code.
per_state_totals = (
    base_query()
    .group_by("state")
    .agg(cs.boolean().exclude('converted').sum())
)
per_state_totals.sort("state").collect(engine='streaming').to_pandas()

Unnamed: 0,state,processed,compliant,has_drugs,has_pricing
0,AK,26,17,14,4
1,AL,74,66,47,15
2,AR,74,69,53,33
3,AZ,74,68,40,29
4,CA,74,66,57,28
5,CO,74,63,59,28
6,CT,32,22,12,0
7,DE,14,14,9,2
8,FL,74,66,47,33
9,GA,74,69,57,18


In [6]:
# Per-state rates: divide each compliance / has_* total by that state's
# processed count, rounded to 4 decimals and stored under a `_pct` suffix.
per_state_totals = (
    base_query()
    .group_by("state")
    .agg(cs.boolean().exclude('converted').sum())
)
per_state_rates = per_state_totals.with_columns(
    (cs.matches('(?i)comp|has') / c.processed).round(4).name.suffix('_pct')
)
# Keep only the state key and the derived rate columns for display.
(
    per_state_rates
    .select(cs.matches('state|.*_pct').exclude('processed_pct'))
    .sort("state")
    .collect(engine='streaming')
    .to_pandas()
)

Unnamed: 0,state,compliant_pct,has_drugs_pct,has_pricing_pct
0,AK,0.6538,0.5385,0.1538
1,AL,0.8919,0.6351,0.2027
2,AR,0.9324,0.7162,0.4459
3,AZ,0.9189,0.5405,0.3919
4,CA,0.8919,0.7703,0.3784
5,CO,0.8514,0.7973,0.3784
6,CT,0.6875,0.375,0.0
7,DE,1.0,0.6429,0.1429
8,FL,0.8919,0.6351,0.4459
9,GA,0.9324,0.7703,0.2432
