In [1]:
import duckdb
import polars as pl
from polars import col as c
import polars.selectors as cs
from config import BASE_PARQUET_PATH, DATABASE_URL


## Hospital Analysis

Using the hospital data stored in the `hospital_directory` table of the duckdb database, perform the following analyses:

1. Determine the total number of hospitals processed (`processed`)
2. Determine the number of hospitals with files compliant with CMS requirements (`compliant`)
3. Determine the number of hospitals with drugs listed in the file (`has_drugs`)
4. Determine the number of hospitals with drugs and corresponding prices (`has_pricing`)

In [20]:
def run_query(query: str) -> pl.LazyFrame:
    """Execute ``query`` against the DuckDB database and return the result lazily.

    A connection to ``DATABASE_URL`` is opened for this call only. The result
    set is fetched as an Arrow table and wrapped in a polars ``DataFrame``
    while the connection is still open.

    Args:
        query: SQL text to execute.

    Returns:
        A ``pl.LazyFrame`` over the fetched result.
    """
    with duckdb.connect(DATABASE_URL) as connection:
        arrow_result = connection.execute(query=query).fetch_arrow_table()
        fetched = pl.DataFrame(arrow_result)
        return fetched.lazy()

def base_query():
    """Return all rows and columns of ``hospital_directory`` as a lazy frame.

    Delegates to ``run_query``, so every call issues a fresh query.
    """
    return run_query("""select * from hospital_directory""")


In [21]:
# Overall totals: sum every boolean column except `converted`, giving one
# row of counts across the whole table.
(
    base_query()
    .select(
        cs.boolean().exclude("converted").sum()
    )
    .collect(engine="streaming")
    .to_pandas()
)

Unnamed: 0,processed,compliant,has_drugs,has_pricing
0,2086,1697,1378,634


In [22]:
# Overall rates: divide each total by the `processed` total, round to four
# decimals, and keep only the `*_pct` columns other than `processed_pct`.
(
    base_query()
    .select(
        cs.boolean().exclude("converted").sum()
    )
    .with_columns(
        (pl.all() / pl.col("processed")).round(4).name.suffix("_pct")
    )
    .select(
        cs.matches(".*_pct").exclude("processed_pct")
    )
    .collect(engine="streaming")
    .to_pandas()
)

Unnamed: 0,compliant_pct,has_drugs_pct,has_pricing_pct
0,0.8135,0.6606,0.3039


### Overall Summary Statistics
- 2,086 files were processed, of which 81.35% were compliant with CMS requirements. 66.06% of the files had drugs listed, and 30.39% had both drugs and corresponding prices.

## State-based Summary Statistics

In [23]:
# Per-state totals: within each `state`, sum every boolean column except
# `converted`, then order the rows by state.
(
    base_query()
    .group_by("state")
    .agg(
        cs.boolean().exclude("converted").sum()
    )
    .sort("state")
    .collect(engine="streaming")
    .to_pandas()
)

Unnamed: 0,state,processed,compliant,has_drugs,has_pricing
0,AK,26,17,14,4
1,AL,43,39,26,8
2,AR,46,39,30,22
3,AZ,43,36,17,12
4,CA,50,41,40,21
5,CO,45,33,31,16
6,CT,33,22,12,1
7,DE,15,13,7,2
8,FL,50,35,25,12
9,GA,47,43,36,13


In [24]:
# Per-state rates: divide the columns matching `(?i)comp|has` by the state's
# `processed` total, round to four decimals, and keep `state` plus the
# resulting `*_pct` columns, ordered by state.
(
    base_query()
    .group_by("state")
    .agg(
        cs.boolean().exclude("converted").sum()
    )
    .with_columns(
        (cs.matches("(?i)comp|has") / pl.col("processed"))
        .round(4)
        .name.suffix("_pct")
    )
    .select(
        cs.matches("state|.*_pct").exclude("processed_pct")
    )
    .sort("state")
    .collect(engine="streaming")
    .to_pandas()
)

Unnamed: 0,state,compliant_pct,has_drugs_pct,has_pricing_pct
0,AK,0.6538,0.5385,0.1538
1,AL,0.907,0.6047,0.186
2,AR,0.8478,0.6522,0.4783
3,AZ,0.8372,0.3953,0.2791
4,CA,0.82,0.8,0.42
5,CO,0.7333,0.6889,0.3556
6,CT,0.6667,0.3636,0.0303
7,DE,0.8667,0.4667,0.1333
8,FL,0.7,0.5,0.24
9,GA,0.9149,0.766,0.2766
