In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm # Provides progress bar for long tasks

### Listing currently existing tables in the database

In [8]:
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT name
    FROM sqlite_master 
    WHERE type ='table' 
    AND name NOT LIKE 'sqlite_%';
    """ 
    
    db_table_list = pd.read_sql(query, db)

display(db_table_list)

Unnamed: 0,name
0,hop_teaming
1,taxonomy
2,npidata
3,filtered_hop_teaming


### How many records do we originally have in the hop_teaming table?

**NOTE: The following cell codes runs in ~5 minutes. Set to markdown for security.**

with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT COUNT(*)
    FROM hop_teaming;
    """ 
    
    db_hop_teaming_count = pd.read_sql(query, db)

display(db_hop_teaming_count)

Result: ~203 million records

### Filter `from_npi` to be entity type 1 and `to_npi` to be entity type 2

**NOTE: The following cell codes runs in ~5-10 minutes. Set to markdown for security.**

In [3]:
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    WITH npi_entity_type_1 AS (
        SELECT npi
        FROM npidata 
        WHERE entity_type_code = 1
    ), npi_entity_type_2 AS (
        SELECT npi
        FROM npidata 
        WHERE entity_type_code = 2
    )
    SELECT *
    FROM hop_teaming
    WHERE from_npi IN npi_entity_type_1
    AND to_npi IN npi_entity_type_2
    """
    
    filtered_hop_teaming = pd.read_sql(query, db)

display(filtered_hop_teaming.shape)
display(filtered_hop_teaming.head())

(233546, 6)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1043215882,1003028770,29,44,47.455,56.183
1,1043232879,1003028770,24,24,112.333,80.894
2,1043302466,1003028770,24,26,98.192,97.772
3,1033297429,1003028770,56,62,53.145,58.831
4,1043206329,1003028770,173,177,97.864,81.756


### Filter so that the `transaction_count` is >= 50 and `average_day_wait` <= 50

In [4]:
filtered_hop_teaming = filtered_hop_teaming[
    (filtered_hop_teaming["transaction_count"] >= 50) |
    (filtered_hop_teaming["average_day_wait"] <= 50)
]

display(filtered_hop_teaming.shape)
display(filtered_hop_teaming.head())

(132000, 6)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1043215882,1003028770,29,44,47.455,56.183
3,1033297429,1003028770,56,62,53.145,58.831
4,1043206329,1003028770,173,177,97.864,81.756
9,1003922881,1003028770,17,21,45.619,41.519
10,1003963976,1003028770,2535,3945,0.0,0.0


Finally `filtered_hop_teaming` records count: 108401

### Creating a new table called `filtered_hop_teaming`

**IMPORTANT! This loading into the database should only be run once.** If you run this multiple times, it will create duplicate entries in the database. For the security of not re-running this code by accident, the code here is converted into markdown. **If you need to rebuild the database, delete the `data/hcbb.sqlite` file and re-run this cell as code. You will also need to make sure to re-run any other related scripts that builds other tables in the database.**

In [5]:
with sqlite3.connect('../data/hcbb.sqlite') as db:
    filtered_hop_teaming.to_sql(
        'filtered_hop_teaming', 
        db, 
        if_exists = 'append', 
        index = False)

    # When done, print done
    print('Task done.')

Task done.


### Testing

In [6]:
with sqlite3.connect('../data/hcbb.sqlite') as db :
    
    query = """
    SELECT *
    FROM filtered_hop_teaming
    LIMIT 5;
    """ 
    
    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1043215882,1003028770,29,44,47.455,56.183
1,1033297429,1003028770,56,62,53.145,58.831
2,1043206329,1003028770,173,177,97.864,81.756
3,1003922881,1003028770,17,21,45.619,41.519
4,1003963976,1003028770,2535,3945,0.0,0.0


In [7]:
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT COUNT(*) AS count_all 
    FROM filtered_hop_teaming;
    """ 
    
    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,count_all
0,132000
