In [1]:
import sqlite3
from contextlib import closing

import pandas as pd
from tqdm.notebook import tqdm # Provides progress bar for long tasks

### Listing currently existing tables in the database

In [2]:
# List every user-created table in the project database (SQLite's internal
# `sqlite_%` tables are excluded by the query).
# sqlite3's own context manager only commits/rolls back the transaction and
# leaves the connection open, so closing() is used to release the database
# handle as soon as the result has been read into pandas.
with closing(sqlite3.connect('../data/hcbb_group_reviews.sqlite')) as db:
    query = """
    SELECT name
    FROM sqlite_master 
    WHERE type ='table' 
    AND name NOT LIKE 'sqlite_%';
    """

    db_table_list = pd.read_sql(query, db)

display(db_table_list)

Unnamed: 0,name
0,cbsa
1,npidata
2,taxonomy
3,hop_teaming_raw
4,filtered_hop_teaming


### How many records do we originally have in the hop_teaming table?

**NOTE: The code for the following cell takes ~5 minutes to run. It has been converted to markdown so that it is not re-run by accident.**

Result: 203,330,907 records

### Filter `from_npi` to be entity type 1 and `to_npi` to be entity type 2

**NOTE: The code for the following cell takes ~5-10 minutes to run. It has been converted to markdown so that it is not re-run by accident.**

Result: 50,460,705

### Filter so that the `transaction_count` is >= 50 and `average_day_wait` < 50

Final `filtered_hop_teaming` records count: 8,846,709

### Creating a new table called `filtered_hop_teaming`

**IMPORTANT! This load into the database should only be run once.** Running it multiple times will create duplicate entries in the database. To guard against re-running this code by accident, the code here has been converted into markdown. **If you need to rebuild the database, delete the `data/hcbb_group_reviews.sqlite` file and re-run this cell as code. You will also need to re-run any other related scripts that build other tables in the database.**

### Testing

In [3]:
# Sanity check: pull the first five rows of `filtered_hop_teaming`.
# closing() guarantees the connection is closed on exit; a bare
# `with sqlite3.connect(...)` would only end the transaction and leave the
# connection open.
with closing(sqlite3.connect('../data/hcbb_group_reviews.sqlite')) as db:

    query = """
    SELECT *
    FROM filtered_hop_teaming
    LIMIT 5;
    """

    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1033194220,1003000308,62,80,26.175,53.78
1,1003818055,1003000563,101,146,0.0,0.0
2,1013967371,1003001538,46,61,21.033,27.679
3,1023073459,1003001553,25,51,34.314,32.55
4,1023003142,1003001553,164,222,33.923,52.133


In [4]:
# Fetch one row per distinct (from_npi, to_npi) pair in `filtered_hop_teaming`.
# NOTE(review): this loads every pair into a DataFrame; presumably the goal is
# to compare its length with the table's total row count to confirm there are
# no duplicate pairs -- confirm, and consider counting in SQL instead.
# closing() is used because sqlite3's own context manager does not close the
# connection when the block exits.
with closing(sqlite3.connect('../data/hcbb_group_reviews.sqlite')) as db:
    query = """
    SELECT from_npi, to_npi
    FROM filtered_hop_teaming
    GROUP BY from_npi, to_npi;
    """

    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,from_npi,to_npi
0,1003000126,1033102504
1,1003000126,1053363853
2,1003000126,1093102857
3,1003000126,1104203371
4,1003000126,1134117393
...,...,...
8846704,1992999874,1215958657
8846705,1992999874,1376917070
8846706,1992999874,1588613400
8846707,1992999874,1861818965


In [5]:
# Count the total number of rows in `filtered_hop_teaming`.
# closing() ensures the connection is actually closed afterwards; the
# connection's own context manager only commits or rolls back.
with closing(sqlite3.connect('../data/hcbb_group_reviews.sqlite')) as db:
    query = """
    SELECT COUNT(*) AS count_all 
    FROM filtered_hop_teaming;
    """

    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,count_all
0,8846709


In [6]:
# Spot check: show up to ten rows whose `from_npi` matches one specific value.
# closing() releases the connection when the block exits; a bare
# `with sqlite3.connect(...)` would leave it open.
with closing(sqlite3.connect('../data/hcbb_group_reviews.sqlite')) as db:
    query = """
    SELECT *
    FROM filtered_hop_teaming
    WHERE from_npi = '1043215882'
    LIMIT 10;
    """

    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,from_npi,to_npi,patient_count,transaction_count,average_day_wait,std_day_wait
0,1043215882,1063843951,18,53,9.887,20.953
1,1043215882,1093741464,41,65,46.585,59.003
2,1043215882,1104202761,90,167,26.359,34.179
3,1043215882,1104277656,11,50,10.96,10.455
4,1043215882,1205886264,107,151,32.536,45.799
5,1043215882,1316018070,28,66,16.5,27.157
6,1043215882,1336104280,41,59,40.814,55.973
7,1043215882,1356379382,58,84,26.56,37.394
8,1043215882,1396882205,35,58,31.517,34.952
9,1043215882,1467460725,184,304,24.539,44.271


**Quick fix for dropping tables (DO NOT RUN UNLESS YOU ARE RECREATING THE TABLES)**