In [1]:
import pandas as pd
import sqlite3
from tqdm.notebook import tqdm

In [2]:
pd.set_option("display.max_columns", 500)

In [3]:
#Listing currently existing tables in the database
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
        SELECT name
        FROM sqlite_master 
        WHERE type ='table' 
        AND name NOT LIKE 'sqlite_%';
        """ 

    test_df = pd.read_sql(query, db)

display(test_df)

Unnamed: 0,name
0,taxonomy
1,hop_teaming
2,npidata
3,filtered_hop_teaming


In [4]:
# Test to confirm the tables loaded.
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT COUNT(*) AS count_all
    FROM filtered_hop_teaming;
    """ 
    
    test = pd.read_sql(query, db)

test

Unnamed: 0,count_all
0,132000


In [46]:
# Find NPIs with largest number of referrals
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT SUM(transaction_count) AS total_referrals, to_npi, n.provider_org_name
    FROM filtered_hop_teaming AS f
    JOIN npidata AS n
    ON f.to_npi = n.npi
    GROUP BY to_npi
    ORDER BY total_referrals DESC
    LIMIT 20;
    """ 
    
    test = pd.read_sql(query, db)

test

Unnamed: 0,total_referrals,to_npi,provider_org_name
0,1113176,1104202761,VANDERBILT UNIVERSITY MEDICAL CENTER
1,876679,1093741464,"ADVANCED DIAGNOSTIC IMAGING, PC"
2,849974,1437194669,SAINT THOMAS MEDICAL PARTNERS
3,764787,1396882205,VANDERBILT UNIVERSITY MEDICAL CENTER
4,711886,1861478489,RADIOLOGY ALLIANCE PC
5,642771,1003863580,"ASSOCIATED PATHOLOGISTS, LLC"
6,409468,1245393057,CENTENNIAL HEART LLC
7,382116,1235186800,"PATHGROUP LABS, LLC"
8,345431,1215932413,"ANESTHESIA MEDICAL GROUP, PC"
9,311019,1811955917,TENNESSEE ONCOLOGY PLLC


In [51]:
# Find competitor hosptitals with the largest number of total referrals.
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT SUM(transaction_count) AS total_referrals, to_npi, n.provider_org_name
    FROM filtered_hop_teaming AS f
    JOIN npidata AS n
    ON f.to_npi = n.npi
    GROUP BY to_npi, provider_org_name
    HAVING provider_org_name NOT LIKE '%VANDERBILT%'
    ORDER BY total_referrals DESC
    LIMIT 10;
    """ 
    
    test = pd.read_sql(query, db)

test

Unnamed: 0,total_referrals,to_npi,provider_org_name
0,876679,1093741464,"ADVANCED DIAGNOSTIC IMAGING, PC"
1,849974,1437194669,SAINT THOMAS MEDICAL PARTNERS
2,711886,1861478489,RADIOLOGY ALLIANCE PC
3,642771,1003863580,"ASSOCIATED PATHOLOGISTS, LLC"
4,409468,1245393057,CENTENNIAL HEART LLC
5,382116,1235186800,"PATHGROUP LABS, LLC"
6,345431,1215932413,"ANESTHESIA MEDICAL GROUP, PC"
7,311019,1811955917,TENNESSEE ONCOLOGY PLLC
8,257331,1023055126,"HCA HEALTH SERVICES OF TENNESSEE, INC."
9,256085,1548284060,HERITAGE MEDICAL ASSOCIATES PC


In [55]:
# Find competitor hosptitals with the largest number of total referrals by from_npi
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    WITH from_npidata AS (
        SELECT *
        FROM npidata
    )
    SELECT SUM(transaction_count) AS total_referrals
        , to_npi
        , n.provider_org_name
        , from_npi.*
    FROM filtered_hop_teaming AS f
    JOIN npidata AS n
    ON f.to_npi = n.npi
    JOIN from_npidata 
    ON f.from_npi = from_npidata.npi
    GROUP BY to_npi, provider_org_name, from_npi
    HAVING provider_org_name NOT LIKE '%VANDERBILT%'
    ORDER BY total_referrals DESC
    LIMIT 10;
    """ 
    
    test = pd.read_sql(query, db)

test

DatabaseError: Execution failed on sql '
    WITH from_npidata AS (
        SELECT *
        FROM npidata
    )
    SELECT SUM(transaction_count) AS total_referrals
        , to_npi
        , n.provider_org_name
        , from_npi.*
    FROM filtered_hop_teaming AS f
    JOIN npidata AS n
    ON f.to_npi = n.npi
    JOIN from_npidata 
    ON f.from_npi = from_npidata.npi
    GROUP BY to_npi, provider_org_name, from_npi
    HAVING provider_org_name NOT LIKE '%VANDERBILT%'
    ORDER BY total_referrals DESC
    LIMIT 10;
    ': no such table: from_npi

In [None]:
# Tingting's code for exporting to Neo4j

nodes = list(set(ht.from_npi.tolist() + ht.to_npi.tolist()))
node_df = pd.DataFrame({'npi:ID': nodes})
node_df[':LABEL'] = "Provider"
node_df.to_csv('import/nodes.csv', index = False)
edges = pd.DataFrame({':START_ID' : ht.from_npi, 'patient_count': ht.patient_count, 
                      'transaction_count': ht.transaction_count, 
                     ':END_ID' : ht.to_npi})
edges[':TYPE'] = 'REFERRED_TO'
edges.to_csv('import/edges.csv', index = False)

In [53]:
# Test to confirm the tables loaded.
with sqlite3.connect('../data/hcbb.sqlite') as db :
    query = """
    SELECT *
    FROM npidata
    LIMIT 1;
    """ 
    
    test = pd.read_sql(query, db)

test

Unnamed: 0,npi,entity_type_code,provider_org_name,provider_last_name,provider_first_name,provider_middle_name,provider_name_prefix,provider_name_suffix,provider_credential,provider_business_address_1,provider_business_address_2,provider_business_city,provider_business_state,provider_business_zip,taxonomy_code,provider_business_zip5
0,1134122187,1,,RUDNICKE,CHERYL,DENISE,MRS.,,CRNP,250 25TH AVE N,STE 412,NASHVILLE,TN,372031632,363L00000X,37203
