In [1]:
import sys
import json
import pickle
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas, pd_writer
import getpass as gt
import pandas as pd
import numpy as np

from snowflake_conn import *

# Lift pandas' display truncation for both columns and rows so frames render in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)


In [2]:
# Presumably opens the Snowflake connection used by this notebook.
# NOTE(review): get_connection is not defined in the cells shown here; it presumably
# comes from the `from snowflake_conn import *` star import -- confirm. `conn` itself
# is not referenced by the cells shown here, which call read_sql(...) directly.
conn = get_connection()

### Step 1: Get raw data
If we had pharmacy claims, we would also need to pull them in this step.

In [16]:
# Medical claims: pull the claim-line columns used here (patient/claim identifiers,
# claim start/end dates, both service-category labels, paid and allowed amounts).
# NOTE(review): the query caps the result at 1,000,000 rows with LIMIT and has no
# ORDER BY, so which rows come back is presumably not guaranteed to be stable
# between runs -- confirm whether a reproducible sample is required.
# NOTE(review): read_sql is not defined in the cells shown here; presumably it comes
# from the `from snowflake_conn import *` star import.
sql = '''
SELECT
patient_id
, claim_id
, claim_line_number
, claim_start_date
, claim_end_date
, service_category_1
, service_category_2
, paid_amount
, allowed_amount
FROM SANDBOX_FFS.CORE.MEDICAL_CLAIM 
LIMIT 1000000
'''
medical_claim = read_sql(sql)
# Sanity check: print (rows, columns), then display the first row.
print(medical_claim.shape)
medical_claim.head(1)

(1000000, 9)


Unnamed: 0,patient_id,claim_id,claim_line_number,claim_start_date,claim_end_date,service_category_1,service_category_2,paid_amount,allowed_amount
0,-10000010272692,-10000930836616201671,5,,2016-02-02,Office Visit,Office Visit,350.6,


In [11]:
# Distinct top-level service categories present in the loaded claims.
medical_claim['service_category_1'].unique()

array(['Office Visit', 'Outpatient', 'Inpatient'], dtype=object)

In [12]:
# Distinct second-level service categories present in the loaded claims.
medical_claim['service_category_2'].unique()

array(['Office Visit', 'Hospice', 'Outpatient Hospital or Clinic',
       'Urgent Care', 'Skilled Nursing'], dtype=object)

In [17]:
# Eligibility: load every column of the eligibility table.
# NOTE(review): the saved output of this cell shows a member-level row (birth date,
# race, payer fields) -- confirm that is acceptable to keep in a shared notebook.
sql = '''SELECT * FROM SANDBOX_FFS.CORE.ELIGIBILITY;'''
eligibility = read_sql(sql)
# Sanity check: print (rows, columns), then display the first row.
print(eligibility.shape)
eligibility.head(1)

(2705, 23)


Unnamed: 0,patient_id,member_id,gender,race,birth_date,death_date,death_flag,enrollment_start_date,enrollment_end_date,payer,payer_type,original_reason_entitlement_code,dual_status_code,medicare_status_code,first_name,last_name,address,city,state,zip_code,phone,data_source,tuva_last_run
0,-10000010254647,,male,black or african american,1990-01-01,,0,2025-01-01,2025-03-31,medicare,medicare,1,2,20,,,,,,,,medicare_lds,2023-10-05 16:59:34.487348+00:00


In [21]:
# Member months.
# First query: count the distinct patient_id values in the member-months table and
# print the one-row result.
sql = '''SELECT COUNT(DISTINCT(patient_id)) FROM SANDBOX_FFS.FINANCIAL_PMPM.MEMBER_MONTHS;'''
print(read_sql(sql))

# Second query: load the full member-months table (`sql` is rebound here).
sql = '''SELECT * FROM SANDBOX_FFS.FINANCIAL_PMPM.MEMBER_MONTHS'''
member_months = read_sql(sql)
# Sanity check: print (rows, columns), then display the first row.
print(member_months.shape)
member_months.head(1)

   count(distinct(patient_id))
0                         2705
(8002, 4)


Unnamed: 0,patient_id,year_month,payer,tuva_last_run
0,-10000010254647,202501,medicare,2023-10-05 16:59:34.487348+00:00


### Step 2: Get service category data
See the `svc_cat_grouper` notebook for more info on this process
- Pull in medical claim info from `SANDBOX_CLAIMS.core.medical_claim`
- Pull out professional medical claims -- for each claim number, have one row for each claim line
    - Acute inpatient: place of service code = 21
    - Ambulance: place of service code = 41, 42; HCPCS code between A0425 and A0436
    - Ambulatory surgery: place of service code = 24; NOT in DME
    - Dialysis: place of service code = 65
    - DME (Durable Medical Equipment): HCPCS code between E0100 and E8002
    - ER: place of service code = 23
    - Home health: place of service code = 12; NOT in DME
    - Hospice: place of service code = 34
    - Inpatient psychiatric: place of service code = 51, 55, 56
    - Inpatient rehab: place of service code = 61
    - Lab: place of service code = 81
    - Office visit: place of service code = 11, 02
    - Outpatient hospital / clinic: place of service code = 15, 17, 19, 22, 49, 50, 60, 71, 72
    - Outpatient psychiatric: place of service code = 52, 53, 57, 58
    - Outpatient rehab: place of service code = 62
    - Skilled nursing: place of service code = 31, 32; NOT in DME
    - Urgent care: place of service code = 20
- Pull out institutional medical claims -- for each claim number, have one row for each claim line
    - Acute inpatient: revenue center code = 0100, 0101, 0110, 0111, 0112, 0113, 0114, 0116, 0117, 0118, 0119, 0120, 0121, 0122, 0123, 0124, 0126, 0127, 0128, 0129, 0130, 0131, 0132, 0133, 0134, 0136, 0137, 0138, 0139, 0140, 0141, 0142, 0143, 0144, 0146, 0147, 0148, 0149, 0150, 0151, 0152, 0153, 0154, 0156, 0157, 0158, 0159, 0160, 0164, 0167, 0169, 0170, 0171, 0172, 0173, 0174, 0179, 0190, 0191, 0192, 0193, 0194, 0199, 0200, 0201, 0202, 0203, 0204, 0206, 0207, 0208, 0209, 0210, 0211, 0212, 0213, 0214, 0219, 1000, 1001, 1002; has a non-null MS DRG code; has a non-null APR DRG code; bill type code starts with 11, 12
    - Dialysis: bill type code starts with 72
    - ER: revenue center code = 0450, 0451, 0452, 0459, 0981; bill type code starts with 13, 71, 73
    - Home health: bill type code starts with 31, 32, 33
    - Hospice: bill type code starts with 81, 82
    - Lab: bill type code starts with 14
    - Outpatient hospital / clinic: bill type code starts with 13, 71, 73; NOT in urgent care / ER
    - Outpatient psychiatric: bill type code starts with 52
    - Skilled nursing: bill type code starts with 21, 22
    - Urgent care: revenue center code = 0456; bill type code starts with 13, 71, 73
- Combine all professional and institutional claims
- Add a second service category group (in addition to the labels like "dialysis" or "urgent care" above):
    - Ancillary: ambulance (*professional claims only*), durable medical equipment (*professional claims only*), lab
    - Inpatient: acute inpatient, inpatient psychiatric, inpatient rehab, skilled nursing
    - Office Visit: office visit
    - Outpatient: ambulatory surgery, dialysis, ER, home health, hospice, outpatient hospital / clinic, outpatient psychiatric, outpatient rehab (*professional claims only*), urgent care
    - Other: null
- Final result of this process: table with claim number, claim line number, claim type (institutional or professional), service category 1 (ancillary, inpatient, office visit, outpatient, other), service category 2 (e.g., dialysis, urgent care)

In [22]:
# Patient spend with service categories: load every column of the intermediate table.
# Judging by the saved output, each row carries patient_id, year_month, both
# service-category labels, and total paid / total allowed amounts.
sql = '''SELECT * FROM SANDBOX_FFS.FINANCIAL_PMPM._INT_PATIENT_SPEND_WITH_SERVICE_CATEGORIES;'''
service_categories = read_sql(sql)
# Sanity check: print (rows, columns), then display the first ten rows.
print(service_categories.shape)
service_categories.head(10)

(246725, 7)


Unnamed: 0,patient_id,year_month,service_category_1,service_category_2,total_paid,total_allowed,tuva_last_run
0,-10000010282483,202106,Office Visit,Office Visit,29178.72,,2023-10-05 16:59:34.487348+00:00
1,-10000010282488,201903,Office Visit,Office Visit,9029.02,,2023-10-05 16:59:34.487348+00:00
2,-10000010282494,201807,Office Visit,Office Visit,1203.71,,2023-10-05 16:59:34.487348+00:00
3,-10000010282494,202102,Office Visit,Office Visit,95.9,,2023-10-05 16:59:34.487348+00:00
4,-10000010282491,202201,Office Visit,Office Visit,723.3,,2023-10-05 16:59:34.487348+00:00
5,-10000010282512,201712,Office Visit,Office Visit,2323.28,,2023-10-05 16:59:34.487348+00:00
6,-10000010282525,201612,Office Visit,Office Visit,7614.62,,2023-10-05 16:59:34.487348+00:00
7,-10000010282525,202101,Office Visit,Office Visit,7551.04,,2023-10-05 16:59:34.487348+00:00
8,-10000010282524,202105,Office Visit,Office Visit,7758.48,,2023-10-05 16:59:34.487348+00:00
9,-10000010282528,201711,Office Visit,Office Visit,8786.16,,2023-10-05 16:59:34.487348+00:00


In [25]:
# Count how often each service_category_2 label occurs within each service_category_1 group.
# NOTE(review): this needs `service_categories` to still be the patient-spend frame.
# The saved KeyError matches the execution counts: this cell ran as In [25], after the
# In [24] cell below had rebound `service_categories` to the allowed-amount pivot,
# whose displayed columns do not include service_category_1.
(
    service_categories
    .groupby('service_category_1')['service_category_2']
    .value_counts()
)

KeyError: 'service_category_1'

In [24]:
# Service-category-1 allowed-amount pivot: load every column of the intermediate table.
# Judging by the saved output, each row is a patient_id / year_month pair with one
# "*_allowed" column per top-level service category.
# NOTE(review): this rebinds `service_categories`, which an earlier cell bound to the
# patient-spend frame with different columns; any cell that expects the earlier frame
# fails once this has run. Consider a distinct variable name after checking which
# later cells read `service_categories`.
sql = '''SELECT * FROM SANDBOX_FFS.FINANCIAL_PMPM._INT_SERVICE_CATEGORY_1_ALLOWED_PIVOT;'''
service_categories = read_sql(sql)
# Sanity check: print (rows, columns), then display the first ten rows.
print(service_categories.shape)
service_categories.head(10)

(196501, 9)


Unnamed: 0,patient_id,year_month,inpatient_allowed,outpatient_allowed,office_visit_allowed,ancillary_allowed,other_allowed,pharmacy_allowed,tuva_last_run
0,-10000010282494,202202,0.0,0.0,,0.0,0.0,0.0,2023-10-05 16:59:34.487348+00:00
1,-10000010282704,201611,0.0,0.0,,0.0,0.0,0.0,2023-10-05 16:59:34.487348+00:00
2,-10000010283154,201912,0.0,0.0,,0.0,0.0,0.0,2023-10-05 16:59:34.487348+00:00
3,-10000010283202,201803,0.0,0.0,,0.0,0.0,0.0,2023-10-05 16:59:34.487348+00:00
4,-10000010283361,201710,0.0,0.0,0.0,0.0,0.0,0.0,2023-10-05 16:59:34.487348+00:00
5,-10000010283927,201504,0.0,0.0,,0.0,0.0,0.0,2023-10-05 16:59:34.487348+00:00
6,-10000010284880,201602,0.0,0.0,,0.0,0.0,0.0,2023-10-05 16:59:34.487348+00:00
7,-10000010284878,202208,0.0,0.0,0.0,0.0,0.0,0.0,2023-10-05 16:59:34.487348+00:00
8,-10000010270615,201910,0.0,0.0,0.0,0.0,0.0,0.0,2023-10-05 16:59:34.487348+00:00
9,-10000010284908,201709,0.0,0.0,,0.0,0.0,0.0,2023-10-05 16:59:34.487348+00:00
