In [29]:
import pandas as pd
import pyodbc
connection_string = ("DRIVER={SQL Server};"
"SERVER=USDF11V0954;"
"DATABASE=Actuarial_AH;"
"Trust_Connection=yes;")

conn = pyodbc.connect(connection_string)
cursor = conn.cursor()

In [28]:
med = """
WITH WEIGHTLOSSCOHORT1 AS (
    SELECT DISTINCT MEMBER_ID,
           YEAR(SERVICE_DATE) AS PRESCRIPTION_YEAR
    FROM Actuarial_AH.DBO.SN_Rx
    WHERE ART_DRUGGRPS_L2 IN ('GLP-1 Agonists', 'Weight Loss Agents')
    AND SERVICE_DATE >= '2021-01-01'
    AND SERVICE_DATE < '2025-01-01'
),
WEIGHT_DX AS (
    SELECT DISTINCT MEMBER_ID
    FROM Actuarial_AH.DBO.SN_Medical
    WHERE ICD10_SECTION LIKE '%Overweight%'
)

SELECT M.MEDICAL_CLAIM_ID, 
       M.MEMBER_ID, 
       M.PAID_DATE, 
       YEAR(M.PAID_DATE) AS PAID_YEAR,
       M.SERVICE_DATE,
       YEAR(M.SERVICE_DATE) AS SERVICE_YEAR,
       M.MEDICAL_PAID_AMOUNT, 
       M.IS_TELEMEDICINE, 
       M.IS_ER_AVOIDABLE, 
       M.DIAGNOSIS_DESC_ICD10_1 AS PRIMARY_DX,
       M.PROCEDURE_DESC, 
       M.ARTTOS_V2_L1, 
       M.ARTTOS_V2_L3, 
       M.DX_IS_CHRONIC,
       M.ICD10_CHAPTER, 
       M.ICD10_CATEGORY, 
       M.ICD10_SECTION, 
       M.MEG_EPISODE_DESCRIPTION, 
       M.OP_SURG_INC,
       M.IS_PCP_VISIT, 
       M.INCLUDED_SPECIALIST, 
       M.ER_VISIT_FLAG, 
       M.IS_URGENT_CARE_VISIT, 
       M.IS_PREVENTIVE_VISIT, 
       M.IP_ADMIT_INC
FROM [Actuarial_AH].[dbo].[SN_Medical] M
JOIN Actuarial_AH.dbo.SN_Member MB ON MB.MEMBER_ID = M.MEMBER_ID
WHERE M.PAID_DATE BETWEEN '2021-01-01' AND '2024-12-31'
AND MB.MONTH_KEY = '2024-12-01' 
AND MB.MEDICAL_ENROLLMENT_STATUS = 3
AND MB.MEDICAL_CONT_ENRLMNT_START <= '2021-01-01'
AND MB.MEMBER_ID NOT IN (
    SELECT MEMBER_ID
    FROM WEIGHTLOSSCOHORT1
)
AND MB.MEMBER_ID IN (
    SELECT MEMBER_ID
    FROM WEIGHT_DX
)
"""
med = pd.read_sql(med, conn)

NameError: name 'conn' is not defined

In [1]:
#create mock dataset to replace company data
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Set seed for reproducibility
np.random.seed(42)

# STEP 1: Simulate all member IDs and GLP-1 user subset
all_member_ids = np.random.choice(range(10000, 20000), size=10000, replace=False)
glp1_members = np.random.choice(all_member_ids, size=4000, replace=False)
non_glp1_members = list(set(all_member_ids) - set(glp1_members))

# STEP 2: From non-GLP-1 members, select those with overweight/obesity diagnosis
weight_dx_members = np.random.choice(non_glp1_members, size=5000, replace=True)

# STEP 3: Simulate 50,000 claims from those members
num_claims = 50000
claim_member_ids = np.random.choice(weight_dx_members, size=num_claims, replace=True)
claim_ids = np.arange(1, num_claims + 1)

# Random dates
def random_date(start, end):
    return start + timedelta(days=random.randint(0, (end - start).days))

start_date = datetime(2021, 1, 1)
end_date = datetime(2024, 12, 31)
paid_dates = [random_date(start_date, end_date) for _ in range(num_claims)]
service_dates = [random_date(start_date, end_date) for _ in range(num_claims)]

# Field generators
bools = [0, 1]
primary_dxs = np.random.choice(['Overweight', 'Obesity', 'Morbid Obesity'], size=num_claims)
procedure_descs = np.random.choice(['Office Visit', 'Weight Counseling', 'Lab Panel'], size=num_claims)
arttos_l1 = np.random.choice(['Evaluation & Management', 'Therapy'], size=num_claims)
arttos_l3 = np.random.choice(['Office Visit', 'Nutrition Therapy'], size=num_claims)
icd10_chapters = np.random.choice(['Endocrine', 'Digestive'], size=num_claims)
icd10_categories = np.random.choice(['E66', 'Z68'], size=num_claims)
icd10_sections = np.random.choice(['Overweight ICD10 Section', 'Obesity ICD10 Section', 'Morbid Obesity ICD10 Section'], size=num_claims)
meg_episodes = np.random.choice(['Weight Management', 'Preventive Care'], size=num_claims)
paid_amounts = np.round(np.random.exponential(scale=200, size=num_claims), 2)

# STEP 4: Build DataFrame
med = pd.DataFrame({
    'MEDICAL_CLAIM_ID': claim_ids,
    'MEMBER_ID': claim_member_ids,
    'PAID_DATE': paid_dates,
    'PAID_YEAR': [d.year for d in paid_dates],
    'SERVICE_DATE': service_dates,
    'SERVICE_YEAR': [d.year for d in service_dates],
    'MEDICAL_PAID_AMOUNT': paid_amounts,
    'IS_TELEMEDICINE': np.random.choice(bools, size=num_claims),
    'IS_ER_AVOIDABLE': np.random.choice(bools, size=num_claims),
    'PRIMARY_DX': primary_dxs,
    'PROCEDURE_DESC': procedure_descs,
    'ARTTOS_V2_L1': arttos_l1,
    'ARTTOS_V2_L3': arttos_l3,
    'DX_IS_CHRONIC': np.random.choice(bools, size=num_claims),
    'ICD10_CHAPTER': icd10_chapters,
    'ICD10_CATEGORY': icd10_categories,
    'ICD10_SECTION': icd10_sections,
    'MEG_EPISODE_DESCRIPTION': meg_episodes,
    'OP_SURG_INC': np.random.choice(bools, size=num_claims),
    'IS_PCP_VISIT': np.random.choice(bools, size=num_claims),
    'INCLUDED_SPECIALIST': np.random.choice(bools, size=num_claims),
    'ER_VISIT_FLAG': np.random.choice(bools, size=num_claims),
    'IS_URGENT_CARE_VISIT': np.random.choice(bools, size=num_claims),
    'IS_PREVENTIVE_VISIT': np.random.choice(bools, size=num_claims),
    'IP_ADMIT_INC': np.random.choice(bools, size=num_claims)
})

# STEP 5: Preview or save
med.head()

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Unnamed: 0,MEDICAL_CLAIM_ID,MEMBER_ID,PAID_DATE,PAID_YEAR,SERVICE_DATE,SERVICE_YEAR,MEDICAL_PAID_AMOUNT,IS_TELEMEDICINE,IS_ER_AVOIDABLE,PRIMARY_DX,...,ICD10_CATEGORY,ICD10_SECTION,MEG_EPISODE_DESCRIPTION,OP_SURG_INC,IS_PCP_VISIT,INCLUDED_SPECIALIST,ER_VISIT_FLAG,IS_URGENT_CARE_VISIT,IS_PREVENTIVE_VISIT,IP_ADMIT_INC
0,1,17942,2023-02-25,2023,2021-06-20,2021,38.27,1,1,Obesity,...,Z68,Overweight ICD10 Section,Preventive Care,0,1,0,0,0,0,1
1,2,19424,2021-07-28,2021,2021-12-03,2021,266.45,0,0,Obesity,...,E66,Morbid Obesity ICD10 Section,Weight Management,0,1,1,0,0,0,0
2,3,12422,2024-11-21,2024,2023-08-10,2023,252.8,1,0,Obesity,...,E66,Overweight ICD10 Section,Weight Management,0,0,1,0,1,1,0
3,4,15162,2024-05-04,2024,2021-08-22,2021,643.46,1,1,Obesity,...,E66,Overweight ICD10 Section,Weight Management,0,1,0,0,0,0,1
4,5,12272,2023-01-14,2023,2022-08-16,2022,300.92,0,1,Morbid Obesity,...,E66,Morbid Obesity ICD10 Section,Preventive Care,1,1,0,1,1,0,0


In [4]:
rx = """
WITH WEIGHTLOSSCOHORT1 AS (
    SELECT DISTINCT MEMBER_ID,
           YEAR(SERVICE_DATE) AS PRESCRIPTION_YEAR
    FROM Actuarial_AH.DBO.SN_Rx
WHERE ART_DRUGGRPS_L2 IN ('GLP-1 Agonists', 'Weight Loss Agents')
    AND SERVICE_DATE BETWEEN '2021-01-01' AND '2024-12-31'
),
WEIGHT_DX AS (
    SELECT DISTINCT MEMBER_ID
    FROM Actuarial_AH.DBO.SN_Medical
    WHERE ICD10_SECTION LIKE '%Overweight%'
)
SELECT R.RX_CLAIM_ID, 
       R.MEMBER_ID, 
       R.PAID_DATE, 
       R.SERVICE_DATE, 
       R.DRUG_NAME_GENERIC,
	   R.DRUG_NAME_PREFERRED,
	   R.RX_PAID_AMOUNT,
	   R.RX_SCRIPT_COUNT,
	   R.ART_DRUGGRPS_L1,
	   R.ART_DRUGGRPS_L2,
	   YEAR(R.PAID_DATE) AS PAID_YEAR,
YEAR(R.SERVICE_DATE) AS SERVICE_YEAR
FROM [Actuarial_AH].[dbo].[SN_Rx] R
JOIN 
Actuarial_AH.dbo.SN_Member M ON R.MEMBER_ID = M.MEMBER_ID
JOIN Actuarial_AH.dbo.SN_Medical MD ON MD.MEMBER_ID = M.MEMBER_ID
WHERE R.PAID_DATE BETWEEN '2021-01-01' AND '2024-12-31'
AND M.MONTH_KEY = '2024-12-01' 
AND M.MEDICAL_ENROLLMENT_STATUS = 3
AND M.MEDICAL_CONT_ENRLMNT_START <= '2021-01-01'
AND M.MEMBER_ID NOT IN (
    SELECT MEMBER_ID
    FROM WEIGHTLOSSCOHORT1
    GROUP BY MEMBER_ID
	)
AND MD.ICD10_SECTION LIKE '%Overweight%'
"""
rx = pd.read_sql(rx, conn)

  rx = pd.read_sql(rx, conn)


In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Set seed for reproducibility
np.random.seed(42)

# Simulated MEMBER_IDs (replace with your medical_df['MEMBER_ID'].unique() if available)
member_ids = np.random.choice(range(10000, 20000), size=5000, replace=False)

# Create 50,000 Rx claims for those members
num_rx_claims = 50000
rx_claim_ids = np.arange(1, num_rx_claims + 1)
rx_member_ids = np.random.choice(member_ids, size=num_rx_claims, replace=True)

# Date generator
def random_date(start, end):
    return start + timedelta(days=random.randint(0, (end - start).days))

start_date = datetime(2021, 1, 1)
end_date = datetime(2024, 12, 31)
rx_paid_dates = [random_date(start_date, end_date) for _ in range(num_rx_claims)]
rx_service_dates = [random_date(start_date, end_date) for _ in range(num_rx_claims)]

# Non-GLP-1 drug list
non_glp1_drugs = [
    ('metformin', 'Glucophage', 'Biguanides'),
    ('atorvastatin', 'Lipitor', 'Statins'),
    ('lisinopril', 'Prinivil', 'ACE Inhibitors'),
    ('amlodipine', 'Norvasc', 'Calcium Channel Blockers'),
    ('levothyroxine', 'Synthroid', 'Thyroid Agents'),
    ('albuterol', 'Ventolin', 'Beta-2 Agonists'),
]

# Randomly assign drugs
drug_samples = random.choices(non_glp1_drugs, k=num_rx_claims)
generic_names, preferred_names, art_l2 = zip(*drug_samples)
art_l1 = ['Cardiovascular Agents' if g in ['Statins', 'ACE Inhibitors', 'Calcium Channel Blockers'] else 'Endocrine Agents' for g in art_l2]

# Other fields
rx_paid_amounts = np.round(np.random.exponential(scale=100, size=num_rx_claims), 2)
rx_script_counts = np.random.choice([1, 2, 3], size=num_rx_claims)

# Create Rx DataFrame
rx = pd.DataFrame({
    'RX_CLAIM_ID': rx_claim_ids,
    'MEMBER_ID': rx_member_ids,
    'PAID_DATE': rx_paid_dates,
    'SERVICE_DATE': rx_service_dates,
    'DRUG_NAME_GENERIC': generic_names,
    'DRUG_NAME_PREFERRED': preferred_names,
    'RX_PAID_AMOUNT': rx_paid_amounts,
    'RX_SCRIPT_COUNT': rx_script_counts,
    'ART_DRUGGRPS_L1': art_l1,
    'ART_DRUGGRPS_L2': art_l2,
    'PAID_YEAR': [d.year for d in rx_paid_dates],
    'SERVICE_YEAR': [d.year for d in rx_service_dates]
})

# Preview the data
rx.head()

Unnamed: 0,RX_CLAIM_ID,MEMBER_ID,PAID_DATE,SERVICE_DATE,DRUG_NAME_GENERIC,DRUG_NAME_PREFERRED,RX_PAID_AMOUNT,RX_SCRIPT_COUNT,ART_DRUGGRPS_L1,ART_DRUGGRPS_L2,PAID_YEAR,SERVICE_YEAR
0,1,18180,2023-08-11,2023-05-27,atorvastatin,Lipitor,220.3,3,Cardiovascular Agents,Statins,2023,2023
1,2,18708,2022-11-11,2021-08-04,levothyroxine,Synthroid,216.74,3,Endocrine Agents,Thyroid Agents,2022,2021
2,3,11681,2024-04-19,2023-02-22,levothyroxine,Synthroid,127.33,3,Endocrine Agents,Thyroid Agents,2024,2023
3,4,16682,2021-11-07,2024-05-22,atorvastatin,Lipitor,192.27,1,Cardiovascular Agents,Statins,2021,2024
4,5,13071,2023-06-08,2024-11-20,atorvastatin,Lipitor,255.77,1,Cardiovascular Agents,Statins,2023,2024


In [5]:
member = """
WITH WEIGHTLOSSCOHORT1 AS (
    SELECT DISTINCT MEMBER_ID,
           YEAR(SERVICE_DATE) AS PRESCRIPTION_YEAR
    FROM Actuarial_AH.DBO.SN_Rx
WHERE ART_DRUGGRPS_L2 IN ('GLP-1 Agonists', 'Weight Loss Agents')
    AND SERVICE_DATE >= '2022-01-01'
    AND SERVICE_DATE < '2025-01-01'
),
WEIGHT_DX AS (
    SELECT DISTINCT MEMBER_ID
    FROM Actuarial_AH.DBO.SN_Medical
    WHERE ICD10_SECTION LIKE '%Overweight%'
)
SELECT M.MEMBER_ID,
       M.MEMBER_STATUS,
       M.HOME_STATE,
	   CASE 
    WHEN M.MEMBER_RELATIONSHIP = 0 THEN 'Subscriber'
    WHEN M.MEMBER_RELATIONSHIP = 1 THEN 'Spouse'
    WHEN M.MEMBER_RELATIONSHIP = 2 THEN 'Dependent'
    WHEN M.MEMBER_RELATIONSHIP = 3 THEN 'Domestic Partner'
    WHEN M.MEMBER_RELATIONSHIP = 4 THEN 'Unknown'
    ELSE 'Unknown' -- Optional: Handle unexpected values
END AS MEMBER_RELATIONSHIP,
CASE 
        WHEN M.GENDER = 0 THEN 'Female'
        WHEN M.GENDER = 1 THEN 'Male'
        WHEN M.GENDER = 2 THEN 'Unknown'
        ELSE 'Unknown' 
    END AS GENDER,
    DATEDIFF(YEAR, CONVERT(DATE, M.BIRTH_YEAR_MO + '-01'), GETDATE()) - 
    CASE 
        WHEN MONTH(CONVERT(DATE, M.BIRTH_YEAR_MO + '-01')) > MONTH(GETDATE()) 
             OR (MONTH(CONVERT(DATE, M.BIRTH_YEAR_MO + '-01')) = MONTH(GETDATE()) 
                 AND DAY(CONVERT(DATE, M.BIRTH_YEAR_MO + '-01')) > DAY(GETDATE())) 
        THEN 1 
        ELSE 0 
    END AS Age,
	CASE 
    WHEN M.BIRTH_YEAR >= 1946 AND M.BIRTH_YEAR < 1965 THEN 'Baby Boomers'
	WHEN M.BIRTH_YEAR >= 1965 AND M.BIRTH_YEAR < 1981 THEN 'Generation X'
	WHEN M.BIRTH_YEAR >= 1981 AND M.BIRTH_YEAR < 1997 THEN 'Millenials'
	WHEN M.BIRTH_YEAR >= 1997 AND M.BIRTH_YEAR < 2012 THEN 'Generation Z'
    ELSE 'Unknown' -- Optional: Handle unexpected values
END AS GENERATIONS
FROM [Actuarial_AH].[dbo].[SN_MEMBER] M
JOIN Actuarial_AH.dbo.SN_Medical MD ON M.MEMBER_ID = MD.MEMBER_ID
WHERE M.MEMBER_ID NOT IN (
    SELECT MEMBER_ID
    FROM WEIGHTLOSSCOHORT1
    GROUP BY MEMBER_ID
	)
AND M.MONTH_KEY = '2024-12-01' 
AND M.MEDICAL_ENROLLMENT_STATUS = 3
AND M.MEDICAL_CONT_ENRLMNT_START <= '2021-01-01'
AND MD.ICD10_SECTION LIKE '%Overweight%'

"""
member = pd.read_sql(member, conn)

  member = pd.read_sql(member, conn)


In [7]:
# Set seed for reproducibility
np.random.seed(42)

# Step 1: Simulate member IDs
all_member_ids = np.random.choice(range(10000, 20000), size=10000, replace=False)

# Simulate GLP-1 users (to exclude them)
glp1_members = np.random.choice(all_member_ids, size=4000, replace=False)
non_glp1_members = list(set(all_member_ids) - set(glp1_members))

# Select 5,000 members from non-GLP-1 users with overweight diagnosis
weight_dx_members = np.random.choice(non_glp1_members, size=5000, replace=False)

# Step 2: Simulate member-level fields
num_members = len(weight_dx_members)
member_status = np.random.choice([1, 2, 3], size=num_members)  # 1 = Active, 2 = COBRA, 3 = Retired
home_states = np.random.choice(['UT', 'CA', 'TX', 'NY', 'FL', 'WA', 'IL', 'CO', 'PA', 'NC'], size=num_members)
relationship_codes = np.random.choice([0, 1, 2, 3, 4], size=num_members)
gender_codes = np.random.choice([0, 1, 2], size=num_members)

# Simulate birth dates
birth_years = np.random.choice(range(1946, 2012), size=num_members)
birth_months = np.random.choice(range(1, 13), size=num_members)

# Step 3: Calculate age and generation
reference_date = datetime(2025, 6, 28)
ages = []
generations = []

for year, month in zip(birth_years, birth_months):
    birth_date = datetime(year, month, 1)
    age = reference_date.year - birth_date.year - ((reference_date.month, reference_date.day) < (birth_date.month, birth_date.day))
    ages.append(age)

    if 1946 <= year < 1965:
        generations.append('Baby Boomers')
    elif 1965 <= year < 1981:
        generations.append('Generation X')
    elif 1981 <= year < 1997:
        generations.append('Millenials')
    elif 1997 <= year < 2012:
        generations.append('Generation Z')
    else:
        generations.append('Unknown')

# Step 4: Map relationship and gender codes
relationship_map = {0: 'Subscriber', 1: 'Spouse', 2: 'Dependent', 3: 'Domestic Partner', 4: 'Unknown'}
gender_map = {0: 'Female', 1: 'Male', 2: 'Unknown'}

# Step 5: Build DataFrame
member = pd.DataFrame({
    'MEMBER_ID': weight_dx_members,
    'MEMBER_STATUS': member_status,
    'HOME_STATE': home_states,
    'MEMBER_RELATIONSHIP': [relationship_map[code] for code in relationship_codes],
    'GENDER': [gender_map[code] for code in gender_codes],
    'Age': ages,
    'GENERATIONS': generations
})

# Preview
member.head()

Unnamed: 0,MEMBER_ID,MEMBER_STATUS,HOME_STATE,MEMBER_RELATIONSHIP,GENDER,Age,GENERATIONS
0,12937,3,CO,Subscriber,Unknown,48,Generation X
1,10730,2,WA,Unknown,Male,33,Millenials
2,14992,3,NC,Domestic Partner,Unknown,19,Generation Z
3,11320,1,CA,Unknown,Female,38,Millenials
4,10558,3,IL,Subscriber,Unknown,30,Millenials


In [8]:
member

Unnamed: 0,MEMBER_ID,MEMBER_STATUS,HOME_STATE,MEMBER_RELATIONSHIP,GENDER,Age,GENERATIONS
0,12937,3,CO,Subscriber,Unknown,48,Generation X
1,10730,2,WA,Unknown,Male,33,Millenials
2,14992,3,NC,Domestic Partner,Unknown,19,Generation Z
3,11320,1,CA,Unknown,Female,38,Millenials
4,10558,3,IL,Subscriber,Unknown,30,Millenials
...,...,...,...,...,...,...,...
4995,18543,3,WA,Unknown,Female,13,Generation Z
4996,15523,2,IL,Unknown,Male,46,Generation X
4997,19037,3,NC,Spouse,Female,44,Generation X
4998,15819,3,FL,Spouse,Female,43,Millenials


In [11]:
med

Unnamed: 0,MEDICAL_CLAIM_ID,MEMBER_ID,PAID_DATE,PAID_YEAR,SERVICE_DATE,SERVICE_YEAR,MEDICAL_PAID_AMOUNT,IS_TELEMEDICINE,IS_ER_AVOIDABLE,PRIMARY_DX,...,ICD10_CATEGORY,ICD10_SECTION,MEG_EPISODE_DESCRIPTION,OP_SURG_INC,IS_PCP_VISIT,INCLUDED_SPECIALIST,ER_VISIT_FLAG,IS_URGENT_CARE_VISIT,IS_PREVENTIVE_VISIT,IP_ADMIT_INC
0,1,17942,2023-02-25,2023,2021-06-20,2021,38.27,1,1,Obesity,...,Z68,Overweight ICD10 Section,Preventive Care,0,1,0,0,0,0,1
1,2,19424,2021-07-28,2021,2021-12-03,2021,266.45,0,0,Obesity,...,E66,Morbid Obesity ICD10 Section,Weight Management,0,1,1,0,0,0,0
2,3,12422,2024-11-21,2024,2023-08-10,2023,252.80,1,0,Obesity,...,E66,Overweight ICD10 Section,Weight Management,0,0,1,0,1,1,0
3,4,15162,2024-05-04,2024,2021-08-22,2021,643.46,1,1,Obesity,...,E66,Overweight ICD10 Section,Weight Management,0,1,0,0,0,0,1
4,5,12272,2023-01-14,2023,2022-08-16,2022,300.92,0,1,Morbid Obesity,...,E66,Morbid Obesity ICD10 Section,Preventive Care,1,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49996,14600,2021-04-09,2021,2022-01-19,2022,352.43,0,1,Overweight,...,E66,Obesity ICD10 Section,Weight Management,0,0,1,1,1,1,1
49996,49997,14274,2021-05-16,2021,2022-03-11,2022,187.23,0,0,Obesity,...,E66,Morbid Obesity ICD10 Section,Preventive Care,0,1,0,1,1,0,0
49997,49998,12020,2023-04-03,2023,2022-08-23,2022,43.17,1,0,Obesity,...,E66,Morbid Obesity ICD10 Section,Weight Management,1,1,1,0,1,1,1
49998,49999,19577,2021-03-31,2021,2022-03-18,2022,41.91,0,0,Morbid Obesity,...,Z68,Morbid Obesity ICD10 Section,Weight Management,1,0,0,1,0,1,0


In [12]:
rx

Unnamed: 0,RX_CLAIM_ID,MEMBER_ID,PAID_DATE,SERVICE_DATE,DRUG_NAME_GENERIC,DRUG_NAME_PREFERRED,RX_PAID_AMOUNT,RX_SCRIPT_COUNT,ART_DRUGGRPS_L1,ART_DRUGGRPS_L2,PAID_YEAR,SERVICE_YEAR
0,1,18180,2023-08-11,2023-05-27,atorvastatin,Lipitor,220.30,3,Cardiovascular Agents,Statins,2023,2023
1,2,18708,2022-11-11,2021-08-04,levothyroxine,Synthroid,216.74,3,Endocrine Agents,Thyroid Agents,2022,2021
2,3,11681,2024-04-19,2023-02-22,levothyroxine,Synthroid,127.33,3,Endocrine Agents,Thyroid Agents,2024,2023
3,4,16682,2021-11-07,2024-05-22,atorvastatin,Lipitor,192.27,1,Cardiovascular Agents,Statins,2021,2024
4,5,13071,2023-06-08,2024-11-20,atorvastatin,Lipitor,255.77,1,Cardiovascular Agents,Statins,2023,2024
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49996,19825,2024-12-28,2024-03-08,atorvastatin,Lipitor,87.50,3,Cardiovascular Agents,Statins,2024,2024
49996,49997,14868,2021-03-18,2021-04-10,albuterol,Ventolin,66.12,3,Endocrine Agents,Beta-2 Agonists,2021,2021
49997,49998,14097,2022-11-26,2022-07-10,amlodipine,Norvasc,27.84,2,Cardiovascular Agents,Calcium Channel Blockers,2022,2022
49998,49999,12724,2023-08-02,2022-11-27,amlodipine,Norvasc,35.72,2,Cardiovascular Agents,Calcium Channel Blockers,2023,2022


In [13]:
rx_wla = rx[(rx['ART_DRUGGRPS_L2'] == 'Weight Loss Agents') | (rx['ART_DRUGGRPS_L1'] == 'GLP-1 Agonists')]
rx_wla.DRUG_NAME_PREFERRED.unique()

array([], dtype=object)

In [14]:
rx = rx.drop_duplicates()
rx

Unnamed: 0,RX_CLAIM_ID,MEMBER_ID,PAID_DATE,SERVICE_DATE,DRUG_NAME_GENERIC,DRUG_NAME_PREFERRED,RX_PAID_AMOUNT,RX_SCRIPT_COUNT,ART_DRUGGRPS_L1,ART_DRUGGRPS_L2,PAID_YEAR,SERVICE_YEAR
0,1,18180,2023-08-11,2023-05-27,atorvastatin,Lipitor,220.30,3,Cardiovascular Agents,Statins,2023,2023
1,2,18708,2022-11-11,2021-08-04,levothyroxine,Synthroid,216.74,3,Endocrine Agents,Thyroid Agents,2022,2021
2,3,11681,2024-04-19,2023-02-22,levothyroxine,Synthroid,127.33,3,Endocrine Agents,Thyroid Agents,2024,2023
3,4,16682,2021-11-07,2024-05-22,atorvastatin,Lipitor,192.27,1,Cardiovascular Agents,Statins,2021,2024
4,5,13071,2023-06-08,2024-11-20,atorvastatin,Lipitor,255.77,1,Cardiovascular Agents,Statins,2023,2024
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49996,19825,2024-12-28,2024-03-08,atorvastatin,Lipitor,87.50,3,Cardiovascular Agents,Statins,2024,2024
49996,49997,14868,2021-03-18,2021-04-10,albuterol,Ventolin,66.12,3,Endocrine Agents,Beta-2 Agonists,2021,2021
49997,49998,14097,2022-11-26,2022-07-10,amlodipine,Norvasc,27.84,2,Cardiovascular Agents,Calcium Channel Blockers,2022,2022
49998,49999,12724,2023-08-02,2022-11-27,amlodipine,Norvasc,35.72,2,Cardiovascular Agents,Calcium Channel Blockers,2023,2022


In [15]:
member = member.drop_duplicates()
member.reset_index(inplace=True)
member.MEMBER_ID.nunique()

5000

In [16]:
med.MEMBER_ID.nunique()

3435

In [17]:
rx.MEMBER_ID.nunique()

5000

In [18]:
# Filter to find members with the specified conditions
members_to_remove = med[(med['ICD10_CHAPTER'] == 'Pregnancy, childbirth and the puerperium') | (med['ICD10_SECTION'] == 'Malignant neoplasm of thyroid gland') |
(med['ICD10_CATEGORY'] == 'Acute pancreatitis') | (med['ICD10_CATEGORY'] == 'Acute kidney failure and chronic kidney disease')]['MEMBER_ID'].unique()
# Combine both lists of members to remove
all_members_to_remove = set(members_to_remove)

# Step 3: Remove those members from the original DataFrames
rx_members_remove = rx[~rx['MEMBER_ID'].isin(all_members_to_remove)]
med_members_remove = med[~med['MEMBER_ID'].isin(all_members_to_remove)]
members_remove = member[~member['MEMBER_ID'].isin(all_members_to_remove)]

In [19]:
med_members_remove.MEMBER_ID.nunique()

3435

In [20]:
rx_members_remove.MEMBER_ID.nunique()

5000

In [21]:
med_agg = med_members_remove.pivot_table(index='MEMBER_ID', columns='PAID_YEAR', values='MEDICAL_PAID_AMOUNT', aggfunc='sum')
med_agg.reset_index(inplace=True)
med_agg.columns.name = None
med_agg.columns = [f"{col}" if col != 'PAID_YEAR' else 'PAID_YEAR' for col in med_agg.columns]
med_agg.rename(columns={'2021': 'Med_2021', '2022': 'Med_2022', '2023': 'Med_2023', '2024': 'Med_2024'}, inplace=True)

In [22]:
med_agg1 = med_agg
med_agg1

Unnamed: 0,MEMBER_ID,Med_2021,Med_2022,Med_2023,Med_2024
0,10000,284.71,703.88,837.66,283.97
1,10001,543.18,192.09,541.78,147.28
2,10006,943.59,30.95,629.44,191.42
3,10010,786.29,205.40,1.81,572.26
4,10014,1542.92,641.45,979.80,425.24
...,...,...,...,...,...
3430,19992,399.66,237.57,15.27,773.78
3431,19993,1043.95,757.03,226.88,1418.49
3432,19994,721.29,286.80,708.32,17.07
3433,19997,1090.46,1257.39,293.89,124.83


In [23]:
# Define inflation rates (as factors)
inflation_factors = {
    2021: 1.175, 
    2022: 1.117,   
    2023: 1.069, 
    2024: 1
}

# Normalize employer paid amounts for inflation
for year, factor in inflation_factors.items():
    column_name = f'Med_{year}'
    med_agg[column_name] = med_agg[column_name] * factor
med_agg

Unnamed: 0,MEMBER_ID,Med_2021,Med_2022,Med_2023,Med_2024
0,10000,334.53425,786.23396,895.45854,283.97
1,10001,638.23650,214.56453,579.16282,147.28
2,10006,1108.71825,34.57115,672.87136,191.42
3,10010,923.89075,229.43180,1.93489,572.26
4,10014,1812.93100,716.49965,1047.40620,425.24
...,...,...,...,...,...
3430,19992,469.60050,265.36569,16.32363,773.78
3431,19993,1226.64125,845.60251,242.53472,1418.49
3432,19994,847.51575,320.35560,757.19408,17.07
3433,19997,1281.29050,1404.50463,314.16841,124.83


In [24]:
med_agg.fillna(0, inplace=True)
med_agg.isnull().sum()

MEMBER_ID    0
Med_2021     0
Med_2022     0
Med_2023     0
Med_2024     0
dtype: int64

In [25]:
med_agg

Unnamed: 0,MEMBER_ID,Med_2021,Med_2022,Med_2023,Med_2024
0,10000,334.53425,786.23396,895.45854,283.97
1,10001,638.23650,214.56453,579.16282,147.28
2,10006,1108.71825,34.57115,672.87136,191.42
3,10010,923.89075,229.43180,1.93489,572.26
4,10014,1812.93100,716.49965,1047.40620,425.24
...,...,...,...,...,...
3430,19992,469.60050,265.36569,16.32363,773.78
3431,19993,1226.64125,845.60251,242.53472,1418.49
3432,19994,847.51575,320.35560,757.19408,17.07
3433,19997,1281.29050,1404.50463,314.16841,124.83


In [26]:
med_agg['Med_Delta'] = med_agg['Med_2024'] - med_agg['Med_2021']
med_agg['Med_%_Change'] =(( med_agg['Med_2024'] - med_agg['Med_2021'])/ med_agg['Med_2021']) * 100
med_agg

Unnamed: 0,MEMBER_ID,Med_2021,Med_2022,Med_2023,Med_2024,Med_Delta,Med_%_Change
0,10000,334.53425,786.23396,895.45854,283.97,-50.56425,-15.114820
1,10001,638.23650,214.56453,579.16282,147.28,-490.95650,-76.923915
2,10006,1108.71825,34.57115,672.87136,191.42,-917.29825,-82.735019
3,10010,923.89075,229.43180,1.93489,572.26,-351.63075,-38.059776
4,10014,1812.93100,716.49965,1047.40620,425.24,-1387.69100,-76.544060
...,...,...,...,...,...,...,...
3430,19992,469.60050,265.36569,16.32363,773.78,304.17950,64.774101
3431,19993,1226.64125,845.60251,242.53472,1418.49,191.84875,15.640168
3432,19994,847.51575,320.35560,757.19408,17.07,-830.44575,-97.985878
3433,19997,1281.29050,1404.50463,314.16841,124.83,-1156.46050,-90.257479


In [27]:
med_agg.fillna(0, inplace=True)
med_agg.isnull().sum()

MEMBER_ID       0
Med_2021        0
Med_2022        0
Med_2023        0
Med_2024        0
Med_Delta       0
Med_%_Change    0
dtype: int64

In [28]:
import numpy as np

# Prepare the years and medical costs
years = np.array([2021, 2022, 2023, 2024])
slope_list = []

# Calculate the slope for each member
for index, row in med_agg.iterrows():
    costs = np.array([row['Med_2021'], row['Med_2022'], row['Med_2023'], row['Med_2024']])
    # Perform linear regression to get the slope
    slope, intercept = np.polyfit(years, costs, 1)  # 1 indicates linear fit
    slope_list.append(slope)

# Add the slope to the DataFrame
med_agg['Med_Slope'] = slope_list
med_agg

Unnamed: 0,MEMBER_ID,Med_2021,Med_2022,Med_2023,Med_2024,Med_Delta,Med_%_Change,Med_Slope
0,10000,334.53425,786.23396,895.45854,283.97,-50.56425,-15.114820,-4.246817
1,10001,638.23650,214.56453,579.16282,147.28,-490.95650,-76.923915,-110.827121
2,10006,1108.71825,34.57115,672.87136,191.42,-917.29825,-82.735019,-211.359454
3,10010,923.89075,229.43180,1.93489,572.26,-351.63075,-38.059776,-128.238916
4,10014,1812.93100,716.49965,1047.40620,425.24,-1387.69100,-76.544060,-383.216645
...,...,...,...,...,...,...,...,...
3430,19992,469.60050,265.36569,16.32363,773.78,304.17950,64.774101,66.349644
3431,19993,1226.64125,845.60251,242.53472,1418.49,191.84875,15.640168,-2.752154
3432,19994,847.51575,320.35560,757.19408,17.07,-830.44575,-97.985878,-205.449877
3433,19997,1281.29050,1404.50463,314.16841,124.83,-1156.46050,-90.257479,-455.971772


In [29]:
rx_agg = rx_members_remove.pivot_table(index='MEMBER_ID', columns='PAID_YEAR', values='RX_PAID_AMOUNT', aggfunc='sum')
rx_agg.reset_index(inplace=True)
rx_agg.columns.name = None
rx_agg.columns = [f"{col}" if col != 'PAID_YEAR' else 'PAID_YEAR' for col in rx_agg.columns]
rx_agg.rename(columns={'2021': 'Rx_2021', '2022': 'Rx_2022', '2023': 'Rx_2023', '2024': 'Rx_2024'}, inplace=True)
rx_agg1 = rx_agg
rx_agg1

Unnamed: 0,MEMBER_ID,Rx_2021,Rx_2022,Rx_2023,Rx_2024
0,10000,143.58,239.78,270.94,47.97
1,10003,95.34,128.73,,88.72
2,10008,,92.11,91.74,4.95
3,10010,1115.63,426.95,155.21,560.29
4,10012,11.99,340.47,505.60,212.01
...,...,...,...,...,...
4995,19985,44.84,590.73,110.35,187.69
4996,19989,625.66,452.91,234.43,
4997,19993,241.74,573.51,125.41,137.26
4998,19996,1150.77,43.40,,17.43


In [30]:
# Define inflation rates (as factors)
rx_inflation_factors = {
    2021: 1.31, 
    2022: 1.20,   
    2023: 1.103, 
    2024: 1
}

# Normalize employer paid amounts for inflation
for year, factor in rx_inflation_factors.items():
    column_name = f'Rx_{year}'
    rx_agg[column_name] = rx_agg[column_name] * factor
rx_agg

Unnamed: 0,MEMBER_ID,Rx_2021,Rx_2022,Rx_2023,Rx_2024
0,10000,188.0898,287.736,298.84682,47.97
1,10003,124.8954,154.476,,88.72
2,10008,,110.532,101.18922,4.95
3,10010,1461.4753,512.340,171.19663,560.29
4,10012,15.7069,408.564,557.67680,212.01
...,...,...,...,...,...
4995,19985,58.7404,708.876,121.71605,187.69
4996,19989,819.6146,543.492,258.57629,
4997,19993,316.6794,688.212,138.32723,137.26
4998,19996,1507.5087,52.080,,17.43


In [31]:
rx_agg.fillna(0, inplace=True)
rx_agg.isnull().sum()

MEMBER_ID    0
Rx_2021      0
Rx_2022      0
Rx_2023      0
Rx_2024      0
dtype: int64

In [32]:
rx_agg['Rx_Delta'] = rx_agg['Rx_2024'] - rx_agg['Rx_2021']
rx_agg['Rx_%_Change'] =((rx_agg['Rx_2024'] - rx_agg['Rx_2021'])/ rx_agg['Rx_2021']) * 100
# Prepare the years and medical costs
years = np.array([2021, 2022, 2023, 2024])
slope_list = []

# Calculate the slope for each member
for index, row in rx_agg.iterrows():
    costs = np.array([row['Rx_2021'], row['Rx_2022'], row['Rx_2023'], row['Rx_2024']])
    # Perform linear regression to get the slope
    slope, intercept = np.polyfit(years, costs, 1)  # 1 indicates linear fit
    slope_list.append(slope)

# Add the slope to the DataFrame
rx_agg['Rx_Slope'] = slope_list
rx_agg

Unnamed: 0,MEMBER_ID,Rx_2021,Rx_2022,Rx_2023,Rx_2024,Rx_Delta,Rx_%_Change,Rx_Slope
0,10000,188.0898,287.736,298.84682,47.97,-140.1198,-74.496225,-40.924858
1,10003,124.8954,154.476,0.00000,88.72,-36.1754,-28.964558,-26.300220
2,10008,0.0000,110.532,101.18922,4.95,4.9500,inf,0.550722
3,10010,1461.4753,512.340,171.19663,560.29,-901.1853,-61.662712,-304.469927
4,10012,15.7069,408.564,557.67680,212.01,196.3031,1249.788946,73.802210
...,...,...,...,...,...,...,...,...
4995,19985,58.7404,708.876,121.71605,187.69,128.9496,219.524552,-20.031115
4996,19989,819.6146,543.492,258.57629,0.00,-819.6146,-100.000000,-274.375951
4997,19993,316.6794,688.212,138.32723,137.26,-179.4194,-56.656480,-108.814297
4998,19996,1507.5087,52.080,0.00000,17.43,-1490.0787,-98.843788,-452.231610


In [33]:
pd.set_option('display.float_format', '{:.2f}'.format)

In [34]:
rx_agg.replace(np.inf, 0, inplace=True)
rx_agg

Unnamed: 0,MEMBER_ID,Rx_2021,Rx_2022,Rx_2023,Rx_2024,Rx_Delta,Rx_%_Change,Rx_Slope
0,10000,188.09,287.74,298.85,47.97,-140.12,-74.50,-40.92
1,10003,124.90,154.48,0.00,88.72,-36.18,-28.96,-26.30
2,10008,0.00,110.53,101.19,4.95,4.95,0.00,0.55
3,10010,1461.48,512.34,171.20,560.29,-901.19,-61.66,-304.47
4,10012,15.71,408.56,557.68,212.01,196.30,1249.79,73.80
...,...,...,...,...,...,...,...,...
4995,19985,58.74,708.88,121.72,187.69,128.95,219.52,-20.03
4996,19989,819.61,543.49,258.58,0.00,-819.61,-100.00,-274.38
4997,19993,316.68,688.21,138.33,137.26,-179.42,-56.66,-108.81
4998,19996,1507.51,52.08,0.00,17.43,-1490.08,-98.84,-452.23


In [35]:
member_mrg = members_remove.merge(med_agg, on='MEMBER_ID')
member_mrg = member_mrg.merge(rx_agg, on='MEMBER_ID')                                     
member_mrg       

Unnamed: 0,index,MEMBER_ID,MEMBER_STATUS,HOME_STATE,MEMBER_RELATIONSHIP,GENDER,Age,GENERATIONS,Med_2021,Med_2022,...,Med_Delta,Med_%_Change,Med_Slope,Rx_2021,Rx_2022,Rx_2023,Rx_2024,Rx_Delta,Rx_%_Change,Rx_Slope
0,7,15467,1,UT,Unknown,Female,58,Generation X,288.22,272.79,...,844.36,292.96,269.75,234.91,0.00,6.67,261.03,26.12,11.12,8.50
1,14,15527,3,UT,Unknown,Male,17,Generation Z,305.29,349.14,...,136.10,44.58,31.06,1331.68,142.02,544.36,740.90,-590.78,-44.36,-137.00
2,25,19284,3,TX,Dependent,Female,47,Generation X,852.11,1802.32,...,-566.95,-66.53,-334.11,431.63,27.78,493.82,444.28,12.65,2.93,50.40
3,38,15766,1,FL,Unknown,Unknown,60,Baby Boomers,1188.03,1370.78,...,-1169.12,-98.41,-424.74,88.87,619.27,150.53,654.75,565.88,636.75,122.89
4,39,16010,3,CA,Spouse,Male,22,Generation Z,278.05,151.04,...,-176.72,-63.56,35.37,513.40,881.88,228.73,890.72,377.32,73.49,47.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1193,4991,18092,3,WA,Spouse,Unknown,46,Generation X,416.41,677.83,...,-385.08,-92.48,-179.90,811.49,78.19,222.44,176.16,-635.33,-78.29,-176.17
1194,4993,17695,3,PA,Domestic Partner,Female,63,Baby Boomers,96.00,283.58,...,-49.35,-51.40,70.46,481.14,610.01,388.10,25.95,-455.19,-94.61,-158.75
1195,4994,14113,1,IL,Domestic Partner,Male,67,Baby Boomers,712.56,343.40,...,-37.84,-5.31,259.52,336.13,210.47,82.04,687.71,351.58,104.59,92.63
1196,4998,15819,3,FL,Spouse,Female,43,Millenials,1020.26,723.38,...,-947.08,-92.83,-232.23,37.14,298.44,120.91,355.94,318.80,858.41,77.89


In [36]:
member1 = member_mrg.drop(columns=['MEMBER_STATUS'])
member1

Unnamed: 0,index,MEMBER_ID,HOME_STATE,MEMBER_RELATIONSHIP,GENDER,Age,GENERATIONS,Med_2021,Med_2022,Med_2023,...,Med_Delta,Med_%_Change,Med_Slope,Rx_2021,Rx_2022,Rx_2023,Rx_2024,Rx_Delta,Rx_%_Change,Rx_Slope
0,7,15467,UT,Unknown,Female,58,Generation X,288.22,272.79,437.21,...,844.36,292.96,269.75,234.91,0.00,6.67,261.03,26.12,11.12,8.50
1,14,15527,UT,Unknown,Male,17,Generation Z,305.29,349.14,251.44,...,136.10,44.58,31.06,1331.68,142.02,544.36,740.90,-590.78,-44.36,-137.00
2,25,19284,TX,Dependent,Female,47,Generation X,852.11,1802.32,162.03,...,-566.95,-66.53,-334.11,431.63,27.78,493.82,444.28,12.65,2.93,50.40
3,38,15766,FL,Unknown,Unknown,60,Baby Boomers,1188.03,1370.78,630.70,...,-1169.12,-98.41,-424.74,88.87,619.27,150.53,654.75,565.88,636.75,122.89
4,39,16010,CA,Spouse,Male,22,Generation Z,278.05,151.04,1034.92,...,-176.72,-63.56,35.37,513.40,881.88,228.73,890.72,377.32,73.49,47.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1193,4991,18092,WA,Spouse,Unknown,46,Generation X,416.41,677.83,34.08,...,-385.08,-92.48,-179.90,811.49,78.19,222.44,176.16,-635.33,-78.29,-176.17
1194,4993,17695,PA,Domestic Partner,Female,63,Baby Boomers,96.00,283.58,1136.25,...,-49.35,-51.40,70.46,481.14,610.01,388.10,25.95,-455.19,-94.61,-158.75
1195,4994,14113,IL,Domestic Partner,Male,67,Baby Boomers,712.56,343.40,3052.09,...,-37.84,-5.31,259.52,336.13,210.47,82.04,687.71,351.58,104.59,92.63
1196,4998,15819,FL,Spouse,Female,43,Millenials,1020.26,723.38,1242.37,...,-947.08,-92.83,-232.23,37.14,298.44,120.91,355.94,318.80,858.41,77.89


In [37]:
med_util = med.groupby('MEMBER_ID').agg({
    'DX_IS_CHRONIC': 'max',
    'ER_VISIT_FLAG': 'sum',
    'IS_PCP_VISIT': 'sum',
    'IS_URGENT_CARE_VISIT': 'sum',
    'IS_PREVENTIVE_VISIT': 'sum',
    'IP_ADMIT_INC': 'sum'
}).reset_index()
med_util

Unnamed: 0,MEMBER_ID,DX_IS_CHRONIC,ER_VISIT_FLAG,IS_PCP_VISIT,IS_URGENT_CARE_VISIT,IS_PREVENTIVE_VISIT,IP_ADMIT_INC
0,10000,1,7,5,7,8,5
1,10001,1,4,7,6,3,6
2,10006,1,7,8,6,7,6
3,10010,1,2,3,4,2,5
4,10014,1,10,9,12,7,7
...,...,...,...,...,...,...,...
3430,19992,1,4,5,3,4,4
3431,19993,1,8,6,2,8,6
3432,19994,1,6,6,5,6,3
3433,19997,1,9,9,11,8,7


In [38]:
merge_member = member1.merge(med_util, on='MEMBER_ID')
merge_member

Unnamed: 0,index,MEMBER_ID,HOME_STATE,MEMBER_RELATIONSHIP,GENDER,Age,GENERATIONS,Med_2021,Med_2022,Med_2023,...,Rx_2024,Rx_Delta,Rx_%_Change,Rx_Slope,DX_IS_CHRONIC,ER_VISIT_FLAG,IS_PCP_VISIT,IS_URGENT_CARE_VISIT,IS_PREVENTIVE_VISIT,IP_ADMIT_INC
0,7,15467,UT,Unknown,Female,58,Generation X,288.22,272.79,437.21,...,261.03,26.12,11.12,8.50,1,6,9,7,8,5
1,14,15527,UT,Unknown,Male,17,Generation Z,305.29,349.14,251.44,...,740.90,-590.78,-44.36,-137.00,1,6,6,5,4,4
2,25,19284,TX,Dependent,Female,47,Generation X,852.11,1802.32,162.03,...,444.28,12.65,2.93,50.40,1,10,5,10,8,12
3,38,15766,FL,Unknown,Unknown,60,Baby Boomers,1188.03,1370.78,630.70,...,654.75,565.88,636.75,122.89,1,6,6,8,5,5
4,39,16010,CA,Spouse,Male,22,Generation Z,278.05,151.04,1034.92,...,890.72,377.32,73.49,47.88,1,3,3,4,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1193,4991,18092,WA,Spouse,Unknown,46,Generation X,416.41,677.83,34.08,...,176.16,-635.33,-78.29,-176.17,1,4,3,1,4,2
1194,4993,17695,PA,Domestic Partner,Female,63,Baby Boomers,96.00,283.58,1136.25,...,25.95,-455.19,-94.61,-158.75,1,2,7,7,7,5
1195,4994,14113,IL,Domestic Partner,Male,67,Baby Boomers,712.56,343.40,3052.09,...,687.71,351.58,104.59,92.63,1,7,8,10,11,13
1196,4998,15819,FL,Spouse,Female,43,Millenials,1020.26,723.38,1242.37,...,355.94,318.80,858.41,77.89,1,8,9,11,7,11


In [39]:
merge_member.drop(columns='index', inplace=True)

In [40]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)
merge_member

Unnamed: 0,MEMBER_ID,HOME_STATE,MEMBER_RELATIONSHIP,GENDER,Age,GENERATIONS,Med_2021,Med_2022,Med_2023,Med_2024,Med_Delta,Med_%_Change,Med_Slope,Rx_2021,Rx_2022,Rx_2023,Rx_2024,Rx_Delta,Rx_%_Change,Rx_Slope,DX_IS_CHRONIC,ER_VISIT_FLAG,IS_PCP_VISIT,IS_URGENT_CARE_VISIT,IS_PREVENTIVE_VISIT,IP_ADMIT_INC
0,15467,UT,Unknown,Female,58,Generation X,288.22,272.79,437.21,1132.58,844.36,292.96,269.75,234.91,0.00,6.67,261.03,26.12,11.12,8.50,1,6,9,7,8,5
1,15527,UT,Unknown,Male,17,Generation Z,305.29,349.14,251.44,441.39,136.10,44.58,31.06,1331.68,142.02,544.36,740.90,-590.78,-44.36,-137.00,1,6,6,5,4,4
2,19284,TX,Dependent,Female,47,Generation X,852.11,1802.32,162.03,285.16,-566.95,-66.53,-334.11,431.63,27.78,493.82,444.28,12.65,2.93,50.40,1,10,5,10,8,12
3,15766,FL,Unknown,Unknown,60,Baby Boomers,1188.03,1370.78,630.70,18.91,-1169.12,-98.41,-424.74,88.87,619.27,150.53,654.75,565.88,636.75,122.89,1,6,6,8,5,5
4,16010,CA,Spouse,Male,22,Generation Z,278.05,151.04,1034.92,101.33,-176.72,-63.56,35.37,513.40,881.88,228.73,890.72,377.32,73.49,47.88,1,3,3,4,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1193,18092,WA,Spouse,Unknown,46,Generation X,416.41,677.83,34.08,31.33,-385.08,-92.48,-179.90,811.49,78.19,222.44,176.16,-635.33,-78.29,-176.17,1,4,3,1,4,2
1194,17695,PA,Domestic Partner,Female,63,Baby Boomers,96.00,283.58,1136.25,46.65,-49.35,-51.40,70.46,481.14,610.01,388.10,25.95,-455.19,-94.61,-158.75,1,2,7,7,7,5
1195,14113,IL,Domestic Partner,Male,67,Baby Boomers,712.56,343.40,3052.09,674.72,-37.84,-5.31,259.52,336.13,210.47,82.04,687.71,351.58,104.59,92.63,1,7,8,10,11,13
1196,15819,FL,Spouse,Female,43,Millenials,1020.26,723.38,1242.37,73.18,-947.08,-92.83,-232.23,37.14,298.44,120.91,355.94,318.80,858.41,77.89,1,8,9,11,7,11


In [41]:
episodes = med.ICD10_CHAPTER.value_counts()
episodes.head(10)

ICD10_CHAPTER
Endocrine    25110
Digestive    24890
Name: count, dtype: int64

In [42]:
episodes_cost = med.groupby('ICD10_CHAPTER')['MEDICAL_PAID_AMOUNT'].sum().reset_index()
episodes_cost.sort_values(by='MEDICAL_PAID_AMOUNT', ascending=False)

Unnamed: 0,ICD10_CHAPTER,MEDICAL_PAID_AMOUNT
1,Endocrine,5019860.24
0,Digestive,4947705.58


In [43]:
episodes_to_keep = ['Diseases of the musculoskeletal system and connective tissue', 'Neoplasms', 'Diseases of the digestive system', 
                    'Diseases of the circulatory system', 'Diseases of the circulatory system', 'Diseases of the genitourinary system',
                   'Diseases of the nervous system', 'Mental, Behavioral and Neurodevelopmental disorders']
med1 = med
med1['ICD10_CHAPTER'] = med['ICD10_CHAPTER'].apply(lambda x: x if x in episodes_to_keep else 'OTHER')

In [44]:
med_meg = pd.pivot_table(
    med1,
    values='MEDICAL_PAID_AMOUNT',
    index='MEMBER_ID',
    columns='ICD10_CHAPTER',
    aggfunc='sum',
    fill_value=0
)
med_meg.reset_index()
med_meg.columns.name = None
med_meg.drop(columns='OTHER', inplace=True)
med_meg

10000
10001
10006
10010
10014
...
19992
19993
19994
19997
19999


In [45]:
member_merge_cond = merge_member.merge(med_meg, on='MEMBER_ID')
member_merge_cond

Unnamed: 0,MEMBER_ID,HOME_STATE,MEMBER_RELATIONSHIP,GENDER,Age,GENERATIONS,Med_2021,Med_2022,Med_2023,Med_2024,Med_Delta,Med_%_Change,Med_Slope,Rx_2021,Rx_2022,Rx_2023,Rx_2024,Rx_Delta,Rx_%_Change,Rx_Slope,DX_IS_CHRONIC,ER_VISIT_FLAG,IS_PCP_VISIT,IS_URGENT_CARE_VISIT,IS_PREVENTIVE_VISIT,IP_ADMIT_INC
0,15467,UT,Unknown,Female,58,Generation X,288.22,272.79,437.21,1132.58,844.36,292.96,269.75,234.91,0.00,6.67,261.03,26.12,11.12,8.50,1,6,9,7,8,5
1,15527,UT,Unknown,Male,17,Generation Z,305.29,349.14,251.44,441.39,136.10,44.58,31.06,1331.68,142.02,544.36,740.90,-590.78,-44.36,-137.00,1,6,6,5,4,4
2,19284,TX,Dependent,Female,47,Generation X,852.11,1802.32,162.03,285.16,-566.95,-66.53,-334.11,431.63,27.78,493.82,444.28,12.65,2.93,50.40,1,10,5,10,8,12
3,15766,FL,Unknown,Unknown,60,Baby Boomers,1188.03,1370.78,630.70,18.91,-1169.12,-98.41,-424.74,88.87,619.27,150.53,654.75,565.88,636.75,122.89,1,6,6,8,5,5
4,16010,CA,Spouse,Male,22,Generation Z,278.05,151.04,1034.92,101.33,-176.72,-63.56,35.37,513.40,881.88,228.73,890.72,377.32,73.49,47.88,1,3,3,4,2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1193,18092,WA,Spouse,Unknown,46,Generation X,416.41,677.83,34.08,31.33,-385.08,-92.48,-179.90,811.49,78.19,222.44,176.16,-635.33,-78.29,-176.17,1,4,3,1,4,2
1194,17695,PA,Domestic Partner,Female,63,Baby Boomers,96.00,283.58,1136.25,46.65,-49.35,-51.40,70.46,481.14,610.01,388.10,25.95,-455.19,-94.61,-158.75,1,2,7,7,7,5
1195,14113,IL,Domestic Partner,Male,67,Baby Boomers,712.56,343.40,3052.09,674.72,-37.84,-5.31,259.52,336.13,210.47,82.04,687.71,351.58,104.59,92.63,1,7,8,10,11,13
1196,15819,FL,Spouse,Female,43,Millenials,1020.26,723.38,1242.37,73.18,-947.08,-92.83,-232.23,37.14,298.44,120.91,355.94,318.80,858.41,77.89,1,8,9,11,7,11


In [46]:
member_merge_cond.describe()

  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,MEMBER_ID,Age,Med_2021,Med_2022,Med_2023,Med_2024,Med_Delta,Med_%_Change,Med_Slope,Rx_2021,Rx_2022,Rx_2023,Rx_2024,Rx_Delta,Rx_%_Change,Rx_Slope,DX_IS_CHRONIC,ER_VISIT_FLAG,IS_PCP_VISIT,IS_URGENT_CARE_VISIT,IS_PREVENTIVE_VISIT,IP_ADMIT_INC
count,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0,1192.0,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0,1198.0
mean,14905.07,46.71,798.03,737.19,753.06,677.94,-120.09,inf,-34.44,346.07,325.78,283.13,259.31,-86.75,439.73,-30.29,1.0,6.83,6.68,6.84,6.77,6.82
std,2926.04,19.29,718.98,687.98,641.61,633.89,830.51,,264.89,301.89,285.92,254.99,226.13,373.69,9177.51,119.25,0.06,4.11,4.17,4.08,4.22,4.21
min,10010.0,13.0,0.0,0.0,0.0,0.0,-4640.75,-100.0,-1406.45,0.0,0.0,0.0,0.0,-1677.87,-100.0,-596.46,0.0,0.0,0.0,0.0,0.0,0.0
25%,12371.0,30.0,269.16,221.61,251.84,212.07,-560.95,-68.64,-182.32,120.19,122.22,88.42,80.97,-293.11,-73.42,-99.51,1.0,4.0,4.0,4.0,4.0,4.0
50%,14858.0,47.0,635.6,548.33,606.92,509.71,-101.48,-18.2,-26.69,281.22,258.56,216.98,210.06,-67.56,-28.25,-27.92,1.0,6.0,6.0,6.0,6.0,6.0
75%,17439.0,64.0,1122.85,1077.48,1086.92,971.71,354.2,106.09,123.48,498.99,458.47,412.11,374.65,128.61,40.65,42.46,1.0,9.0,8.0,9.0,9.0,9.0
max,19993.0,79.0,5760.27,6682.68,3872.53,3880.77,3306.46,inf,959.32,1868.45,1863.5,2004.21,1439.5,1395.79,306777.5,403.9,1.0,26.0,31.0,34.0,34.0,35.0


In [47]:
drugs = rx_members_remove.groupby(['ART_DRUGGRPS_L1', 'ART_DRUGGRPS_L2', 'DRUG_NAME_PREFERRED'])['RX_PAID_AMOUNT'].sum().reset_index()
drugs.sort_values(by='RX_PAID_AMOUNT', ascending=False, inplace=True)
drugs

Unnamed: 0,ART_DRUGGRPS_L1,ART_DRUGGRPS_L2,DRUG_NAME_PREFERRED,RX_PAID_AMOUNT
1,Cardiovascular Agents,Calcium Channel Blockers,Norvasc,851001.34
4,Endocrine Agents,Biguanides,Glucophage,844743.63
3,Endocrine Agents,Beta-2 Agonists,Ventolin,839552.9
5,Endocrine Agents,Thyroid Agents,Synthroid,830761.44
2,Cardiovascular Agents,Statins,Lipitor,826170.5
0,Cardiovascular Agents,ACE Inhibitors,Prinivil,826107.81


In [48]:
member_merge_cond.to_csv('no-glp1-members-nofilter.csv', index=False)

In [71]:
controlled_members = pd.read_csv('matched_pairs_controlled_members_pdc_nofilter.csv')
controlled_members = controlled_members['MEMBER_ID'].unique()

In [72]:
rx_controlled = rx[rx['MEMBER_ID'].isin(controlled_members)]
rx_controlled

Unnamed: 0,RX_CLAIM_ID,MEMBER_ID,PAID_DATE,SERVICE_DATE,DRUG_NAME_GENERIC,DRUG_NAME_PREFERRED,RX_PAID_AMOUNT,RX_SCRIPT_COUNT,ART_DRUGGRPS_L1,ART_DRUGGRPS_L2,PAID_YEAR,SERVICE_YEAR
1809,5780005.00,mma-10a643cceb7d41d098d15cc1bd9de839,2021-01-01,2020-12-29,Amoxicillin,Amoxicillin,1.46,30.00,Antibiotics,Penicillins,2021,2020
3652,5084.00,mma-3dc4837edfd543bfa723fba865f15cd4,2021-01-02,2021-01-02,Metoprolol Succinate ER,Metoprolol Succinate ER,4.50,90.00,Cardiac,Beta-Blockers,2021,2021
4808,8060244.00,mma-a16c1182f8334534971a19466b7a9f5d,2021-01-03,2021-01-03,lamoTRIgine,lamoTRIgine,0.00,60.00,CNS Agents,Anticonvulsants,2021,2021
5113,14846887.00,mma-7218eaf8d5a64a23a728ea7a41c1a2a3,2021-01-04,2021-01-04,Amitriptyline HCl,Amitriptyline HCl,4.52,60.00,CNS Agents,Antidepressatns - Tricyclics,2021,2021
5368,11667844.00,mma-2296c999ef8d4d2f816ccffd7e47d282,2021-01-04,2021-01-04,Irbesartan-hydroCHLOROthiazide,Irbesartan-hydroCHLOROthiazide,2.74,30.00,Cardiac,Antihypertensive Combinations,2021,2021
...,...,...,...,...,...,...,...,...,...,...,...,...
13478233,1510887.00,mma-9869404381fb4cb9aff7b88ce339c8ad,2024-12-31,2024-12-31,Acyclovir,Acyclovir,61.30,15.00,Topical Agents,Topical Antivirals,2024,2024
13479536,9948447.00,mma-d18899fda1c140f2b4492a936b1ffe46,2024-12-31,2024-12-27,Omeprazole,Omeprazole,0.00,0.00,Gastrointestinal,Proton Pum Inhibitors,2024,2024
13480123,4822292.00,mma-e6a2d4b53c6e425e9c7e88e0286054e1,2024-12-31,2024-12-31,Losartan Potassium-HCTZ,Losartan Potassium-HCTZ,0.00,90.00,Cardiac,Antihypertensive Combinations,2024,2024
13480491,11093707.00,mma-94842869961f4b6083b7decae408ce89,2024-12-31,2024-12-30,Topiramate,Topiramate,0.00,90.00,CNS Agents,Anticonvulsants,2024,2024


In [73]:
med_controlled = med[med['MEMBER_ID'].isin(controlled_members)]
med_controlled

Unnamed: 0,MEDICAL_CLAIM_ID,MEMBER_ID,PAID_DATE,PAID_YEAR,SERVICE_DATE,SERVICE_YEAR,MEDICAL_PAID_AMOUNT,IS_TELEMEDICINE,IS_ER_AVOIDABLE,PRIMARY_DX,PROCEDURE_DESC,ARTTOS_V2_L1,ARTTOS_V2_L3,DX_IS_CHRONIC,ICD10_CHAPTER,ICD10_CATEGORY,ICD10_SECTION,MEG_EPISODE_DESCRIPTION,OP_SURG_INC,IS_PCP_VISIT,INCLUDED_SPECIALIST,ER_VISIT_FLAG,IS_URGENT_CARE_VISIT,IS_PREVENTIVE_VISIT,IP_ADMIT_INC
13,3387484.00,mma-0966bfa11a214aedb7cf8078a20063fe,2021-01-01,2021,2020-12-14,2020,32.75,0.00,0.00,Shortness of breath,"Maximum breathing capacity, maximal voluntary ...",Professional,Pulmonology (P37g),0.00,OTHER,Symptoms and signs involving the circulatory a...,Abnormalities of breathing,"Asthma, Chronic Maintenance",0,0,0,0,0,0,0
30,15042239.00,mma-6cb6ef7f2a2048c7a5541746d50ff404,2021-01-01,2021,2020-12-21,2020,87.54,0.00,0.00,"Malignant neoplasm of colon, unspecified",CT of thorax with contrast (CPT 71260),Professional,CT/MRI/PET - CT Scan (P57a),1.00,Neoplasms,Malignant neoplasms of digestive organs,Malignant neoplasm of colon,"Neoplasm, Malignant: Colon and Rectum",0,0,0,0,0,0,0
79,714149.00,mma-bf60fda6efb7468aabbd721da1e4908b,2021-01-01,2021,2020-12-20,2020,100.01,0.00,0.00,Contact with and (suspected) exposure to other...,Cov-19 amp prb hgh thruput (HCPCS U0003),Professional,Office - Independent Lab (P63c),0.00,OTHER,Persons with potential health hazards related ...,Contact with and (suspected) exposure to commu...,Other Viral Infections,0,0,0,0,0,0,0
135,12933387.00,mma-90214768eea44255bda9d17940e6d98b,2021-01-01,2021,2020-12-23,2020,113.85,1.00,0.00,Malignant neoplasm of upper-outer quadrant of ...,Established patient office or other outpatient...,Outpatient,Clinic (O41d),1.00,Neoplasms,Malignant neoplasms of breast,Malignant neoplasm of breast,"Neoplasm, Malignant: Breast, Female",0,0,0,0,0,0,0
193,6035843.00,mma-4642427b19124ac9a86da9edd0dad466,2021-01-01,2021,2020-12-17,2020,1077.33,0.00,0.00,Dermatochalasis of left upper eyelid,Removal of excessive skin and fat of upper eye...,Professional,Outpatient Surgery (P14),0.00,OTHER,"Disorders of eyelid, lacrimal system and orbit",Other disorders of eyelid,Ptosis of Upper Lid,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3876329,1831542.00,mma-ba30c078040b48aaa2508472ea553f0f,2024-12-20,2024,2024-12-11,2024,151.21,1.00,0.00,Encounter for immunization,Established patient office or other outpatient...,Professional,PCP (P32c),0.00,OTHER,Persons with potential health hazards related ...,Encounter for immunization,Encounter for Preventive Health Services,0,1,0,0,0,0,0
3876443,1831542.00,mma-ba30c078040b48aaa2508472ea553f0f,2024-12-20,2024,2024-12-11,2024,54.90,0.00,0.00,Encounter for immunization,"Tetanus, diphtheria toxoids and acellular pert...",Professional,Preventive Immunizations (P41),0.00,OTHER,Persons with potential health hazards related ...,Encounter for immunization,Encounter for Preventive Health Services,0,0,0,0,0,0,0
3876581,1831542.00,mma-ba30c078040b48aaa2508472ea553f0f,2024-12-20,2024,2024-12-11,2024,37.81,0.00,0.00,Encounter for immunization,Subcutaneous administration of a combination v...,Professional,Preventive Immunizations (P41),0.00,OTHER,Persons with potential health hazards related ...,Encounter for immunization,Encounter for Preventive Health Services,0,0,0,0,0,0,0
3878012,13297263.00,mma-32ea1ce8d5bc4dc090887ad507bae96c,2024-12-26,2024,2024-12-23,2024,21.26,1.00,1.00,Local infection of the skin and subcutaneous t...,Established patient office or other outpatient...,Professional,Specialist (P32d),0.00,OTHER,Infections of the skin and subcutaneous tissue,Other local infections of skin and subcutaneou...,Other Inflammations and Infections of Skin and...,0,0,1,0,0,0,0


In [74]:
member_controlled = member[member['MEMBER_ID'].isin(controlled_members)]
member_controlled

Unnamed: 0,index,MEMBER_ID,MEMBER_STATUS,HOME_STATE,MEMBER_RELATIONSHIP,GENDER,Age,GENERATIONS
1,1,mma-036ffc20ff794595a44950a4729f66d3,0.00,LA,Subscriber,Female,62,Baby Boomers
4,17,mma-ca2e959e4c094ddab3a345d0d94d2f3e,2.00,GA,Subscriber,Female,65,Baby Boomers
5,19,mma-ed99a4f01306482caec74ec3d315501c,0.00,IL,Spouse,Female,38,Millenials
6,31,mma-b65786b24666471bb34a35f338d78efc,0.00,PA,Subscriber,Female,55,Generation X
9,41,mma-5c6f80aabb5140e3ad2916c50cfd7cd6,0.00,NC,Subscriber,Female,38,Millenials
...,...,...,...,...,...,...,...,...
1406,11107,mma-a9532facc59f46cea6136cbd96973f0e,0.00,MA,Subscriber,Male,67,Baby Boomers
1450,11448,mma-0b57ccda483b499ab5e8617f0b032cfc,0.00,TX,Subscriber,Female,66,Baby Boomers
1724,13426,mma-4a037934af894450b235f6dcaeb2000a,0.00,WI,Subscriber,Female,30,Millenials
3416,26395,mma-fad37e4ec5734b2f9b20294eb5838eab,0.00,NH,Subscriber,Female,69,Baby Boomers


In [112]:
controlled_members_pdc = pd.read_csv('matched_pairs_controlled_members_pdc.csv')
controlled_members_pdc = controlled_members_pdc['MEMBER_ID'].unique()
member_controlled_pdc = member[member['MEMBER_ID'].isin(controlled_members_pdc)]
med_controlled_pdc = med[med['MEMBER_ID'].isin(controlled_members_pdc)]
rx_controlled_pdc = rx[rx['MEMBER_ID'].isin(controlled_members_pdc)]

In [75]:
import openpyxl

In [76]:
rx_controlled.to_excel('rx-controlled-claims-nofilter.xlsx', index=False, engine='openpyxl')
med_controlled.to_excel('med-controlled-claims-nofilter.xlsx', index=False, engine='openpyxl')
member_controlled.to_excel('member-controlled-claims-nofilter.xlsx', index=False, engine='openpyxl')

In [115]:
rx_controlled_pdc.to_excel('rx_controlled_pdc.xlsx', index=False, engine='openpyxl')
med_controlled_pdc.to_excel('med_controlled_pdc.xlsx', index=False, engine='openpyxl')
member_controlled_pdc.to_excel('member_controlled_pdc.xlsx', index=False, engine='openpyxl')