In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw care table for the test split. The path is relative to the
# directory the notebook is run from.
care_csv_path = '../../pcms_hackathon_data/test/care.csv'
care_df = pd.read_csv(care_csv_path)
print("care_df shape:", care_df.shape)

care_df shape: (107, 8)


In [2]:
# Parse both care-date columns in place; values that cannot be parsed become
# NaT (errors='coerce') instead of raising.
for date_col in ('last_care_dt', 'next_care_dt'):
    care_df[date_col] = pd.to_datetime(care_df[date_col], errors='coerce')

# Row-level flag: 1 when a next-care date is present after parsing, else 0.
care_df['has_next_care'] = care_df['next_care_dt'].notna().astype(int)

# One indicator column per distinct measurement type / sub-type.
type_dummies = pd.get_dummies(care_df['msrmnt_type'], prefix='type')
subtype_dummies = pd.get_dummies(care_df['msrmnt_sub_type'], prefix='subtype')

# Collapse to one row per patient: taking the max of each indicator marks a
# category as present if any of the patient's rows carried it.
care_binary = (
    pd.concat([care_df[['patient_id']], type_dummies, subtype_dummies], axis=1)
    .groupby('patient_id')
    .max()
    .reset_index()
)

# Count how many distinct types / sub-types each patient has by summing the
# per-patient indicator columns row-wise.
indicator_names = care_binary.columns
type_cols = indicator_names[indicator_names.str.startswith('type_')].tolist()
subtype_cols = indicator_names[indicator_names.str.startswith('subtype_')].tolist()
care_binary['count_msrmnt_type'] = care_binary[type_cols].sum(axis=1)
care_binary['count_msrmnt_sub_type'] = care_binary[subtype_cols].sum(axis=1)


def _count_true_gap_flags(flags):
    """Return how many entries of a care_gap_ind group equal the string 't'."""
    return flags.eq('t').sum()


# Per-patient aggregation recipe, keyed by source column.
care_agg_spec = {
    'care_id': 'count',                     # non-null care_id values per patient
    'msrmnt_value': 'median',               # median measurement value
    'last_care_dt': 'min',                  # earliest last-care date
    'next_care_dt': 'max',                  # latest next-care date
    'care_gap_ind': _count_true_gap_flags,  # rows flagged 't'
    'has_next_care': 'max',                 # 1 if any row has a next-care date
}

# Output feature names. NOTE: 'avg_msrmnt_value' holds the per-patient
# *median* (see the recipe above); the name is kept as-is because it is part
# of the exported column schema.
care_agg_names = {
    'care_id': 'num_care_events',
    'msrmnt_value': 'avg_msrmnt_value',
    'care_gap_ind': 'num_care_gaps',
}

care_agg = (
    care_df
    .groupby('patient_id')
    .agg(care_agg_spec)
    .reset_index()
    .rename(columns=care_agg_names)
)

# Presence flags as plain 0/1 integers.
care_agg['has_last_care'] = care_agg['last_care_dt'].notna().astype(int)
care_agg['has_next_care'] = care_agg['has_next_care'].astype(int)

# Attach the per-patient type/sub-type indicators and counts.
care_final = care_agg.merge(care_binary, on='patient_id', how='left')

# Replace missing values in the numeric summary columns with 0; the date
# columns are deliberately not in this list.
zero_filled_cols = [
    'num_care_events',
    'avg_msrmnt_value',
    'num_care_gaps',
    'has_next_care',
    'count_msrmnt_type',
    'count_msrmnt_sub_type',
]
care_final = care_final.fillna(dict.fromkeys(zero_filled_cols, 0))


# Whole days between each patient's last_care_dt and next_care_dt as stored in
# care_final. When either date is missing the difference is NaT, which shows
# up as NaN after .dt.days and is mapped to 0 here.
care_span = care_final['next_care_dt'] - care_final['last_care_dt']
care_final['care_duration_days'] = care_span.dt.days.replace([np.inf, np.nan], 0)


# Number of distinct (non-null) measurement types and sub-types per patient.
diversity_names = {
    'msrmnt_type': 'msrmnt_type_diversity',
    'msrmnt_sub_type': 'msrmnt_subtype_diversity',
}
care_div = (
    care_df
    .groupby('patient_id')
    .agg({'msrmnt_type': 'nunique', 'msrmnt_sub_type': 'nunique'})
    .reset_index()
    .rename(columns=diversity_names)
)

# Left join keeps every patient already present in care_final.
care_final = care_final.merge(care_div, on='patient_id', how='left')

# Keep care_df ordered by patient and last-care date (the reassignment is
# preserved in case later cells rely on the ordering).
care_df = care_df.sort_values(['patient_id', 'last_care_dt'])

# Per-record gap, in whole days, between a row's own next_care_dt and its own
# last_care_dt. The previous version routed next_care_dt through
# groupby('patient_id').shift(0); a shift of 0 periods returns the values
# unchanged, so the grouping only obscured that this is a same-row difference
# and not a comparison with the patient's following record.
# NOTE(review): if "time to the patient's next record" was intended, this
# needs a shift(-1) on last_care_dt instead - confirm with the feature owner.
care_df['days_until_next_care'] = (
    care_df['next_care_dt'] - care_df['last_care_dt']
).dt.days

# Median gap per patient; patients with no computable gap get 0.
days_next = (
    care_df.groupby('patient_id')['days_until_next_care']
    .median()
    .fillna(0)
    .reset_index()
)
care_final = care_final.merge(days_next, on='patient_id', how='left')


# 'weighted_care_score' is the plain (unweighted) per-patient mean of
# msrmnt_value; when the column is absent the feature is set to a constant 0.
if 'msrmnt_value' not in care_df.columns:
    care_final['weighted_care_score'] = 0
else:
    mean_value_by_patient = care_df.groupby('patient_id')['msrmnt_value'].mean()
    weighted = mean_value_by_patient.rename('weighted_care_score').reset_index()
    care_final = care_final.merge(weighted, on='patient_id', how='left')

# Fill any remaining gaps in the derived features with 0.
derived_feature_cols = [
    'msrmnt_type_diversity',
    'msrmnt_subtype_diversity',
    'days_until_next_care',
    'weighted_care_score',
]
care_final = care_final.fillna(dict.fromkeys(derived_feature_cols, 0))


# Convert the two date columns to whole seconds since the Unix epoch (float64).
#
# The previous implementation used astype('int64', errors='ignore') // 1e9,
# which mishandles missing dates (NaT): depending on the pandas version the
# cast either emits the int64-min sentinel (a large negative bogus timestamp)
# or fails and is silently ignored, after which the floor division is applied
# to a datetime column. It also assumed nanosecond resolution.
# Subtracting the epoch and using total_seconds() is resolution-independent
# and maps NaT to NaN, so missing dates stay missing in the exported file
# (has_last_care / has_next_care already record presence).
# Assumes the parsed dates are timezone-naive - TODO confirm against the CSV.
unix_epoch = pd.Timestamp('1970-01-01')
for col in ['last_care_dt', 'next_care_dt']:
    parsed_dates = pd.to_datetime(care_final[col], errors='coerce')
    seconds_since_epoch = (parsed_dates - unix_epoch).dt.total_seconds()
    # Floor to whole seconds to match the original floor-division semantics.
    care_final[col] = np.floor(seconds_since_epoch)


# Write the per-patient feature table to the current working directory
# (row index omitted), then print a short summary of what was written.
output_csv_name = "test_care_final.csv"
care_final.to_csv(output_csv_name, index=False)

print("\nTest care table aggregated successfully!")
print("Shape:", care_final.shape)
print("Columns:", list(care_final.columns))



Test care table aggregated successfully!
Shape: (79, 26)
Columns: ['patient_id', 'num_care_events', 'avg_msrmnt_value', 'last_care_dt', 'next_care_dt', 'num_care_gaps', 'has_next_care', 'has_last_care', 'type_LAB TEST', 'type_MEDICATION ADHERENCE', 'type_SCREENING', 'subtype_BREAST CANCER', 'subtype_CHOLESTEROL', 'subtype_COLORECTAL CANCER', 'subtype_DIABETES', 'subtype_DIASTOLIC BLOOD PRESSURE', 'subtype_HYPERTENSION', 'subtype_HbA1c', 'subtype_SYSTOLIC BLOOD PRESSURE', 'count_msrmnt_type', 'count_msrmnt_sub_type', 'care_duration_days', 'msrmnt_type_diversity', 'msrmnt_subtype_diversity', 'days_until_next_care', 'weighted_care_score']
