In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the raw visit-level records for the test split into a DataFrame.
visit_df = pd.read_csv(
    filepath_or_buffer='../../pcms_hackathon_data/test/visit.csv',
)

In [3]:

# Parse the visit start/end columns as datetimes; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
visit_df['visit_start_dt'] = pd.to_datetime(visit_df['visit_start_dt'], errors='coerce')
visit_df['visit_end_dt'] = pd.to_datetime(visit_df['visit_end_dt'], errors='coerce')

# One indicator column per distinct visit_type value, named 'visit_type_<value>'.
# dtype=int keeps these columns the same dtype as the zero-filled ones added below.
visit_type_dummies = pd.get_dummies(visit_df['visit_type'], prefix='visit_type', dtype=int)

# The per-patient aggregation sums these three columns by name, so make sure
# they exist even when a visit type never occurs in this file.
expected_visit_types = ['visit_type_ER', 'visit_type_URGENT CARE', 'visit_type_INPATIENT']
for col in expected_visit_types:
    if col not in visit_type_dummies.columns:
        visit_type_dummies[col] = 0

# Drop any indicator columns left on visit_df by a previous run of this cell
# before attaching the fresh ones. Without this, re-running the cell duplicates
# the column labels and the by-name aggregation no longer sees a single column
# per visit type.
visit_df = pd.concat(
    [visit_df.drop(columns=visit_type_dummies.columns, errors='ignore'), visit_type_dummies],
    axis=1,
)

# Collapse the visit-level rows into one row per patient. Named aggregation
# assigns the final feature names directly, so no separate rename is needed.
visit_final = (
    visit_df
    .groupby('patient_id')
    .agg(
        num_visits=('visit_id', 'count'),                    # non-null visit ids
        num_er_visits=('visit_type_ER', 'sum'),
        num_urgent_visits=('visit_type_URGENT CARE', 'sum'),
        num_inpatient_visits=('visit_type_INPATIENT', 'sum'),
        num_readmissions=('readmsn_ind', 'sum'),
        first_visit_dt=('visit_start_dt', 'min'),            # earliest visit start
        last_visit_dt=('visit_end_dt', 'max'),               # latest visit end
    )
    .reset_index()
)


# Whole days between a patient's earliest visit start and latest visit end.
# When either date is missing the difference is NaT, which is recorded as 0.
visit_span = visit_final['last_visit_dt'] - visit_final['first_visit_dt']
visit_final['days_between_first_last_visit'] = visit_span.dt.days.fillna(0)

# Force every count-like column to a numeric dtype; anything that cannot be
# parsed as a number (and any remaining NaN) becomes 0.
numeric_cols = [
    'num_visits',
    'num_er_visits',
    'num_urgent_visits',
    'num_inpatient_visits',
    'num_readmissions',
    'days_between_first_last_visit',
]
visit_final = visit_final.assign(**{
    name: pd.to_numeric(visit_final[name], errors='coerce').fillna(0)
    for name in numeric_cols
})



# NOTE(review): num_follow_ups is a straight copy of num_readmissions, so
# followup_ratio below is effectively a readmission ratio — confirm intended.
visit_final['num_follow_ups'] = visit_final['num_readmissions']

# Share of each patient's visits that fall into a given category. Patients
# with no counted visits get a ratio of 0 instead of a division result.
has_visits = visit_final['num_visits'] > 0
ratio_sources = {
    'inpatient_visit_ratio': 'num_inpatient_visits',
    'followup_ratio': 'num_follow_ups',
    'er_visit_ratio': 'num_er_visits',
}
for ratio_col, count_col in ratio_sources.items():
    visit_final[ratio_col] = np.where(
        has_visits,
        visit_final[count_col] / visit_final['num_visits'],
        0,
    )

# Visits per 30-day month over the first-to-last-visit span. When the span is
# not positive, the raw visit count is used as the fallback value.
span_days = visit_final['days_between_first_last_visit']
visit_final['visit_frequency_per_month'] = np.where(
    span_days > 0,
    visit_final['num_visits'] / (span_days / 30.0),
    visit_final['num_visits'],
)

# Replace non-finite and missing values with 0 in the numeric columns only.
# A frame-wide fillna(0) would also hit first_visit_dt / last_visit_dt and put
# a literal 0 where a date is missing, leaving those date columns with mixed
# content in the saved file. Missing dates stay NaT and are written to the CSV
# as empty fields.
feature_cols = visit_final.select_dtypes(include='number').columns
visit_final[feature_cols] = (
    visit_final[feature_cols]
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Persist the per-patient feature table (row index omitted).
visit_final.to_csv('test_visit_final.csv', index=False)

print("Shape:", visit_final.shape)
print("Columns:", visit_final.columns.tolist())


Shape: (767, 14)
Columns: ['patient_id', 'num_visits', 'num_er_visits', 'num_urgent_visits', 'num_inpatient_visits', 'num_readmissions', 'first_visit_dt', 'last_visit_dt', 'days_between_first_last_visit', 'num_follow_ups', 'inpatient_visit_ratio', 'followup_ratio', 'er_visit_ratio', 'visit_frequency_per_month']
