In [4]:
import pandas as pd
import numpy as np
import warnings
# Silence FutureWarning messages from here on so they do not clutter cell output.
warnings.filterwarnings("ignore", category=FutureWarning)


In [5]:
# Raw diagnosis records for the test split. Later cells read the columns
# patient_id, condition_name, condition_description and is_chronic from it.
diagnosis_test = pd.read_csv(
    '../../pcms_hackathon_data/test/diagnosis.csv'
)

In [8]:
# One-hot encode every diagnosis row: one `cond_<name>` indicator per distinct
# condition_name and one `desc_<text>` indicator per distinct
# condition_description.
cond_name_dummies = pd.get_dummies(
    diagnosis_test['condition_name'],
    prefix='cond',
)
cond_desc_dummies = pd.get_dummies(
    diagnosis_test['condition_description'],
    prefix='desc',
)

# Put patient_id next to the indicators (all three frames share the row index
# of diagnosis_test), then collapse to one row per patient. Taking the max of
# each indicator marks it as set when any of the patient's rows had that value.
diagnosis_binary = (
    pd.concat(
        [diagnosis_test[['patient_id']], cond_name_dummies, cond_desc_dummies],
        axis=1,
    )
    .groupby('patient_id')
    .max()
    .reset_index()
)

# Per-patient summary counts:
#   num_conditions         - number of distinct condition_name values
#   num_chronic_conditions - number of diagnosis rows whose is_chronic is 't'
# Assumes is_chronic is stored as the strings 't' / 'f' - TODO confirm.
# NOTE(review): the chronic count is per row, not per distinct condition, so it
# can exceed num_conditions when a patient has several chronic rows for the
# same condition - confirm this matches how the training features were built.
diagnosis_agg = (
    diagnosis_test
    .groupby('patient_id')
    .agg(
        num_conditions=('condition_name', 'nunique'),
        num_chronic_conditions=(
            'is_chronic',
            lambda flags: flags.eq('t').sum(),
        ),
    )
    .reset_index()
)

# Start the final feature table from the per-patient counts and attach the
# one-hot indicators; any NaN left by the left merge becomes 0.
diagnosis_final_test = (
    diagnosis_agg
    .merge(diagnosis_binary, on='patient_id', how='left')
    .fillna(0)
)

# Flag patients for whom any column with 'CANCER' in its name is set. At this
# point only the cond_* / desc_* indicator columns can match that substring.
cancer_indicator_total = diagnosis_final_test.filter(like='CANCER').sum(axis=1)
diagnosis_final_test['has_cancer_history'] = cancer_indicator_total.gt(0).astype(int)

# Share of chronic rows relative to distinct conditions. np.where evaluates the
# division for every row and substitutes 0 wherever num_conditions is 0.
n_conditions = diagnosis_final_test['num_conditions']
n_chronic = diagnosis_final_test['num_chronic_conditions']
diagnosis_final_test['chronic_condition_ratio'] = np.where(
    n_conditions > 0,
    n_chronic / n_conditions,
    0,
)

# Friendly names for the condition-count columns that must always be present
# in the final table.
rename_map = {
    'CANCER': 'cancer_count',
    'DIABETES': 'diabetes_count',
    'HYPERTENSION': 'hypertension_count'
}

# Number of diagnosis rows per (patient, condition_name), pivoted so that each
# condition name becomes its own column; combinations that never occur are 0.
condition_counts = (
    diagnosis_test
    .groupby(['patient_id', 'condition_name'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
    .rename(columns=rename_map)
)

diagnosis_final_test = diagnosis_final_test.merge(condition_counts, on='patient_id', how='left')

# A patient that has no row in condition_counts (e.g. every condition_name is
# null, which groupby drops) gets NaN in *every* merged count column and turns
# those columns into floats. Restore integer counts for all merged count
# columns, not only the three renamed ones. Columns that pandas had to suffix
# because of a name clash are skipped, since their original name is gone.
merged_count_cols = [
    col for col in condition_counts.columns
    if col != 'patient_id' and col in diagnosis_final_test.columns
]
for col in merged_count_cols:
    diagnosis_final_test[col] = diagnosis_final_test[col].fillna(0).astype(int)

# Guarantee the three named count columns exist even when the corresponding
# condition never appears in this data set.
for col in rename_map.values():
    if col not in diagnosis_final_test.columns:
        diagnosis_final_test[col] = 0


# The training feature table defines the column set and order the test
# feature table has to follow.
train_diagnosis = pd.read_csv('../../Diagnosis_Analysis/diagnosis_final.csv')

# Training columns that the test data did not produce; kept as a variable so
# it can be inspected after the cell runs.
missing_cols = set(train_diagnosis.columns) - set(diagnosis_final_test.columns)

# Align to the training schema in one step: columns absent from the test table
# are created and filled with 0, columns unknown to the training table are
# dropped, and the result follows the training column order. fill_value only
# applies to newly created columns; existing values are left untouched.
diagnosis_final_test = diagnosis_final_test.reindex(
    columns=train_diagnosis.columns,
    fill_value=0,
)

# Persist the aligned test features next to the notebook.
diagnosis_final_test.to_csv("test_diagnosis_final.csv", index=False)

print("Shape:", diagnosis_final_test.shape)

Shape: (648, 18)
