In [1]:
import json
import pandas as pd

# Keys to pull out of every patient record, wherever they are nested.
# Listed alphabetically so additions and removals are easy to spot.
required_attributes = {
    "age_at_diagnosis",
    "age_at_index",
    "ajcc_pathologic_stage",
    "alcohol_history",
    "case_id",
    "cigarettes_per_day",
    "classification_of_tumor",
    "days_to_death",
    "days_to_diagnosis",
    "days_to_last_follow_up",
    "days_to_recurrence",
    "days_to_treatment_end",
    "days_to_treatment_start",
    "ethnicity",
    "gender",
    "metastasis_at_diagnosis",
    "primary_diagnosis",
    "prior_malignancy",
    "prior_treatment",
    "progression_or_recurrence",
    "race",
    "tumor_grade",
    "vital_status",
    "years_smoked",
}

def extract_attributes(data, result, attributes=None):
    """Collect selected key/value pairs from arbitrarily nested JSON data.

    Walks ``data`` depth-first through nested dicts and lists. Whenever a
    dict key is one of the wanted attributes, its value is stored in
    ``result`` under that key.

    Note that ``result`` is flat: if the same key occurs more than once in
    the structure (for example in several elements of a nested list), the
    occurrence visited last silently overwrites the earlier ones.

    Args:
        data: Parsed JSON value (dict, list, or scalar). Scalars are ignored.
        result: Dict that is updated in place with the matches found.
        attributes: Container of key names to extract. Defaults to the
            module-level ``required_attributes`` when omitted.

    Returns:
        None. Matches are written into ``result``.
    """
    # Resolve the default once; recursive calls always pass the resolved
    # container, so an explicitly supplied empty container is respected.
    if attributes is None:
        attributes = required_attributes

    if isinstance(data, dict):
        for key, value in data.items():
            if key in attributes:
                result[key] = value
            # Descend even into a matched value: wanted keys may be nested
            # inside it as well.
            extract_attributes(value, result, attributes)
    elif isinstance(data, list):
        for item in data:
            extract_attributes(item, result, attributes)

# Load the cohort export. JSON text is UTF-8, so decode it explicitly rather
# than relying on the platform's default encoding.
with open('../datasets/ml-final-project/clinical.cohort.2025-02-13_updated.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

patients_data = []

# Loop through the array of patients and flatten each record into one dict
# holding only the required attributes.
for patient in data:
    extracted_data = {}
    extract_attributes(patient, extracted_data)
    patients_data.append(extracted_data)

# One row per patient; attributes missing from a record show up as NaN.
df = pd.DataFrame(patients_data)

print(df.head())

# Optional export of the unfiltered table:
# df.to_csv('clinical_data_extracted.csv', index=False)

# Keep only rows where 'metastasis_at_diagnosis' is not NaN.
# Stricter alternative that additionally requires 'days_to_treatment_end':
# filtered_df = df[df['metastasis_at_diagnosis'].notna() & df['days_to_treatment_end'].notna()]
filtered_df = df[df['metastasis_at_diagnosis'].notna()]

print(filtered_df.head())

# Optional export of the filtered table:
# filtered_df.to_csv('filtered_clinical_data.csv', index=False)

                                case_id  age_at_diagnosis  \
0  00016c8f-a0be-4319-9c42-4f3bcd90ac92           14656.0   
1  00048fa6-4318-42ef-9709-7dedb0d938b3           20375.0   
2  0004d251-3f70-4395-b175-c94c2f5b1b81           18736.0   
3  00061f34-c891-4f9c-b8d6-3ca68b98c875           23207.0   
4  0008bdfb-24a3-50fa-b112-89966d6ca423             505.0   

               primary_diagnosis classification_of_tumor   tumor_grade  \
0                 Carcinoma, NOS              metastasis  Not Reported   
1            Adenocarcinoma, NOS                 primary  Not Reported   
2  Hepatocellular carcinoma, NOS              recurrence            G1   
3       Duct adenocarcinoma, NOS              metastasis  Not Reported   
4   Clear cell sarcoma of kidney                 primary           NaN   

  progression_or_recurrence               ethnicity  gender          race  \
0              not reported            not reported  female  not reported   
1              not reported       