In [39]:
import pandas as pd
import glob
import os

In [40]:
# Root of the on-disk cache that the notebook reads its pickles from.
base_path = '../../src/fetch_tissue_data/.oncoexporter_cache/'

# With recursive=True, '**' spans any number of nested directories, so this
# pattern matches every entry (files and directories) below base_path.
glob_pattern = os.path.join(base_path, '**', '*')

# Materialise the lazy iterator so `files` can be iterated more than once.
files = list(glob.iglob(glob_pattern, recursive=True))

In [41]:
# Tissue keyword used to select cache files; it is also stripped from the
# file name when building the dictionary key.
keyword = 'Bone'
dataframes = {}

# Load every cached pickle whose path mentions the keyword. The dictionary key
# is the file name with the '.pkl' extension, the '_df' marker and the
# '<keyword>_' prefix removed.
for file in files:
    if keyword not in file:
        continue
    # The recursive glob also returns directories and any other files present
    # in the cache; pd.read_pickle would raise on those, so skip them.
    if not file.endswith('.pkl') or not os.path.isfile(file):
        continue
    # os.path.basename respects the platform's path separator, unlike split('/').
    df_name = os.path.basename(file).replace('.pkl', '').replace('_df', '').replace(f'{keyword}_', '')
    # NOTE(review): unpickling can execute arbitrary code; only load cache
    # files that this project produced itself.
    dataframes[df_name] = pd.read_pickle(file)

In [42]:
# Tables that the merge steps below look up in `dataframes`.
required_keys = ['mutation', 'diagnosis', 'individual', 'rsub', 'specimen', 'treatment']

# Collect the required names that were not loaded, preserving their order.
missing_keys = list(filter(lambda name: name not in dataframes, required_keys))

# Report the outcome; this only prints and does not stop execution.
if not missing_keys:
    print("All required keys are present.")
else:
    print(f"Missing dataframes for keys: {missing_keys}")

All required keys are present.


## Step 1: Merge diagnosis and individual dataframes

In [43]:
# Start from the diagnosis table and attach the individual table on
# subject_id. The left join keeps every diagnosis row; column names present
# in both tables receive the '_diag' / '_ind' suffixes.
merged_df = dataframes['diagnosis'].merge(
    dataframes['individual'],
    how='left',
    on='subject_id',
    suffixes=('_diag', '_ind'),
)

## Step 2: Merge with research subject dataframe

In [44]:
# Left-join the research-subject table on researchsubject_id. Column names
# already present on the left keep their name; the right-hand duplicates are
# suffixed with '_rsub'.
merged_df = merged_df.merge(
    dataframes['rsub'],
    how='left',
    on='researchsubject_id',
    suffixes=('', '_rsub'),
)

## Step 3: Merge with specimen dataframe

In [45]:
# Left-join the specimen table on researchsubject_id. Column names already
# present on the left keep their name; the right-hand duplicates are suffixed
# with '_spec'.
merged_df = merged_df.merge(
    dataframes['specimen'],
    how='left',
    on='researchsubject_id',
    suffixes=('', '_spec'),
)

## Step 4: Merge with treatment dataframe

In [46]:
# Left-join the treatment table on researchsubject_id. Column names already
# present on the left keep their name; the right-hand duplicates are suffixed
# with '_treat'.
merged_df = merged_df.merge(
    dataframes['treatment'],
    how='left',
    on='researchsubject_id',
    suffixes=('', '_treat'),
)

## Step 5: Merge with mutation dataframe

In [47]:
# Left-join the mutation table. The key columns are named differently on each
# side: subject_id in the merged frame, cda_subject_id in the mutation table.
# Right-hand columns that clash with existing names are suffixed with '_mut'.
merged_df = merged_df.merge(
    dataframes['mutation'],
    how='left',
    left_on='subject_id',
    right_on='cda_subject_id',
    suffixes=('', '_mut'),
)

## Get unique combinations of primary diagnosis, condition, and site

In [48]:
# Columns whose distinct value combinations are reported.
combination_columns = ['primary_diagnosis', 'primary_diagnosis_condition', 'primary_diagnosis_site']

# Keep the first occurrence of each combination, in order of appearance, and
# renumber the rows from 0.
diagnosis_columns_df = merged_df[combination_columns]
unique_rows_df = diagnosis_columns_df.drop_duplicates().reset_index(drop=True)

In [49]:
# Display the de-duplicated diagnosis / condition / site combinations.
unique_rows_df

Unnamed: 0,primary_diagnosis,primary_diagnosis_condition,primary_diagnosis_site
0,"Chondrosarcoma, NOS",Osseous and Chondromatous Neoplasms,"Bones, joints and articular cartilage of other..."
1,"Osteosarcoma, NOS",Osseous and Chondromatous Neoplasms,"Bones, joints and articular cartilage of other..."
2,Ewing sarcoma,Miscellaneous Bone Tumors,"Bones, joints and articular cartilage of other..."
3,"Osteosarcoma, NOS",Osseous and Chondromatous Neoplasms,"Bones, joints and articular cartilage of limbs"
4,"Chordoma, NOS",Miscellaneous Tumors,"Bones, joints and articular cartilage of other..."
5,Not specified in data,"Neoplasm, uncertain whether benign or malignant","Pelvic bones, sacrum, coccyx and associated jo..."
6,High grade surface osteosarcoma,Osseous and Chondromatous Neoplasms,"Bones, joints and articular cartilage of other..."
7,Undifferentiated sarcoma,"Soft Tissue Tumors and Sarcomas, NOS","Bones, joints and articular cartilage of other..."
8,Rhabdomyosarcoma,"Rhabdomyosarcoma, NOS","Bone, NOS"
9,Myxoid chondrosarcoma,Osseous and Chondromatous Neoplasms,"Bones, joints and articular cartilage of other..."


In [37]:
# Export the unique combinations to a CSV named after the keyword.
# to_csv does not create missing directories, so make sure the results folder
# exists first; exist_ok=True leaves an existing folder untouched.
results_dir = '../CombinationsForTissues/results'
os.makedirs(results_dir, exist_ok=True)

# The row index is still written as the first, unnamed CSV column.
unique_rows_df.to_csv(f'{results_dir}/{keyword}_unique_combination.csv')