In [1]:
import pandas as pd
import glob
import os

In [2]:
# Location of the oncoexporter pickle cache, relative to this notebook.
base_path = '../../src/fetch_tissue_data/.oncoexporter_cache/'

# '**' combined with recursive=True matches entries at any depth under base_path.
glob_pattern = os.path.join(base_path, '**', '*')

# Materialise the lazy iterator so later cells can loop over a plain list.
files = list(glob.iglob(glob_pattern, recursive=True))

In [3]:
# Tissue keyword used to select cached pickles. It is also stripped from the
# file name when deriving each dataframe's dictionary key.
keyword = 'Heart'
dataframes = {}

for file in files:
    # The recursive '**/*' glob also returns directories. A directory whose path
    # contains the keyword cannot be unpickled, so skip anything that is not a
    # regular file.
    if keyword not in file or not os.path.isfile(file):
        continue

    # os.path.basename honours the platform's path separator, unlike split('/').
    # Strip the extension, the '_df' marker and the '<keyword>_' prefix to get
    # the dictionary key.
    df_name = (
        os.path.basename(file)
        .replace('.pkl', '')
        .replace('_df', '')
        .replace(f'{keyword}_', '')
    )

    # NOTE(review): unpickling can execute arbitrary code; only load from a
    # cache directory you trust.
    dataframes[df_name] = pd.read_pickle(file)

In [4]:
# Every table used by the merge steps below must have been loaded.
required_keys = ['mutation', 'diagnosis', 'individual', 'rsub', 'specimen', 'treatment']

# Keep the order of required_keys so the report is stable.
missing_keys = list(filter(lambda key: key not in dataframes, required_keys))

print(
    f"Missing dataframes for keys: {missing_keys}"
    if missing_keys
    else "All required keys are present."
)

All required keys are present.


## Step 1: Merge diagnosis and individual dataframes

In [5]:
# Left join: every diagnosis row is kept and individual-level columns are
# attached by subject_id. Columns present in both frames are disambiguated with
# the '_diag' / '_ind' suffixes.
merged_df = pd.merge(
    left=dataframes['diagnosis'],
    right=dataframes['individual'],
    how='left',
    on='subject_id',
    suffixes=('_diag', '_ind'),
)

## Step 2: Merge with research subject dataframe

In [6]:
# Left join the research-subject table on researchsubject_id. Columns that
# already exist in merged_df keep their name, and the incoming duplicates get
# the '_rsub' suffix.
merged_df = merged_df.merge(
    dataframes['rsub'],
    how='left',
    on='researchsubject_id',
    suffixes=('', '_rsub'),
)

## Step 3: Merge with specimen dataframe

In [7]:
# Left join the specimen table on researchsubject_id. Clashing specimen columns
# are suffixed with '_spec'.
merged_df = merged_df.merge(
    dataframes['specimen'],
    how='left',
    on='researchsubject_id',
    suffixes=('', '_spec'),
)

## Step 4: Merge with treatment dataframe

In [8]:
# Left join the treatment table on researchsubject_id. Clashing treatment
# columns are suffixed with '_treat'.
merged_df = merged_df.merge(
    dataframes['treatment'],
    how='left',
    on='researchsubject_id',
    suffixes=('', '_treat'),
)

## Step 5: Merge with mutation dataframe

In [9]:
# Left join the mutation table. The key is named differently on each side:
# 'subject_id' in merged_df and 'cda_subject_id' in the mutation frame.
# Clashing mutation columns are suffixed with '_mut'.
merged_df = merged_df.merge(
    dataframes['mutation'],
    how='left',
    left_on='subject_id',
    right_on='cda_subject_id',
    suffixes=('', '_mut'),
)

## Get unique combinations of primary diagnosis, condition, and site

In [10]:
# Columns that together identify one diagnosis combination.
combination_cols = ['primary_diagnosis', 'primary_diagnosis_condition', 'primary_diagnosis_site']

# Keep the first row of each distinct combination, renumber the index from 0,
# then keep only the three combination columns.
unique_rows_df = (
    merged_df
    .drop_duplicates(subset=combination_cols)
    .reset_index(drop=True)
    [combination_cols]
)

In [11]:
# Display the deduplicated diagnosis / condition / site combinations.
unique_rows_df

Unnamed: 0,primary_diagnosis,primary_diagnosis_condition,primary_diagnosis_site
0,"Germ cell tumor, NOS",Germ Cell Neoplasms,"Heart, mediastinum, and pleura"
1,"Mesothelioma, NOS",Mesothelial Neoplasms,"Heart, mediastinum, and pleura"
2,"Epithelioid mesothelioma, malignant",Mesothelial Neoplasms,"Heart, mediastinum, and pleura"
3,"Mesothelioma, biphasic, malignant",Mesothelial Neoplasms,"Heart, mediastinum, and pleura"
4,"Mesothelioma, malignant",Mesothelial Neoplasms,"Heart, mediastinum, and pleura"
5,"Neuroblastoma, NOS",Neuroepitheliomatous Neoplasms,"Heart, mediastinum, and pleura"
6,"Thymoma, type B2, malignant",Thymic Epithelial Neoplasms,"Heart, mediastinum, and pleura"
7,Neuroblastoma,"Neuroblastoma, NOS","Heart, mediastinum, and pleura"
8,"Thymoma, type A, malignant",Thymic Epithelial Neoplasms,"Heart, mediastinum, and pleura"
9,Ganglioneuroblastoma,Neuroepitheliomatous Neoplasms,"Heart, mediastinum, and pleura"


In [37]:
# Write the unique combinations to a CSV file named after the tissue keyword.
# NOTE(review): with to_csv's default index=True the row index is written as an
# unnamed first column — confirm downstream readers expect that.
output_csv = f'../CombinationsForTissues/results/{keyword}_unique_combination.csv'
unique_rows_df.to_csv(path_or_buf=output_csv)