In [1]:
import pandas as pd
import glob
import os

In [2]:
# Root of the local oncoexporter cache, relative to this notebook's folder.
base_path = '../../src/fetch_tissue_data/.oncoexporter_cache/'

# '**' combined with recursive=True walks every nested level under the root.
glob_pattern = os.path.join(base_path, '**', '*')

# Materialise the lazy iterator so later cells can loop over the paths freely.
files = list(glob.iglob(glob_pattern, recursive=True))

In [3]:
keyword = 'Lung'
dataframes = {}

# Load every cached pickle whose path mentions the keyword, keyed by a short
# table name: the file name with '.pkl', '_df' and '<keyword>_' stripped
# (a file called 'Lung_diagnosis_df.pkl' would be stored under 'diagnosis').
# NOTE(review): unpickling can execute arbitrary code -- only point this at
# cache files produced by a trusted run.
for file in files:
    # The recursive '**/*' glob yields directories as well as files; a
    # directory whose path contains the keyword must not reach read_pickle.
    if keyword not in file or not os.path.isfile(file):
        continue
    # basename() honours the platform's path separator, unlike split('/').
    df_name = os.path.basename(file).replace('.pkl', '').replace('_df', '').replace(f'{keyword}_', '')
    dataframes[df_name] = pd.read_pickle(file)

In [4]:
# Table names that must have been loaded into `dataframes`.
required_keys = [
    'mutation',
    'diagnosis',
    'individual',
    'rsub',
    'specimen',
    'treatment',
]

# Preserve the order of `required_keys` so the report is stable.
missing_keys = [name for name in required_keys if name not in dataframes]

if not missing_keys:
    print("All required keys are present.")
else:
    print(f"Missing dataframes for keys: {missing_keys}")

All required keys are present.


## Step 1: Merge diagnosis and individual dataframes

In [5]:
# Left join: every diagnosis row is kept and gains the matching individual
# columns; clashing column names get '_diag' / '_ind' suffixes.
merged_df = pd.merge(
    left=dataframes['diagnosis'],
    right=dataframes['individual'],
    how='left',
    on='subject_id',
    suffixes=('_diag', '_ind'),
)

## Step 2: Merge with research subject dataframe

In [6]:
# Left join on the research-subject key; columns already present keep their
# name and the clashing ones from the right-hand table get '_rsub'.
merged_df = merged_df.merge(
    dataframes['rsub'],
    how='left',
    on='researchsubject_id',
    suffixes=('', '_rsub'),
)

## Step 3: Merge with specimen dataframe

In [7]:
# Left join on the research-subject key; clashing columns from the specimen
# table are suffixed with '_spec'.
merged_df = merged_df.merge(
    dataframes['specimen'],
    how='left',
    on='researchsubject_id',
    suffixes=('', '_spec'),
)

## Step 4: Merge with treatment dataframe

In [8]:
# Left join on the research-subject key; clashing columns from the treatment
# table are suffixed with '_treat'.
merged_df = merged_df.merge(
    dataframes['treatment'],
    how='left',
    on='researchsubject_id',
    suffixes=('', '_treat'),
)

## Step 5: Merge with mutation dataframe

In [9]:
# The mutation table names its subject key 'cda_subject_id', so the join
# columns are given separately for each side; clashes get '_mut'.
merged_df = merged_df.merge(
    dataframes['mutation'],
    how='left',
    left_on='subject_id',
    right_on='cda_subject_id',
    suffixes=('', '_mut'),
)

## Get unique combinations of primary diagnosis, condition, and site

In [10]:
# Columns that together define one diagnosis combination.
combination_columns = [
    'primary_diagnosis',
    'primary_diagnosis_condition',
    'primary_diagnosis_site',
]

# Keep the first occurrence of each combination and renumber the rows from 0.
unique_rows_df = (
    merged_df[combination_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [11]:
# Show the de-duplicated diagnosis combinations as this cell's output.
unique_rows_df

Unnamed: 0,primary_diagnosis,primary_diagnosis_condition,primary_diagnosis_site
0,"Squamous cell carcinoma, NOS",Squamous Cell Neoplasms,Bronchus and lung
1,"Adenocarcinoma, NOS",Adenomas and Adenocarcinomas,Bronchus and lung
2,"Small cell carcinoma, NOS","Epithelial Neoplasms, NOS",Bronchus and lung
3,Non-small cell carcinoma,"Epithelial Neoplasms, NOS",Bronchus and lung
4,Large cell neuroendocrine carcinoma,"Epithelial Neoplasms, NOS",Bronchus and lung
...,...,...,...
100,Large cell neuroendocrine carcinoma,"Neoplasms, NOS",Bronchus and lung
101,Mixed type rhabdomyosarcoma,Myomatous Neoplasms,Bronchus and lung
102,Lymphangiomyomatosis,Lymphatic Vessel Tumors,Bronchus and lung
103,Poorly differentiated pulmonary adenocarcinoma,"Adenocarcinoma, NOS","Lung, NOS"


In [12]:
# Export the unique combinations for this keyword. The results folder is
# created first because to_csv raises OSError when the target directory does
# not exist (e.g. on a fresh checkout).
output_path = f'../CombinationsForTissues/results/{keyword}_unique_combination.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# The index is written too (to_csv's default), so the CSV's first column is
# the row number; kept as-is so existing readers of the file are unaffected.
unique_rows_df.to_csv(output_path)