In [5]:
%pylab inline
import pandas as pd
from glob2 import glob
from pathlib import Path
from sqlalchemy import create_engine
from common.utils import init_cached_database, parse_mongodb_connection_string
from src.utils import get_config
import os

Populating the interactive namespace from numpy and matplotlib


In [6]:
# Load the settings stored under the 'brca-database' key via the project helper;
# they are used further down to build the MongoDB connection and pick the database name.
config = get_config('brca-database')

2021-07-25 12:30:09.516 | DEBUG    | src.utils:get_config:10 - Reading brca-database from /home/ohad/Projects/ohad/src/../config.toml


In [7]:
# Collect every Excel workbook found directly inside the clinical-data folder.
xlsx_pattern = "/data/Database/Human/TCGA-BRCA_Online/Clinical Data (CSV)/*.xlsx"
xlsx_files = glob(xlsx_pattern)


In [8]:
# Open the project database handle in two steps: first turn the loaded settings into
# connection details, then open the database whose name is stored in config['db_name'].
mongo_connection = parse_mongodb_connection_string(**config)
db = init_cached_database(mongo_connection, db_name=config['db_name'])

In [17]:
def split_name(name):
    """Derive a collection name from a workbook path.

    Takes the file name (directories dropped), keeps the text after the last
    'org_' marker (or the whole file name when the marker is absent), and cuts
    it at the first '.xlsx'.
    """
    filename = Path(name).name
    # rpartition returns the full string as the tail when 'org_' does not occur.
    after_marker = filename.rpartition('org_')[2]
    # partition returns the full string as the head when '.xlsx' does not occur.
    return after_marker.partition('.xlsx')[0]

def replace_not_available_with_none(item):
    """Replace placeholder strings in a record with None.

    Any value equal to '[Not Available]' or '[Not Applicable]' is set to None.
    The dictionary is modified in place and the same object is returned.
    """
    placeholders = ('[Not Available]', '[Not Applicable]')
    # Only existing keys are reassigned, so the dict size never changes while iterating.
    for key in item:
        if item[key] in placeholders:
            item[key] = None
    return item


In [20]:
# Load every clinical workbook into its own collection, named after the workbook file.
for xlsx_file in xlsx_files:
    table_name = split_name(xlsx_file)
    # Drop the first two data rows of the sheet before ingestion.
    # NOTE(review): presumably these are extra header/description rows in the export — confirm.
    df = pd.read_excel(xlsx_file, engine='openpyxl')[2:]
    # One dict per remaining row, with the "[Not Available]"-style placeholders mapped to None.
    items = [
        replace_not_available_with_none(row.to_dict())
        for _, row in df.iterrows()
    ]
    # pymongo's insert_many rejects an empty document list, so a workbook with no
    # remaining rows is reported and skipped instead of aborting the whole loop.
    if not items:
        print(f'Skipping {table_name}: no rows to insert')
        continue
    db[table_name].insert_many(items)



In [19]:
# Peek at the first record. `items` is reassigned on every loop iteration above,
# so this shows a row from the last workbook that was processed.
items[0]

{'bcr_patient_uuid': 'e17c565c-9857-4df3-a352-903b53093c85',
 'bcr_patient_barcode': 'TCGA-AO-A0JI',
 'bcr_omf_barcode': 'TCGA-AO-A0JI-O7952',
 'bcr_omf_uuid': '2EB50E2A-F7D5-4CF1-B85D-2935532A495B',
 'form_completion_date': datetime.datetime(2011, 2, 18, 0, 0),
 'malignancy_type': 'Prior Malignancy',
 'other_malignancy_dx_days_to': None,
 'surgery_indicator': None,
 'other_malignancy_surgery_type': None,
 'other_malignancy_surgery_days_to': -731,
 'pharmaceutical_therapy_indicator': 'NO',
 'pharmaceutical_therapy_extent': None,
 'pharmaceutical_therapy_drug_name': None,
 'pharmaceutical_tx_started_days_to': None,
 'radiation_therapy_indicator': 'NO',
 'radiation_therapy_extent': None,
 'history_rt_tx_to_site_of_tcga_tumor': None,
 'radiation_therapy_started_days_to': None,
 'ajcc_staging_edition': None,
 'ajcc_tumor_pathologic_pt': None,
 'ajcc_nodes_pathologic_pn': None,
 'ajcc_metastasis_pathologic_pm': None,
 'ajcc_pathologic_tumor_stage': 'Stage IA',
 'clinical_stage': None,
 'oth

In [1]:
# Root folder that is searched recursively for DICOM (*.dcm) files in the next cell.
base_dir = '/data/Database/Human/TCGA-BRCA_Online/MRI Scans/TCGA-BRCA'


In [14]:
# Recursively list every .dcm file located anywhere below base_dir.
dcm_pattern = f'{base_dir}/**/*.dcm'
files = glob(dcm_pattern, recursive=True)

In [31]:
# Directory of each DICOM file: everything before the final '/' of its path.
# NOTE(review): this yields one entry per file, so a directory that holds several
# files appears several times — confirm that the repetition is intended.
dcm_dirs = [dcm_path.rpartition('/')[0] for dcm_path in files]

['TCGA-OL-A66L']

In [47]:
# Build one record per entry in `dcm_dirs`, pairing the directory with its patient barcode.
# The barcode is read straight from the path: it is the first folder below `base_dir`
# (layout: <base_dir>/<patient barcode>/<study>/<series>). Deriving it here removes the
# dependency on a barcode list that is only defined in a later cell (a NameError on a
# fresh Restart & Run All) and avoids an IndexError when no listed barcode matches.
dcm_items = []
for dcm_dir in dcm_dirs:
    relative_parts = Path(dcm_dir).relative_to(base_dir).parts
    if not relative_parts:
        # The file sits directly in base_dir, so there is no patient folder to read.
        continue
    dcm_items.append(
        dict(bcr_patient_barcode=relative_parts[0], dcm_dir=dcm_dir))


In [54]:
# Spot-check a single record.
# NOTE(review): the index 10000 is arbitrary and raises IndexError when fewer records exist.
dcm_items[10000]

{'bcr_patient_barcode': 'TCGA-OL-A5RU',
 'dcm_dir': '/data/Database/Human/TCGA-BRCA_Online/MRI Scans/TCGA-BRCA/TCGA-OL-A5RU/1.3.6.1.4.1.14519.5.2.1.5382.4002.621050843457519255436782462563/1.3.6.1.4.1.14519.5.2.1.5382.4002.173625326281036136333326088660',
 '_id': ObjectId('60fd367ce1f70bec86d605e5')}

In [55]:
# Bulk-insert the DICOM directory records into the 'dcm_files' collection.
# NOTE(review): re-running this cell presumably inserts the same records again or fails
# on duplicate `_id` values — confirm before re-executing.
db['dcm_files'].insert_many(dcm_items)

<pymongo.results.InsertManyResult at 0x7f14d5b38380>

In [23]:
# Distinct ancestor folders three levels above each DICOM file (file name and the two
# folders directly above it removed); the folder names are used as patient barcodes.
patient_dirs = {'/'.join(file.split('/')[:-3]) for file in files}
bcr_patient_barcode = [Path(patient_dir).name for patient_dir in patient_dirs]

In [43]:
# List every lesion segmentation (.les) file in the segmentation folder.
les_pattern = "/data/TCGA_Segmented_Lesions_UofC/*.les"
les_files = glob(les_pattern)


In [60]:
# Pair every lesion segmentation file with the first known patient barcode that
# occurs in its path.
segmentation_files = []
# Files whose path contains none of the known barcodes. They are collected for
# inspection instead of aborting the whole loop with an IndexError.
unmatched_les_files = []
for les_file in les_files:
    patient_barcode = next(
        (barcode for barcode in bcr_patient_barcode if barcode in les_file),
        None,
    )
    if patient_barcode is None:
        unmatched_les_files.append(les_file)
        continue
    segmentation_files.append(
        dict(bcr_patient_barcode=patient_barcode, segmentation_file=les_file)
    )

if unmatched_les_files:
    print(
        f'{len(unmatched_les_files)} segmentation file(s) without a matching '
        'patient barcode were skipped; see unmatched_les_files'
    )


In [61]:
# Display the barcode/file pairs built above.
# NOTE(review): this prints the entire list; consider slicing (e.g. the first few entries) to keep the output short.
segmentation_files

[{'bcr_patient_barcode': 'TCGA-AO-A12D',
  'segmentation_file': '/data/TCGA_Segmented_Lesions_UofC/TCGA-AO-A12D-S2-1.les'},
 {'bcr_patient_barcode': 'TCGA-BH-A0H7',
  'segmentation_file': '/data/TCGA_Segmented_Lesions_UofC/TCGA-BH-A0H7-1.les'},
 {'bcr_patient_barcode': 'TCGA-E2-A1IJ',
  'segmentation_file': '/data/TCGA_Segmented_Lesions_UofC/TCGA-E2-A1IJ-1.les'},
 {'bcr_patient_barcode': 'TCGA-BH-A0BT',
  'segmentation_file': '/data/TCGA_Segmented_Lesions_UofC/TCGA-BH-A0BT-1.les'},
 {'bcr_patient_barcode': 'TCGA-BH-A0E2',
  'segmentation_file': '/data/TCGA_Segmented_Lesions_UofC/TCGA-BH-A0E2-1.les'},
 {'bcr_patient_barcode': 'TCGA-BH-A0BG',
  'segmentation_file': '/data/TCGA_Segmented_Lesions_UofC/TCGA-BH-A0BG-1.les'},
 {'bcr_patient_barcode': 'TCGA-BH-A0BM',
  'segmentation_file': '/data/TCGA_Segmented_Lesions_UofC/TCGA-BH-A0BM-1.les'},
 {'bcr_patient_barcode': 'TCGA-BH-A0W3',
  'segmentation_file': '/data/TCGA_Segmented_Lesions_UofC/TCGA-BH-A0W3-1.les'},
 {'bcr_patient_barcode': 'TCG

In [62]:
# Bulk-insert the barcode/segmentation-file pairs into the 'segmentation_files' collection.
# NOTE(review): re-running this cell presumably inserts the same records again or fails
# on duplicate `_id` values — confirm before re-executing.
db['segmentation_files'].insert_many(segmentation_files)

<pymongo.results.InsertManyResult at 0x7f14d520e980>

In [71]:
# Create an index on 'bcr_patient_barcode' in every collection of the database.
for col in db.list_collection_names():
    try:
        db[col].create_index('bcr_patient_barcode')
    except Exception as exc:
        # Index creation stays best-effort (remaining collections are still processed),
        # but the failure is reported instead of being silently discarded. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt/SystemExit through.
        print(f'Could not create bcr_patient_barcode index on {col!r}: {exc}')


In [None]:
# Display the list of patient barcodes derived from the DICOM folder names.
# NOTE(review): this prints the entire list; consider slicing it to keep the output short.
bcr_patient_barcode