In [3]:
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import concurrent.futures
import pdb
# Assumes an sbg credentials file is present; replace 'turbo' with your own profile name.
config = sbg.Config(profile='turbo')
# Shared API client used by every cell below, built with the two error handlers imported above.
api = sbg.Api(
    config=config,
    error_handlers=[rate_limit_sleeper, maintenance_sleeper],
)

In [4]:
def _record_file_metadata(samp_id, fobj):
    """Store one file's metadata under ``metadata[samp_id]`` and return that entry.

    Every metadata key of ``fobj`` that is not yet in the global ``head_list``
    is appended to it, and the file's ``id`` and ``name`` are added to the entry.
    """
    entry = {}
    for key in fobj.metadata:
        if key not in head_list:
            head_list.append(key)
        entry[key] = fobj.metadata[key]
    entry['id'] = fobj.id
    entry['name'] = fobj.name
    # Publish the entry only once it is fully built, so other threads never
    # observe a half-filled record.
    metadata[samp_id] = entry
    return entry


def populate_metadata(samp_id):
    """Find the matching file for one sample and record its metadata.

    Reads the globals ``api``, ``project``, ``suffix`` and ``mod_date`` and
    writes to the globals ``metadata`` and ``head_list``.

    A file matches when ``suffix`` occurs in its name and ``mod_date`` occurs
    in the string form of its ``created_on``. The sample is first looked up by
    ``sample_id`` as given (files whose ``Composition`` metadata is
    "Derived Cell Line" are skipped), then by ``sample_id`` + "-T.WGS".

    Args:
        samp_id: Sample ID string; a trailing newline is stripped.

    Returns:
        0 when a file was recorded, otherwise the stripped sample ID.

    Raises:
        Exception: any error from the lookup is logged to stderr and re-raised,
            so the caller sees it through ``Future.result()``.
    """
    try:
        samp_id = samp_id.rstrip('\n')
        related = api.files.query(project=project, metadata={'sample_id': samp_id}).all()
        for fobj in related:
            if suffix in fobj.name and mod_date in str(fobj.created_on):
                # NOTE(review): relies on indexing a missing 'Composition' key
                # not raising - confirm against the sevenbridges Metadata type.
                if fobj.metadata['Composition'] == "Derived Cell Line":
                    continue
                _record_file_metadata(samp_id, fobj)
                return 0

        # Some files still seem to carry the old "-T.WGS" suffix on sample_id.
        legacy_id = samp_id + "-T.WGS"
        related = api.files.query(project=project, metadata={'sample_id': legacy_id}).all()
        for fobj in related:
            if suffix in fobj.name and mod_date in str(fobj.created_on):
                entry = _record_file_metadata(samp_id, fobj)
                # Set to the ID style used in the paper.
                entry['sample_id'] = samp_id
                return 0

        return samp_id
    except Exception as e:
        sys.stderr.write(str(e) + "\nDoes file exist for " + samp_id + "?\n")
        # Re-raise rather than exit(1): this runs in a thread-pool worker, where
        # the interactive exit helper either hides the real error behind
        # SystemExit or (in a notebook) asks the kernel to shut down.
        raise


## Get MAF manifest

In [5]:
# Inputs for the MAF lookup; populate_metadata reads these names as globals.
project = "kfdrc-harmonization/sd-bhjxbdqk-08"
# NOTE(review): absolute local path - adjust for your machine.
# Read every sample ID up front so the file handle is closed before the
# thread pool starts (one ID per line; the trailing newline is stripped by
# populate_metadata).
with open("/Users/brownm28/Documents/PORTAL_LOADS/2020-Oct-5_proteomics_pub/load_build/mut_sample_list.txt") as sample_file:
    samp_id_list = sample_file.readlines()
metadata = {}
suffix = "strelka2.vep.maf"
mod_date = "2019-10"
head_list = ['id', 'name']

x = 1   # ends at 1 + the number of samples catalogued
n = 25  # progress interval; progress reporting is currently disabled in this cell
with concurrent.futures.ThreadPoolExecutor(8) as executor:
    results = {executor.submit(populate_metadata, samp_id): samp_id for samp_id in samp_id_list}
    for result in concurrent.futures.as_completed(results):
        # Fetch the outcome once; result() re-raises any exception from the worker.
        outcome = result.result()
        if outcome == 0:
            x += 1
        else:
            # populate_metadata returns the sample ID when no file matched.
            sys.stderr.write(outcome + "\tnot found\n")
# For serial debugging, call populate_metadata(samp_id) in a plain loop over
# samp_id_list instead of using the executor.

In [6]:
# Write one tab-separated row per catalogued sample, with columns in head_list order.
# NOTE(review): absolute local path - adjust for your machine.
# The with-block guarantees the manifest is flushed and closed even if a write fails.
with open("/Users/brownm28/Documents/PORTAL_LOADS/2020-Oct-5_proteomics_pub/load_build/maf_manifest.tsv", "w") as maf_manifest:
    maf_manifest.write("\t".join(head_list) + "\n")
    for samp_id in metadata:
        sample_meta = metadata[samp_id]
        # Keys a sample does not have are written as the literal "MISSING".
        row = [str(sample_meta.get(key, "MISSING")) for key in head_list]
        maf_manifest.write("\t".join(row) + "\n")

## Get CNVkit file manifest
Uses the MAF file manifest written above to look up the matching CNVkit files.

In [2]:
def search_tasks(task_obj):
    """Match one task to a sample in the global ``metadata`` by its name.

    The last whitespace-separated token of ``task_obj.name`` is used as the
    lookup key into ``metadata``.

    Args:
        task_obj: Task object exposing ``name``, ``id`` and ``outputs``.

    Returns:
        ``(key, file_id, file_name)`` for the task's ``"cnvkit_calls"`` output
        when the key is in ``metadata``; otherwise 0 (including when the task
        name is empty).

    Raises:
        Exception: any error is logged to stderr and re-raised, so the caller
            sees it through ``Future.result()``.
    """
    try:
        parts = task_obj.name.split()
        # An empty name cannot reference a sample; treat it as "no match".
        if not parts or parts[-1] not in metadata:
            return 0
        calls = task_obj.outputs["cnvkit_calls"]
        return parts[-1], calls.id, calls.name
    except Exception as e:
        sys.stderr.write(str(e) + "\nError processing " + task_obj.id + "\n")
        # Re-raise rather than exit(1): this runs in a thread-pool worker, where
        # the interactive exit helper either hides the real error behind
        # SystemExit or (in a notebook) asks the kernel to shut down.
        raise
    

In [4]:
# Build the CNVkit manifest from the MAF manifest written earlier.
# NOTE: this cell rebinds `project` and `metadata`; from here on `metadata`
# maps the "Kids First Biospecimen ID Tumor" value to the full manifest row (a list).
project = "kfdrc-harmonization/sd-bhjxbdqk-12"
metadata = {}
# NOTE(review): absolute local paths - adjust for your machine.
with open("/Users/brownm28/Documents/PORTAL_LOADS/2020-Oct-5_proteomics_pub/load_build/maf_manifest.tsv") as maf_manifest:
    head = next(maf_manifest)
    header = head.rstrip('\n').split('\t')
    kf_id = header.index("Kids First Biospecimen ID Tumor")
    for line in maf_manifest:
        info = line.rstrip('\n').split('\t')
        metadata[info[kf_id]] = info

tasks = api.tasks.query(status="COMPLETED", project=project).all()
x = 1   # ends at 1 + the number of matching tasks
n = 25  # report progress every n matches
# The with-block guarantees the manifest is flushed and closed even if a worker raises.
with open("/Users/brownm28/Documents/PORTAL_LOADS/2020-Oct-5_proteomics_pub/load_build/cnvkit_manifest.tsv", "w") as cnv_manifest:
    cnv_manifest.write("\t".join(header) + "\n")
    with concurrent.futures.ThreadPoolExecutor(8) as executor:
        results = {executor.submit(search_tasks, task): task for task in tasks}
        for result in concurrent.futures.as_completed(results):
            # Fetch the outcome once; result() re-raises any exception from the worker.
            outcome = result.result()
            if outcome != 0:
                if x % n == 0:
                    sys.stderr.write(str(x) + ' samples found\n')
                x += 1
                sample_key, file_id, file_name = outcome
                # Copy the row so the shared metadata table is not overwritten in place.
                data = list(metadata[sample_key])
                # Columns 0 and 1 are 'id' and 'name' in the manifest written by the earlier cell.
                data[0] = file_id
                data[1] = file_name
                cnv_manifest.write("\t".join(data) + "\n")
            


25 samples found
50 samples found
75 samples found
100 samples found
125 samples found
150 samples found
175 samples found
200 samples found
