In [1]:
import os
import re
import sys
import argparse
import logging
import subprocess
from tabulate import tabulate
from distutils.version import StrictVersion

from dbclients.tantalus import TantalusApi
from dbclients.colossus import ColossusApi
from dbclients.basicclient import NotFoundError

# Module-level API clients shared by the helper functions defined in this notebook.
tantalus_api = TantalusApi()
colossus_api = ColossusApi()

# Ticket ids are accepted when they end in "SC-" followed by exactly four digits;
# any prefix before "SC-" is allowed by the leading ".*".
# NOTE(review): a ticket with five or more digits would not match — confirm that is intended.
JIRA_TICKET_REGEX = r".*SC-\d{4}$"

def find_latest_analysis_ticket(pool_id):
    '''
    Return the jira ticket of the newest completed analysis of the given library.

    Only analyses whose run status is "complete" or "hmmcopy_complete" are
    considered. Among those, the one with the highest version wins; when two
    analyses share the same version, the one listed later is kept.

    Args:
        pool_id: library pool id used to filter "analysis_information" records.

    Returns:
        The "analysis_jira_ticket" value of the selected analysis, or None when
        the library has no completed analysis.

    Raises:
        ValueError: if two completed analyses must be compared and one of their
            versions cannot be parsed by StrictVersion.
    '''
    analyses = list(colossus_api.list("analysis_information", library__pool_id = pool_id))
    latest_completion_version = None
    analysis_jira_ticket = None

    # Loop over the list of analyses to find the latest completed one.
    for analysis in analyses:
        status = analysis["analysis_run"]["run_status"]
        if status not in ["complete", "hmmcopy_complete"]:
            continue

        # Only completed analyses have their version inspected, so an unusable
        # version on an unfinished run can no longer break the lookup.
        raw_version = analysis["version"]

        # Drop one leading prefix character (the "v" of "v0.2.7") only when the
        # string does not already start with a digit. Slicing unconditionally
        # would turn "0.2.7" into ".2.7" and "10.2.7" into "0.2.7".
        if raw_version[:1].isdigit():
            current_version = raw_version
        else:
            current_version = raw_version[1:]
        current_version = current_version.strip('v')

        # Versions are parsed lazily, only when there is something to compare
        # against, so a single completed analysis is returned even if its
        # version string is not a strict version number.
        if (not latest_completion_version
                or StrictVersion(current_version) >= StrictVersion(latest_completion_version)):
            latest_completion_version = current_version
            analysis_jira_ticket = analysis["analysis_jira_ticket"]

    return analysis_jira_ticket

def check_user_input(intial_question, options):
    '''
    Prompt on stdin until the reply is one of the allowed options.

    Args:
        intial_question: prompt shown for the first attempt.
        options: container of accepted answers (membership is tested with `in`).

    Returns:
        The first reply that is contained in `options`.
    '''
    prompt = intial_question

    while True:
        reply = input(prompt)
        if reply in options:
            return reply
        # Every retry uses the same reminder listing the valid choices.
        prompt = "Invalid input. Options are {}: ".format(options)


def check_user_file_input(user_input):
    '''
    Translate a y/n answer into the kind of files to download.

    Args:
        user_input: "y" to download everything, "n" to pick a single file type
            interactively (case-insensitive).

    Returns:
        "all" when the answer is "y"; otherwise the file type chosen by index
        from the table printed to the user.

    Raises:
        ValueError: if the answer is neither "y" nor "n". Previously this case
            fell through to an unbound `file_index` and raised
            UnboundLocalError.
    '''
    file_choices = ["metrics", "reads", "segments", "plots"]
    answer = user_input.lower()

    if answer == "y":
        return "all"

    if answer != "n":
        raise ValueError("Expected 'y' or 'n', got {!r}".format(user_input))

    # Show the choices with their row index and keep asking until a valid index is typed.
    file_table = tabulate([[f] for f in file_choices], showindex="always")
    file_question = "\n{}\n\nPlease enter the index of the desired file: ".format(file_table)
    file_index = check_user_input(file_question, list(map(str, range(len(file_choices)))))

    selected = file_choices[int(file_index)]
    print("\nPreparing to download {} files...\n".format(selected))
    return selected



In [2]:
# def get_segment(jira_ticket, version, library_id):

#     filenames = []
#     file_resources = list(tantalus_api.list(
#             "file_resource",
#             filename__startswith=jira_ticket,
#             filename__endswith="{}_segments.csv.gz".format(library_id)
#         ))

#     filenames += [f["filename"] for f in file_resources]

#     return filenames


In [3]:
def get_haplotype(jira_ticket, version, library_id):
    '''
    List the allele-count files registered under an analysis ticket.

    Queries "file_resource" records whose filename starts with `jira_ticket`
    and ends with "allele_counts.tsv". `version` and `library_id` are accepted
    but not used by the query.

    Returns:
        A list with the "filename" value of every matching record.
    '''
    matching_resources = tantalus_api.list(
        "file_resource",
        filename__startswith=jira_ticket,
        filename__endswith="allele_counts.tsv"
    )

    return [resource["filename"] for resource in matching_resources]


In [4]:
def get_all_files(jira_ticket, version, library_id):
    '''
    Collect every filename to download for an analysis.

    Currently this is just the haplotype files returned by get_haplotype,
    copied into a fresh list.
    '''
    return list(get_haplotype(jira_ticket, version, library_id))

def main(library_id, download_dir):
    '''
    Download the result files of the latest completed analysis of a library.

    The newest analysis ticket is looked up with find_latest_analysis_ticket,
    the files returned by get_all_files are fetched from the "results"
    container of the "singlecellresults" storage into
    `<download_dir>/<library_id>/<subdir>/`, and any downloaded ".gz" files
    are then removed when empty or unpacked with gunzip.

    Args:
        library_id: library (pool) id whose results should be downloaded.
        download_dir: local directory under which files are written.

    Returns:
        None. Progress and the list of downloaded files are printed.

    Raises:
        ValueError: if the ticket found does not match JIRA_TICKET_REGEX.
            Previously this case fell through to unbound variables and raised
            UnboundLocalError.
        Exception: if the ticket is not known as an analysis ticket.
        subprocess.CalledProcessError: if gunzip fails on a downloaded file.
    '''
    jira_ticket = find_latest_analysis_ticket(library_id)
    if not jira_ticket:
        print("Jira ticket do not exist for library id: {0}. Please double check the analysis".format(library_id))
        return

    print('Latest Jira ticket  is: {0} correspond to library id: {1}'.format(jira_ticket,library_id))

    # Define source and destination storage clients
    from_storage_name = "singlecellresults"
    to_storage_name = download_dir

    # NOTE(review): the storage record itself is not used below; the lookup is
    # kept so the call sequence against the API is unchanged.
    tantalus_api.get_storage(from_storage_name)
    from_storage_client = tantalus_api.get_storage_client(from_storage_name)

    # Reject anything that does not look like a jira ticket before querying,
    # so the failure is explicit instead of an unbound-variable error later.
    if not re.match(JIRA_TICKET_REGEX, jira_ticket):
        raise ValueError("{} is not a valid jira ticket".format(jira_ticket))

    try:
        analysis_object = colossus_api.get("analysis_information", analysis_jira_ticket=jira_ticket)
    except NotFoundError:
        raise Exception("{} is not an analysis ticket".format(jira_ticket))

    analysis_ticket = jira_ticket
    print("{} is a valid analysis ticket\n".format(analysis_ticket))

    version = analysis_object["version"]
    filenames = get_all_files(analysis_ticket, version, library_id)
    library_dir = os.path.join(to_storage_name, library_id)
    downloaded_files = []

    for filename in filenames:
        filepath_parsed = filename.split("/")
        cell_state = filepath_parsed[1]
        analysis_type = filepath_parsed[-2]
        basename = filepath_parsed[-1]
        print("cell state {0}; analysis type: {1};  files: {2}".format(cell_state,analysis_type,basename))

        # This takes care of results from old version of scpipeline
        if analysis_type == "plots":
            if "heatmap" in filename:
                analysis_type = "hmmcopy_autoploidy"
            else:
                analysis_type = "alignment"

        # Accommodates the cell_state_prediction naming/storage convention:
        # those files go to their own folder under a fixed, library-based name.
        if cell_state == "cell_state_prediction":
            subdir = os.path.join(library_dir, cell_state)
            filepath = os.path.join(subdir, "{}_cell_state_prediction.csv".format(library_id))
        else:
            subdir = os.path.join(library_dir, analysis_type)
            filepath = os.path.join(subdir, basename)

        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(subdir, exist_ok=True)

        print("Downloading {} to {}".format(basename, subdir))

        try:
            from_storage_client.blob_service.get_blob_to_path(
                container_name="results",
                blob_name=filename,
                file_path=filepath
            )
            downloaded_files.append(filepath)
        except Exception as error:
            # Best effort: keep going with the remaining files, but report the
            # actual reason and let KeyboardInterrupt/SystemExit propagate.
            print('{} does not exist in blob.'.format(basename))
            print('Download error: {}'.format(error))

    # Clean up dir and unzip: empty archives are deleted, the rest are unpacked.
    for root, dirs, files in os.walk(library_dir):
        for name in files:
            # A literal suffix test; the old ".gz$" regex also matched names
            # such as "foogz" because the dot was not escaped.
            if not name.endswith(".gz"):
                continue

            archive_path = os.path.join(root, name)
            if os.path.getsize(archive_path) == 0:
                os.remove(archive_path)
            else:
                subprocess.check_call(["gunzip", "-f", archive_path])

    print("\n********** Download complete **********\n\n")
    for downloaded in downloaded_files:
        print(downloaded)


In [35]:
# Destination root for downloads; the sample metadata CSV is read from the same folder.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
download_dir = '/home/htran/storage/datasets/hakwoo_metastasis'
# NOTE(review): this import sits mid-notebook; consider moving it to the first (imports) cell.
import pandas as pd
# First row of the CSV is used as the header.
meta_sample = pd.read_csv(os.path.join(download_dir,'SA919X7_metadata_Hoa.csv'),header=0)

# Quick sanity check of the table size.
print(meta_sample.shape)
# Last expression of the cell: display the column names.
meta_sample.columns


(16, 6)


Index(['Sample_ID', 'PDX_ID', 'DLP+_jira_ticket', 'Library ID_DLP+',
       'Results_DLP', 'DLP_analysis'],
      dtype='object')

In [36]:
# Keep only the rows whose 'DLP_analysis' value equals True, then show the new shape.
meta_sample = meta_sample[meta_sample['DLP_analysis']==True]
meta_sample.shape

(14, 6)

In [37]:
# Run the download once per library id left in the filtered metadata table;
# main() prints its own progress for each library.
library_ids = meta_sample['Library ID_DLP+']
for library_id in library_ids:
    main(library_id, download_dir)

Latest Jira ticket  is: SC-3755 correspond to library id: A96200A
SC-3755 is a valid analysis ticket

cell state results; analysis type: sample_SA919X7XB05402;  files: allele_counts.tsv
Downloading allele_counts.tsv to /home/htran/storage/datasets/hakwoo_metastasis/A96200A/sample_SA919X7XB05402

********** Download complete **********


/home/htran/storage/datasets/hakwoo_metastasis/A96200A/sample_SA919X7XB05402/allele_counts.tsv
Latest Jira ticket  is: SC-3756 correspond to library id: A96204B
SC-3756 is a valid analysis ticket

cell state results; analysis type: sample_SA919X7XB05691;  files: allele_counts.tsv
Downloading allele_counts.tsv to /home/htran/storage/datasets/hakwoo_metastasis/A96204B/sample_SA919X7XB05691

********** Download complete **********


/home/htran/storage/datasets/hakwoo_metastasis/A96204B/sample_SA919X7XB05691/allele_counts.tsv
Latest Jira ticket  is: SC-3137 correspond to library id: A96229A
SC-3137 is a valid analysis ticket

cell state results; analysis t