In [3]:
import os
from varseek.utils import make_entex_df, download_entex_fastq_links
# Repository root = parent of the directory the notebook is running from
# (e.g. if this notebook resides in varseek/notebooks/0_data_download.ipynb,
# this resolves to varseek).
notebook_dir = os.path.abspath("")
data_directory = os.path.dirname(notebook_dir)

# Every sequencing download in this notebook is placed under <root>/data/sequencing.
sequencing_data_download_base = os.path.join(data_directory, "data", "sequencing")
os.makedirs(sequencing_data_download_base, exist_ok=True)

# Bulk, CCLE - run script

# Bulk, Healthy Entex - see varseek/scripts/download_entex_data.py for script version

In [None]:
# Download bulk ENTEx FASTQs for one tissue into <sequencing root>/bulk/entex.
entex_data_download_base = os.path.join(sequencing_data_download_base, "bulk", "entex")
os.makedirs(entex_data_download_base, exist_ok=True)

# Build the ENTEx metadata table first so the tissue list below can be inspected.
entex_df = make_entex_df()
tissue = "upper lobe of left lung"  # print(sorted(set(entex_df['tissue']), key=str.lower))  # uncomment to see the list of tissue selections

# Pass the ENTEx-specific directory (previously computed but never used) rather
# than the generic sequencing root, so these files stay separate from the
# single-cell downloads.
# NOTE(review): assumes download_entex_fastq_links writes beneath
# data_download_base without adding its own "bulk/entex" prefix - confirm.
download_entex_fastq_links(entex_df, tissue=tissue, data_download_base=entex_data_download_base)

# scRNA-seq, illumina, cancer (melanoma)

In [None]:
melanoma_dir = os.path.join(sequencing_data_download_base, "sc/melanoma_10x")
os.makedirs(melanoma_dir, exist_ok=True)

!cd {melanoma_dir} && !curl -O https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/8.0.0/10k_Human_DTC_Melanoma_3p_nextgem_Multiplex/10k_Human_DTC_Melanoma_3p_nextgem_Multiplex_fastqs.tar && tar -xvf 10k_Human_DTC_Melanoma_3p_nextgem_Multiplex_fastqs.tar

# scRNA-seq, illumina, cancer (NSCLC)

In [None]:
# https://www.10xgenomics.com/datasets/40-k-mixture-of-nsclc-dt-cs-from-7-donors-3-ht-v-3-1-3-1-high-6-1-0
# Destination directory for the NSCLC FASTQs, created under the shared
# sequencing download root.
nsclc_dir = os.path.join(sequencing_data_download_base, "sc/nsclc_10x_40k")
os.makedirs(nsclc_dir, exist_ok=True)
# Shell step (IPython "!"): cd into the directory, download the FASTQ tarball
# with curl, then extract it; "&&" stops the chain at the first failing command.
!cd {nsclc_dir} && curl -O https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/6.1.0/40k_NSCLC_DTC_3p_HT_nextgem_Multiplex/40k_NSCLC_DTC_3p_HT_nextgem_Multiplex_fastqs.tar && tar -xvf 40k_NSCLC_DTC_3p_HT_nextgem_Multiplex_fastqs.tar

# scRNA-seq, illumina, healthy

In [None]:
# https://www.10xgenomics.com/datasets/10k-human-pbmcs-3-v3-1-chromium-x-without-introns-3-1-high
# Destination directory for the PBMC FASTQs, created under the shared
# sequencing download root.
pbmc_dir = os.path.join(sequencing_data_download_base, "sc/pbmc_10x")
os.makedirs(pbmc_dir, exist_ok=True)
# Shell step (IPython "!"): cd into the directory, download the FASTQ tarball
# with curl, then extract it; "&&" stops the chain at the first failing command.
!cd {pbmc_dir} && curl -O https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/6.1.2/10k_PBMC_3p_nextgem_Chromium_X/10k_PBMC_3p_nextgem_Chromium_X_fastqs.tar && tar -xvf 10k_PBMC_3p_nextgem_Chromium_X_fastqs.tar

# scRNA-seq, SMART-Seq, colorectal cancer CTCs

In [None]:
#* Download SRR_Acc_List.txt metadata file from Kozuka et al. (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8507666/) on SRA (https://www.ncbi.nlm.nih.gov/sra/?term=PRJNA759644) with sra-tools (https://github.com/ncbi/sra-tools)

# Optional: kb ref on T2T reference genome

In [None]:
import subprocess
from varseek.utils import download_t2t_reference_files

# Build a `kb ref` index for the T2T (GCF_009914755.1) reference, skipping the
# work when the index and t2g outputs already exist on disk.
reference_out_dir = os.path.join(data_directory, "data", "reference")
threads = 4  # passed to `kb ref -t`
k_standard = 31  # passed to `kb ref -k`

# Output locations handed to `kb ref` (-i index, -g t2g, -f1 fasta).
t2t_folder = os.path.join(reference_out_dir, "T2T", "GCF_009914755.1")
t2t_kb_folder = os.path.join(t2t_folder, "kb_index")
standard_index = os.path.join(t2t_kb_folder, "index.idx")
standard_t2g = os.path.join(t2t_kb_folder, "t2g.txt")
standard_f1 = os.path.join(t2t_kb_folder, "f1.fa")

# Rebuild only when either the index or the t2g file is missing.
if not (os.path.exists(standard_index) and os.path.exists(standard_t2g)):
    os.makedirs(t2t_kb_folder, exist_ok=True)
    t2t_genome = os.path.join(t2t_folder, "GCF_009914755.1_T2T-CHM13v2.0_genomic.fna")
    t2t_gtf = os.path.join(t2t_folder, "genomic.gtf")

    # Fetch the genome/GTF only if they are not already present; the helper's
    # returned paths replace the expected ones above.
    if not (os.path.exists(t2t_genome) and os.path.exists(t2t_gtf)):
        os.makedirs(t2t_folder, exist_ok=True)
        t2t_genome, t2t_cdna, t2t_gtf = download_t2t_reference_files(t2t_folder)

    # Pass an argv list without a shell so paths containing spaces or shell
    # metacharacters reach kb unchanged; check=True raises CalledProcessError
    # on a non-zero exit status.
    kb_ref_standard_command = [
        "kb", "ref",
        "-k", str(k_standard),
        "-i", standard_index,
        "-g", standard_t2g,
        "-f1", standard_f1,
        "-t", str(threads),
        str(t2t_genome),
        str(t2t_gtf),
    ]
    result = subprocess.run(kb_ref_standard_command, check=True)