In [None]:
import os
import pysam
import subprocess

In [None]:
# --- Simulation run -----------------------------------------------------------
# Every run-specific input/output lives under this one directory, so pointing
# the notebook at another run only requires editing this single path.
simulation_dir = "/home/jrich/data/varseek_data_fresh/manuscript_worthy/vk_sim_2024dec17_complex_testing"  #!!! update path
synthetic_read_fastq = os.path.join(simulation_dir, "synthetic_reads.fq")
unique_mcrs_df_path = os.path.join(simulation_dir, "unique_mcrs_df.csv")
strelka_output_dir = os.path.join(simulation_dir, "strelka2_simulated_data_dir")

threads = 16  # forwarded to STAR (--runThreadN), Strelka2 (-j) and samtools
read_length = 150  # STAR's --sjdbOverhang is derived from this (read_length - 1)
mutation_source = "cdna"  # "cdna", "cds"

# --- COSMIC reference tables ----------------------------------------------------
cosmic_dir = "/home/jrich/data/varseek_data/reference/cosmic/CancerMutationCensus_AllData_Tsv_v100_GRCh37"
cosmic_tsv = os.path.join(cosmic_dir, "CancerMutationCensus_AllData_v100_GRCh37.tsv")
cosmic_cdna_info_csv = os.path.join(cosmic_dir, "CancerMutationCensus_AllData_v100_GRCh37_mutation_workflow_with_cdna.csv")

# --- Genome reference and STAR directories --------------------------------------
# if these paths don't exist then they will be created
ensembl_dir = "/home/jrich/data/varseek_data/reference/ensembl_grch37_release93"
reference_genome_fasta = os.path.join(ensembl_dir, "Homo_sapiens.GRCh37.dna.primary_assembly.fa")
reference_genome_gtf = os.path.join(ensembl_dir, "Homo_sapiens.GRCh37.87.gtf")
star_genome_dir = os.path.join(ensembl_dir, "star_reference")
star_alignment_dir = os.path.join(simulation_dir, "star_alignment")

# --- Tool locations -------------------------------------------------------------
opt_dir = '/home/jrich/Desktop/variant_calling'
STAR = "/home/jrich/opt/STAR-2.7.11b/source/STAR"
STRELKA_INSTALL_PATH = os.path.join(opt_dir, "strelka-2.9.10.centos6_x86_64")
VARSCAN_INSTALL_PATH = os.path.join(opt_dir, "VarScan.v2.3.9.jar")

In [None]:
# Make sure the STAR index, STAR alignment and tool-install directories exist
# before any later cell writes into them.
for required_dir in (star_genome_dir, star_alignment_dir, opt_dir):
    os.makedirs(required_dir, exist_ok=True)

# STAR prepends this prefix to every output file name; the BAM path below is the
# prefix followed by the "Aligned.sortedByCoord.out.bam" suffix.
out_file_name_prefix = star_alignment_dir + "/sample_"
aligned_and_unmapped_bam = out_file_name_prefix + "Aligned.sortedByCoord.out.bam"

## Download software and reference files

In [None]:
if not os.path.exists(reference_genome_fasta):
    !wget -O {reference_genome_fasta}.gz https://ftp.ensembl.org/pub/grch37/release-93/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz && gunzip {reference_genome_fasta}.gz

if not os.path.exists(reference_genome_gtf):
    !wget -O {reference_genome_gtf}.gz https://ftp.ensembl.org/pub/grch37/release-93/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz && gunzip {reference_genome_gtf}.gz

if not os.path.exists(STRELKA_INSTALL_PATH):
    subprocess.run(['wget', '-O', STRELKA_INSTALL_PATH, 'https://github.com/Illumina/strelka/releases/download/v2.9.10/strelka-2.9.10.centos6_x86_64.tar.bz2'], check=True)
    subprocess.run(['tar', 'xvjf', f'{STRELKA_INSTALL_PATH}.tar.bz2'], check=True)
    # subprocess.run(['bash', os.path.join(STRELKA_INSTALL_PATH, 'bin/runStrelkaSomaticWorkflowDemo.bash')], check=True)
    # subprocess.run(['bash', os.path.join(STRELKA_INSTALL_PATH, 'bin/runStrelkaGermlineWorkflowDemo.bash')], check=True)

if not os.path.exists(VARSCAN_INSTALL_PATH):
    subprocess.run(["wget", "-O", VARSCAN_INSTALL_PATH, "https://sourceforge.net/projects/varscan/files/VarScan.v2.3.9.jar/download"], check=True   )

## Genome alignment with STAR

In [None]:
# Value passed to STAR's --sjdbOverhang in both commands below.
read_length_minus_one = read_length - 1

# Build the STAR genome index from the reference FASTA and GTF, but only when
# star_genome_dir is still empty (any existing content is treated as a finished index).
if not os.listdir(star_genome_dir):
    !$STAR \
        --runThreadN $threads \
        --runMode genomeGenerate \
        --genomeDir $star_genome_dir \
        --genomeFastaFiles $reference_genome_fasta \
        --sjdbGTFfile $reference_genome_gtf \
        --sjdbOverhang $read_length_minus_one

# Align the synthetic reads unless the output BAM already exists. The BAM is
# written coordinate-sorted and keeps unmapped reads in the same file
# (--outSAMunmapped Within), hence the name aligned_and_unmapped_bam.
if not os.path.exists(aligned_and_unmapped_bam):
    !$STAR \
        --runThreadN $threads \
        --genomeDir $star_genome_dir \
        --readFilesIn $synthetic_read_fastq \
        --sjdbOverhang $read_length_minus_one \
        --outFileNamePrefix $out_file_name_prefix \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within \
        --outSAMmapqUnique 60 \
        --twopassMode Basic

## Index reference genome

In [None]:
# Create the FASTA index with pysam unless the ".fai" file is already present.
fasta_index_path = reference_genome_fasta + ".fai"
if not os.path.exists(fasta_index_path):
    _ = pysam.faidx(reference_genome_fasta)

# Strelka2

## Strelka2 variant calling

In [None]:
!${STRELKA_INSTALL_PATH}/bin/configureStrelkaGermlineWorkflow.py \
    --bam $aligned_and_unmapped_bam \
    --referenceFasta $reference_genome_fasta \
    --rna \
    --runDir $strelka_output_dir

# execution on a single local machine with 20 parallel jobs
!$strelka_output_dir/runWorkflow.py -m local -j $threads

# VarScan

## BAM indexing (possibly not needed); sorting is already done by STAR (`--outSAMtype BAM SortedByCoordinate`)

In [None]:
# Directory for VarScan-related outputs, and the samtools pileup file inside it.
varscan_output_dir = "/home/jrich/data/varseek_data_fresh/manuscript_worthy/vk_sim_2024dec17_complex_testing/varscan_simulated_data_dir"
data_pileup_file = os.path.join(varscan_output_dir, "simulated_data.pileup")

In [None]:
!samtools index $aligned_and_unmapped_bam  # can possibly skip samtools sort below due to STAR mode --outSAMtype BAM SortedByCoordinate, but might need this step instead
# !samtools sort --threads $threads -o sorted.bam $aligned_and_unmapped_bam  # stated to be needed by VarScan documentation

## Samtools mpileup

In [None]:
!samtools mpileup -B -f $reference_genome_fasta $aligned_and_unmapped_bam > $data_pileup_file

## VarScan

In [None]:
!java -jar VarScan.v2.2.jar mpileup2snp $data_pileup_file

# Merging into COSMIC