# [vk sim](https://github.com/pachterlab/varseek) demonstration
Create a simulated RNA-seq dataset containing variants of interest with `varseek sim`.

Written by Joseph Rich.
___

### Install varseek, and import all packages

In [None]:
# Make sure varseek is importable; install it from PyPI when it is missing.
try:
    import varseek as vk
except ImportError:
    print("varseek not found, installing...")
    # IPython shell escape (only valid inside a notebook/IPython session).
    !pip install -U -q varseek

In [None]:
import os

import pandas as pd
import gget
import varseek as vk
from varseek.utils import build_random_genome_read_df

### Define important paths and parameters

In [None]:
#!!! DELETE
# df = pd.read_csv("/Users/joeyrich/Desktop/local/varseek/data/reference/cosmic/CancerMutationCensus_AllData_Tsv_v100_GRCh37_v2/CancerMutationCensus_AllData_v100_GRCh37.tsv", sep="\t", nrows=2)
# df

In [None]:
# ---------------------------------------------------------------------------
# Inputs: everything the demo reads lives under data/reference.
# ---------------------------------------------------------------------------
reference_dir = os.path.join("data", "reference")
# COSMIC Cancer Mutation Census table (v101, GRCh37 build).
variants = os.path.join(
    reference_dir,
    "CancerMutationCensus_AllData_Tsv_v101_GRCh37",
    "CancerMutationCensus_AllData_v101_GRCh37.tsv",
)
# CDS FASTA that the variants are applied to.
sequences_transcriptome = os.path.join(reference_dir, "Homo_sapiens.GRCh37.cds.all.fa")

# ---------------------------------------------------------------------------
# Outputs: combined reads (FASTQ + table) and the annotated variant table.
# ---------------------------------------------------------------------------
reads_fastq_out = os.path.join("data", "synthetic_reads.fastq")
reads_csv_out = os.path.join("data", "synthetic_reads.csv")
variants_updated_csv_out = os.path.join(
    "data",
    "CancerMutationCensus_AllData_v101_GRCh37_with_synthetic_read_info.tsv",
)

# ---------------------------------------------------------------------------
# Simulation settings shared by every vk.sim call later in the notebook.
# ---------------------------------------------------------------------------
# Columns of the variant table holding the sequence ID and the variant itself.
seq_id_column, var_column = "ACCESSION_NUMBER", "Mutation CDS"

# Read geometry and sampling.
k, w = 59, 54
read_length = 150
strand = None
with_replacement = False
seed = 42  # fixed so the demo is reproducible

# Sequencing-noise model.
add_noise = True
error_rate = 0.0001
# NOTE(review): presumably relative weights of the different error types -
# confirm the expected order against the varseek documentation.
error_distribution = (0.85, 0.1, 0.05)
max_errors = float("inf")  # no cap on the number of errors

### Download the reference genome (GRCh37, Ensembl 93, CDS file)

In [None]:
# Only fetch the CDS FASTA when it is not already on disk.
if not os.path.exists(sequences_transcriptome):
    # IPython shell escape to the gget CLI; {reference_dir} is interpolated
    # from the Python variable of the same name.
    !gget ref -w cds -r 93 --out_dir {reference_dir} -d human_grch37

### Download the COSMIC Cancer Mutation Census file

In [None]:
# Fetch the COSMIC Cancer Mutation Census (v101, GRCh37) into reference_dir,
# but only when the expected TSV is not already present.
cosmic_table_present = os.path.exists(variants)
if not cosmic_table_present:
    cosmic_download_options = {
        "grch_version": 37,
        "cosmic_version": 101,
        "out": reference_dir,
        "mutation_class": "cancer",
        "download_cosmic": True,
    }
    # The first positional argument is left as None, as in a pure download call.
    gget.cosmic(None, **cosmic_download_options)

### Print the first few lines of the COSMIC file
Note that our sequence IDs are in the column "ACCESSION_NUMBER", and our variants are in the column "Mutation CDS"

In [None]:
# Preview the first 10 rows of the COSMIC table to confirm the column names.
# The file is a TSV, so the separator must be the tab escape "\t"; the string
# "/t" is a slash followed by "t" and would leave every row in a single column.
df = pd.read_csv(variants, sep="\t", nrows=10)
df

### Create the first set of simulated data: gene PODN, MIN_SIFT_SCORE<0.4, 50 variants selected, 8 reads per alt allele, 2 reads per ref allele --> 50 x (8+2) = 500 reads

In [None]:
# Condition 1: PODN variants with MIN_SIFT_SCORE below 0.4.
# 50 variants x (8 alt reads + 2 ref reads) = 500 reads.
reads_fastq_out_condition1 = os.path.join("data", "synthetic_reads_condition1.fastq")
filters = ['GENE_NAME:equal=PODN', 'MIN_SIFT_SCORE:less_than=0.4']
number_of_variants_to_sample = 50
number_of_reads_per_variant_alt, number_of_reads_per_variant_ref = 8, 2

simulated_df_dict = vk.sim(
    # --- variant table and reference sequences ---
    variants=variants,
    sequences=sequences_transcriptome,
    seq_id_column=seq_id_column,
    var_column=var_column,
    filters=filters,
    # --- how many variants / reads to draw ---
    number_of_variants_to_sample=number_of_variants_to_sample,
    number_of_reads_per_variant_alt=number_of_reads_per_variant_alt,
    number_of_reads_per_variant_ref=number_of_reads_per_variant_ref,
    with_replacement=with_replacement,
    seed=seed,
    # --- read geometry ---
    strand=strand,
    k=k,
    w=w,
    read_length=read_length,
    # --- sequencing-noise model ---
    add_noise=add_noise,
    error_rate=error_rate,
    error_distribution=error_distribution,
    max_errors=max_errors,
    # --- working directories and outputs ---
    reference_out_dir=reference_dir,
    vk_build_out_dir="vk_build_tmp",
    reads_fastq_parent=reads_fastq_out,
    reads_fastq_out=reads_fastq_out_condition1,
    reads_csv_out=reads_csv_out,
    variants_updated_csv_out=variants_updated_csv_out,
)

# Keep both returned tables; the following cells inspect and build on them.
variants_updated_df = simulated_df_dict['variants']
read_df = simulated_df_dict['read_df']


### Inspect the output files

In [None]:
# Show the first rows of the variant table returned by vk.sim.
variants_updated_df.head()

In [None]:
# Show the first rows of the simulated-read table returned by vk.sim.
read_df.head()

### Create the second set of simulated data: non-substitutions, 25 variants selected, 4 reads per alt allele, 6 reads per ref allele --> 25 x (4+6) = 250 reads

In [None]:
# Condition 2: non-substitution variants.
# 25 variants x (4 alt reads + 6 ref reads) = 250 reads.
filters = ['Mutation Description CDS:not_equal=substitution']
number_of_variants_to_sample = 25
number_of_reads_per_variant_alt = 4
number_of_reads_per_variant_ref = 6
reads_fastq_out_condition2 = os.path.join("data", "synthetic_reads_condition2.fastq")

# Presumably skips variants already drawn for condition 1 (flag carried in
# variants_updated_df). Uses the same COLUMN:RULE separator as every other
# filter in this notebook.
# NOTE(review): confirm the separator against the installed varseek version.
filters.append('included_in_synthetic_reads:is_not_true')
simulated_df_dict = vk.sim(
    # Build on the outputs of condition 1 instead of starting from scratch.
    variants=variants_updated_df,
    reads_fastq_parent=reads_fastq_out,
    reads_csv_parent=read_df,
    number_of_variants_to_sample=number_of_variants_to_sample,
    strand=strand,
    number_of_reads_per_variant_alt=number_of_reads_per_variant_alt,
    number_of_reads_per_variant_ref=number_of_reads_per_variant_ref,
    k=k,
    w=w,
    read_length=read_length,
    seed=seed,
    add_noise=add_noise,
    error_rate=error_rate,
    error_distribution=error_distribution,
    max_errors=max_errors,
    with_replacement=with_replacement,
    sequences=sequences_transcriptome,
    seq_id_column=seq_id_column,
    var_column=var_column,
    reference_out_dir=reference_dir,
    vk_build_out_dir="vk_build_tmp",
    filters=filters,
    reads_fastq_out=reads_fastq_out_condition2,
    reads_csv_out=reads_csv_out,
    variants_updated_csv_out=variants_updated_csv_out,
)

# Refresh both tables so later cells do not keep working on (and re-writing
# to reads_csv_out) the stale condition-1 read table.
variants_updated_df, read_df = simulated_df_dict['variants'], simulated_df_dict['read_df']

### Add in random portions from the genome/transcriptome

In [None]:
# How many background reads to draw from each source.
number_of_random_reads_transcriptome = 100
number_of_random_reads_genome = 50

# Genome FASTA used as the source for the random genomic reads.
sequences_genome = os.path.join(reference_dir, "Homo_sapiens.GRCh37.dna.primary_assembly.fa")

# Per-source FASTQ outputs for the random reads.
fastq_output_path_random_genome = os.path.join("data", "synthetic_reads_random_genome.fastq")
fastq_output_path_random_transcriptome = os.path.join("data", "synthetic_reads_random_transcriptome.fastq")

In [None]:
# Only fetch the genome FASTA when it is not already on disk.
if not os.path.exists(sequences_genome):
    # IPython shell escape to the gget CLI; {reference_dir} is interpolated
    # from the Python variable of the same name.
    !gget ref -w dna -r 93 --out_dir {reference_dir} -d human_grch37

In [None]:
# Append random background reads, first from the transcriptome and then from
# the genome. Each entry: (read count, source FASTA, FASTQ output, input type).
random_read_sources = (
    (number_of_random_reads_transcriptome, sequences_transcriptome, fastq_output_path_random_transcriptome, "transcriptome"),
    (number_of_random_reads_genome, sequences_genome, fastq_output_path_random_genome, "genome"),
)
for random_read_count, source_fasta, source_fastq_out, source_type in random_read_sources:
    if random_read_count > 0:
        # read_df is threaded through so each call extends the running table.
        read_df = build_random_genome_read_df(
            reference_fasta_file_path=source_fasta,
            read_df=read_df,
            read_df_out=reads_csv_out,
            fastq_output_path=source_fastq_out,
            fastq_parent_path=reads_fastq_out,
            n=random_read_count,
            read_length=read_length,
            input_type=source_type,
            strand=strand,
            add_noise_sequencing_error=add_noise,
            seed=seed,
        )

In [None]:
# Show the last rows of the read table after the random reads were added.
read_df.tail()