# [vk sim](https://github.com/pachterlab/varseek) demonstration
Create a simulated RNA-seq dataset with variants of interest using `varseek sim`.

Written by Joseph Rich.
___

### Install varseek, and import all packages

In [1]:
try:
    import varseek as vk
except ImportError:
    print("varseek not found, installing...")
    !pip install -U -q varseek

In [2]:
import os

import pandas as pd
import gget
import varseek as vk
from varseek.utils import build_random_genome_read_df

### Define important paths and parameters

In [None]:
# --- input paths ---
reference_dir = os.path.join("data", "reference")
# COSMIC Cancer Mutation Census table
variants = os.path.join(
    reference_dir,
    "CancerMutationCensus_AllData_Tsv_v101_GRCh37",
    "CancerMutationCensus_AllData_v101_GRCh37.tsv",
)
# Ensembl GRCh37 CDS FASTA
sequences_transcriptome = os.path.join(reference_dir, "Homo_sapiens.GRCh37.cds.all.fa")

# --- output paths (everything is written under vk_sim_out) ---
vk_sim_out = os.path.join("data", "vk_sim_out")
reads_fastq_out = os.path.join(vk_sim_out, "synthetic_reads.fastq")
reads_csv_out = os.path.join(vk_sim_out, "synthetic_reads.csv")
variants_updated_csv_out = os.path.join(
    vk_sim_out,
    "CancerMutationCensus_AllData_v101_GRCh37_with_synthetic_read_info.tsv",
)

# --- simulation parameters (forwarded unchanged to vk sim in the cells below) ---
strand = None
k, w = 59, 54
read_length = 150
seed = 42  # fixed seed so that the sampled variants/reads are reproducible

# sequencing-noise settings
add_noise = True
error_rate = 0.0001
error_distribution = (0.85, 0.1, 0.05)
max_errors = float("inf")

with_replacement = False

# column names in the variants table
seq_id_column = "ACCESSION_NUMBER_VERSIONLESS"
var_column = "Mutation CDS"
header_column = "header"
variant_type_column = "Mutation_Description_CDS"

### Download the reference genome (GRCh37, Ensembl 93, CDS file)

In [4]:
# Fetch the GRCh37 CDS FASTA (Ensembl release 93) with gget, but only when the
# uncompressed file is not already on disk.
if not os.path.exists(sequences_transcriptome):
    !gget ref -w cds -r 93 --out_dir {reference_dir} -d human_grch37
    # The download is a .gz archive; decompress it so that sequences_transcriptome exists.
    # NOTE: if the download is interrupted, the archive is truncated and gunzip fails.
    !gunzip {sequences_transcriptome}.gz

18:07:31 - INFO - Fetching reference information for homo_sapiens from Ensembl release: 93.
{
    "homo_sapiens": {
        "coding_seq_cds": {
            "ftp": "http://ftp.ensembl.org/pub/grch37/release-93/fasta/homo_sapiens/cds/Homo_sapiens.GRCh37.cds.all.fa.gz",
            "ensembl_release": 93,
            "release_date": "2015-11-27",
            "release_time": "20:17",
            "bytes": "19M"
        }
    }
}
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0 19.0M    0  153k    0     0   185k      0  0:01:45 --:--:--  0:01:45  185k^C
gunzip: data/reference/Homo_sapiens.GRCh37.cds.all.fa.gz: unexpected end of file
gunzip: data/reference/Homo_sapiens.GRCh37.cds.all.fa.gz: uncompress failed


### Download the COSMIC Cancer Mutation Census file

In [13]:
# Download the COSMIC Cancer Mutation Census (v101, GRCh37) into reference_dir,
# skipping the download when the TSV is already present.
if not os.path.exists(variants):
    gget.cosmic(
        None,  # first positional argument is left as None for this download call
        download_cosmic=True,
        mutation_class="cancer",
        cosmic_version=101,
        grch_version=37,
        out=reference_dir,
    )

17:30:39 - INFO - NOTE: Licence fees apply for the commercial use of COSMIC.
17:30:59 - INFO - Downloading data...
17:31:20 - INFO - Extracted tar file to data/reference/CancerMutationCensus_AllData_Tsv_v101_GRCh37
17:31:24 - INFO - Unzipped file to data/reference/CancerMutationCensus_AllData_Tsv_v101_GRCh37/CancerMutationCensus_AllData_v101_GRCh37.tsv
17:31:24 - INFO - Creating modified mutations file for use with gget mutate...
17:31:51 - INFO - Modified mutations file for use with gget mutate created at data/reference/CancerMutationCensus_AllData_Tsv_v101_GRCh37/CancerMutationCensus_AllData_v101_GRCh37_mutation_workflow.csv


### Print the first few lines of the COSMIC file
Note that our sequence IDs are in the column "ACCESSION_NUMBER", and our variants are in the column "Mutation CDS".

In [6]:
# Preview only: load just the first five rows of the tab-separated COSMIC table.
df = pd.read_csv(
    variants,
    sep="\t",
    nrows=5,
)
df

Unnamed: 0,GENE_NAME,ACCESSION_NUMBER,ONC_TSG,CGC_TIER,MUTATION_URL,LEGACY_MUTATION_ID,Mutation CDS,Mutation AA,AA_MUT_START,AA_MUT_STOP,SHARED_AA,GENOMIC_WT_ALLELE_SEQ,GENOMIC_MUT_ALLELE_SEQ,AA_WT_ALLELE_SEQ,AA_MUT_ALLELE_SEQ,Mutation Description CDS,Mutation Description AA,ONTOLOGY_MUTATION_CODE,GENOMIC_MUTATION_ID,Mutation genome position GRCh37,Mutation genome position GRCh38,COSMIC_SAMPLE_TESTED,COSMIC_SAMPLE_MUTATED,DISEASE,WGS_DISEASE,EXAC_AF,EXAC_AFR_AF,EXAC_AMR_AF,EXAC_ADJ_AF,EXAC_EAS_AF,EXAC_FIN_AF,EXAC_NFE_AF,EXAC_SAS_AF,GNOMAD_EXOMES_AF,GNOMAD_EXOMES_AFR_AF,GNOMAD_EXOMES_AMR_AF,GNOMAD_EXOMES_ASJ_AF,GNOMAD_EXOMES_EAS_AF,GNOMAD_EXOMES_FIN_AF,GNOMAD_EXOMES_NFE_AF,GNOMAD_EXOMES_SAS_AF,GNOMAD_GENOMES_AF,GNOMAD_GENOMES_AFR_AF,GNOMAD_GENOMES_AMI_AF,GNOMAD_GENOMES_AMR_AF,GNOMAD_GENOMES_ASJ_AF,GNOMAD_GENOMES_EAS_AF,GNOMAD_GENOMES_FIN_AF,GNOMAD_GENOMES_MID_AF,GNOMAD_GENOMES_NFE_AF,GNOMAD_GENOMES_SAS_AF,CLINVAR_CLNSIG,CLINVAR_TRAIT,GERP++_RS,MIN_SIFT_SCORE,MIN_SIFT_PRED,DNDS_DISEASE_QVAL_SIG,MUTATION_SIGNIFICANCE_TIER,ACCESSION_NUMBER_VERSIONLESS
0,PODN,ENST00000312553.5,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM4577438,c.1751C>T,p.A584V,584,584,2,C,T,A,V,Substitution,Substitution - Missense,SO:0001583,COSV57014428,1:53546494-53546494,1:53080822-53080822,45922,5,,,,,,,,,,,8e-06,6.2e-05,0.0,0.0,0.0,0.0,9e-06,0.0,3.9e-05,4.8e-05,0.0,6.5e-05,0.0,0.0,0.0,0.0,4.4e-05,0.0,,,3.47,0.292,T,,Other,ENST00000312553
1,PODN,ENST00000312553.5,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM4397856,c.1496G>A,p.R499Q,499,499,1,G,A,R,Q,Substitution,Substitution - Missense,SO:0001583,COSV57013127,1:53544534-53544534,1:53078862-53078862,45922,6,,,3.3e-05,0.0004,0.0,3.6e-05,0.0,0.0,0.0,0.0,2.4e-05,0.0002,0.0,0.0,5.5e-05,0.0,9e-06,0.0,3.9e-05,7.2e-05,0.0,0.0,0.0,0.0002,0.0,0.0,2.9e-05,0.0,,,2.75,0.109,T,,Other,ENST00000312553
2,PODN,ENST00000312553.5,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM9234519,c.934C>A,p.L312M,312,312,2,C,A,L,M,Substitution,Substitution - Missense,SO:0001583,COSV104396799,1:53543408-53543408,1:53077736-53077736,45922,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.94,0.003,D,,Other,ENST00000312553
3,PODN,ENST00000312553.5,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM6063694,c.1423C>T,p.R475C,475,475,2,C,T,R,C,Substitution,Substitution - Missense,SO:0001583,COSV57016857,1:53544461-53544461,1:53078789-53078789,45922,2,,,2.5e-05,0.0002,8.7e-05,2.5e-05,0.0,0.0,0.0,0.0,2.4e-05,0.0002,2.9e-05,0.0,0.0,0.0,1.8e-05,0.0,7e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5e-05,0.0,,,4.81,0.027,D,,Other,ENST00000312553
4,PODN,ENST00000312553.5,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM4390121,c.1282C>A,p.R428S,428,428,3,C,A,R,S,Substitution,Substitution - Missense,SO:0001583,COSV57012790,1:53544320-53544320,1:53078648-53078648,45922,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.81,0.012,D,,Other,ENST00000312553


### Strip the version number from ACCESSION_NUMBER, create a header column for the variants, and rename "Mutation Description CDS" to "Mutation_Description_CDS" before passing the file into vk sim

In [7]:
# Columns this notebook passes to vk sim (seq_id_column, header_column,
# variant_type_column) that the raw COSMIC table does not contain.
required_columns = ("ACCESSION_NUMBER_VERSIONLESS", "header", "Mutation_Description_CDS")

# `df` currently holds the 5-row preview; its columns tell us whether the file on
# disk has already been prepared, so this cell is a no-op on later runs.
if any(column not in df.columns for column in required_columns):
    # low_memory=False parses each column in a single pass, which avoids the
    # mixed-dtype DtypeWarning pandas otherwise emits for this large table.
    df = pd.read_csv(variants, sep="\t", low_memory=False)
    if "ACCESSION_NUMBER_VERSIONLESS" not in df.columns:
        # e.g. ENST00000312553.5 -> ENST00000312553
        df["ACCESSION_NUMBER_VERSIONLESS"] = df["ACCESSION_NUMBER"].str.split(".").str[0]
    if "header" not in df.columns:
        # e.g. ENST00000312553:c.1751C>T
        df["header"] = df["ACCESSION_NUMBER_VERSIONLESS"] + ":" + df["Mutation CDS"]
    if "Mutation_Description_CDS" not in df.columns:
        # replace the spaces in the column name with underscores
        df = df.rename(columns={"Mutation Description CDS": "Mutation_Description_CDS"})
    # NOTE: this overwrites the downloaded COSMIC file in place.
    df.to_csv(variants, sep="\t", index=False)
    del df  # release the full table before re-reading the small preview

    df = pd.read_csv(variants, sep="\t", nrows=5)

  df = pd.read_csv(variants, sep="\t")


### Create the first set of simulated data: gene PODN, MIN_SIFT_SCORE<0.4, 50 variants selected, 8 reads per alt allele, 2 reads per ref allele --> 50 x (8+2) = 500 reads

In [6]:
# Condition 1: variants in gene PODN with MIN_SIFT_SCORE < 0.4.
# Each filter string has the form "COLUMN:rule=value".
filters = ['GENE_NAME:equal=PODN', 'MIN_SIFT_SCORE:less_than=0.4']
number_of_variants_to_sample = 50
number_of_reads_per_variant_alt = 8  # reads per alt allele
number_of_reads_per_variant_ref = 2  # reads per ref allele
reads_fastq_out_condition1 = os.path.join(vk_sim_out, "synthetic_reads_condition1.fastq")

# Use the top-level vk.sim entry point (as the second simulation cell does) instead of
# reaching into the vk.varseek_sim submodule.
simulated_df_dict = vk.sim(
    variants=variants,
    number_of_variants_to_sample=number_of_variants_to_sample,
    strand=strand,
    number_of_reads_per_variant_alt=number_of_reads_per_variant_alt,
    number_of_reads_per_variant_ref=number_of_reads_per_variant_ref,
    k=k,
    w=w,
    read_length=read_length,
    seed=seed,
    add_noise_sequencing_error=add_noise,
    error_rate=error_rate,
    error_distribution=error_distribution,
    max_errors=max_errors,
    with_replacement=with_replacement,
    sequences=sequences_transcriptome,
    seq_id_column=seq_id_column,
    var_column=var_column,
    header_column=header_column,
    variant_type_column=variant_type_column,
    reference_out_dir=reference_dir,
    filters=filters,
    reads_fastq_parent=reads_fastq_out,
    reads_fastq_out=reads_fastq_out_condition1,
    reads_csv_out=reads_csv_out,
    variants_updated_csv_out=variants_updated_csv_out,
    out=vk_sim_out,
)

# The result is a dict; keep the updated variant table and the per-read table,
# both of which are fed into the next simulation.
variants_updated_df, read_df = simulated_df_dict['variants'], simulated_df_dict['read_df']


  variants = pd.read_csv(variants, sep="\t")
11:36:08 - INFO - cannot find mutant sequence read parent
11:36:08 - INFO - running varseek build
11:36:08 - INFO - Using COSMIC email from COSMIC_EMAIL environment variable: jmrich@caltech.edu
11:36:08 - INFO - Using COSMIC password from COSMIC_PASSWORD environment variable
11:36:55 - INFO - Using the seq_id_column:var_column 'ACCESSION_NUMBER_VERSIONLESS:Mutation CDS' columns as the variant header column.
11:40:42 - INFO - Removed 17920 variant kmers with length less than 150...
        5322688 variants correctly recorded (99.58%)
        22192 variants removed (0.42%)
          0 variants missing seq_id or var_column (0.000%)
          86 entries removed due to having a duplicate entry (0.002%)
          0 variants with seq_ID not found in sequences (0.000%)
          325 intronic variants found (0.006%)
          0 posttranslational region variants found (0.000%)
          0 unknown variants found (0.000%)
          0 variants with uncer

Updated variant info has been saved to ./sim_data_df.csv


11:43:48 - INFO - FASTA file containing VCRSs created at ./vcrs.fa.
11:43:48 - INFO - t2g file containing VCRSs created at ./vcrs_t2g.txt.
11:43:48 - INFO - Total runtime for vk build: 7m, 40.30s
  sim_data_df = pd.read_csv(update_df_out)
11:47:01 - INFO - Initial variant report
11:47:01 - INFO - Number of total variants: 5344880; VCRSs: 5344880; unique variants: 5344880; merged variants: 0

11:47:01 - INFO - wt_sequence_read_parent is_not_null None
11:47:53 - INFO - Number of total variants: 5322770 (22110 filtered); VCRSs: 5322770 (22110 filtered); unique variants: 5322770 (22110 filtered); merged variants: 0 (0 filtered)

11:47:53 - INFO - mutant_sequence_read_parent is_not_null None
11:48:31 - INFO - Number of total variants: 5322770 (0 filtered); VCRSs: 5322770 (0 filtered); unique variants: 5322770 (0 filtered); merged variants: 0 (0 filtered)

11:48:31 - INFO - GENE_NAME equal PODN
11:48:36 - INFO - Number of total variants: 419 (5322351 filtered); VCRSs: 419 (5322351 filtered);

### Inspect the output files

In [7]:
# First five rows of the variant table returned by vk sim
variants_updated_df.head(n=5)

Unnamed: 0,GENE_NAME,ACCESSION_NUMBER,ONC_TSG,CGC_TIER,MUTATION_URL,LEGACY_MUTATION_ID,Mutation CDS,Mutation AA,AA_MUT_START,AA_MUT_STOP,SHARED_AA,GENOMIC_WT_ALLELE_SEQ,GENOMIC_MUT_ALLELE_SEQ,AA_WT_ALLELE_SEQ,AA_MUT_ALLELE_SEQ,Mutation Description CDS,Mutation Description AA,ONTOLOGY_MUTATION_CODE,GENOMIC_MUTATION_ID,Mutation genome position GRCh37,Mutation genome position GRCh38,COSMIC_SAMPLE_TESTED,COSMIC_SAMPLE_MUTATED,DISEASE,WGS_DISEASE,EXAC_AF,EXAC_AFR_AF,EXAC_AMR_AF,EXAC_ADJ_AF,EXAC_EAS_AF,EXAC_FIN_AF,EXAC_NFE_AF,EXAC_SAS_AF,GNOMAD_EXOMES_AF,GNOMAD_EXOMES_AFR_AF,GNOMAD_EXOMES_AMR_AF,GNOMAD_EXOMES_ASJ_AF,GNOMAD_EXOMES_EAS_AF,GNOMAD_EXOMES_FIN_AF,GNOMAD_EXOMES_NFE_AF,GNOMAD_EXOMES_SAS_AF,GNOMAD_GENOMES_AF,GNOMAD_GENOMES_AFR_AF,GNOMAD_GENOMES_AMI_AF,GNOMAD_GENOMES_AMR_AF,GNOMAD_GENOMES_ASJ_AF,GNOMAD_GENOMES_EAS_AF,GNOMAD_GENOMES_FIN_AF,GNOMAD_GENOMES_MID_AF,GNOMAD_GENOMES_NFE_AF,GNOMAD_GENOMES_SAS_AF,CLINVAR_CLNSIG,CLINVAR_TRAIT,GERP++_RS,MIN_SIFT_SCORE,MIN_SIFT_PRED,DNDS_DISEASE_QVAL_SIG,MUTATION_SIGNIFICANCE_TIER,ACCESSION_NUMBER_VERSIONLESS,header,original_order,mutant_sequence_read_parent,mutant_sequence_read_parent_rc,mutant_sequence_read_parent_length,wt_sequence_read_parent,wt_sequence_read_parent_rc,wt_sequence_read_parent_length,semicolon_count,included_in_synthetic_reads,included_in_synthetic_reads_wt,included_in_synthetic_reads_mutant,list_of_read_starting_indices_wt,list_of_read_starting_indices_mutant,number_of_reads_wt,number_of_reads_mutant,any_noisy_reads_wt,noisy_read_indices_wt,any_noisy_reads_mutant,noisy_read_indices_mutant,any_noisy_reads,tumor_purity
0,PODN,ENST00000312553.5,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM4577438,c.1751C>T,p.A584V,584,584,2,C,T,A,V,Substitution,Substitution - Missense,SO:0001583,COSV57014428,1:53546494-53546494,1:53080822-53080822,45922,5,,,,,,,,,,,8e-06,6.2e-05,0.0,0.0,0.0,0.0,9e-06,0.0,3.9e-05,4.8e-05,0.0,6.5e-05,0.0,0.0,0.0,0.0,4.4e-05,0.0,,,3.47,0.292,T,,Other,ENST00000312553,ENST00000312553:c.1751C>T,0,CGCAGCCGAGCCCTGGGCCCCCGTGCCTGGGTGGACCTCGCCCATC...,CTCTAAGTTGCCTTCAATGTCCAAGACCTGCAGGTGCTTCAGCCTC...,291.0,CGCAGCCGAGCCCTGGGCCCCCGTGCCTGGGTGGACCTCGCCCATC...,CTCTAAGTTGCCTTCAATGTCCAAGACCTGCAGGTGCTTCAGCCTC...,291.0,0,False,False,False,,,,,False,,False,,False,
1,PODN,ENST00000312553.5,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM4397856,c.1496G>A,p.R499Q,499,499,1,G,A,R,Q,Substitution,Substitution - Missense,SO:0001583,COSV57013127,1:53544534-53544534,1:53078862-53078862,45922,6,,,3.3e-05,0.0004,0.0,3.6e-05,0.0,0.0,0.0,0.0,2.4e-05,0.0002,0.0,0.0,5.5e-05,0.0,9e-06,0.0,3.9e-05,7.2e-05,0.0,0.0,0.0,0.0002,0.0,0.0,2.9e-05,0.0,,,2.75,0.109,T,,Other,ENST00000312553,ENST00000312553:c.1496G>A,1,ACCTACTTCCTGGAGGAGCTCAACCTCAGCTACAACCGCATCACCA...,GTCCACCCAGGCACGGGGGCCCAGGGCTCGGCTGCGCAGTCGGTTG...,291.0,ACCTACTTCCTGGAGGAGCTCAACCTCAGCTACAACCGCATCACCA...,GTCCACCCAGGCACGGGGGCCCAGGGCTCGGCTGCGCAGTCGGTTG...,291.0,0,False,False,False,,,,,False,,False,,False,
2,PODN,ENST00000312553.5,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM9234519,c.934C>A,p.L312M,312,312,2,C,A,L,M,Substitution,Substitution - Missense,SO:0001583,COSV104396799,1:53543408-53543408,1:53077736-53077736,45922,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.94,0.003,D,,Other,ENST00000312553,ENST00000312553:c.934C>A,2,CTCCAGCAACGTCGAGGTCCTCATCCTGTCCAGCAACTTCCTGCGC...,ACCAGGCTGCGCGGCAGCCCAGCTGGGACCCGAGACAGGTTGTTGC...,291.0,CTCCAGCAACGTCGAGGTCCTCATCCTGTCCAGCAACTTCCTGCGC...,ACCAGGCTGCGCGGCAGCCCAGCTGGGACCCGAGACAGGTTGTTGC...,291.0,0,False,False,False,,,,,False,,False,,False,
3,PODN,ENST00000312553.5,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM6063694,c.1423C>T,p.R475C,475,475,2,C,T,R,C,Substitution,Substitution - Missense,SO:0001583,COSV57016857,1:53544461-53544461,1:53078789-53078789,45922,2,,,2.5e-05,0.0002,8.7e-05,2.5e-05,0.0,0.0,0.0,0.0,2.4e-05,0.0002,2.9e-05,0.0,0.0,0.0,1.8e-05,0.0,7e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5e-05,0.0,,,4.81,0.027,D,,Other,ENST00000312553,ENST00000312553:c.1423C>T,3,GCCTCGCCGCGTGCGCACCCTCATGATCCTGCACAACCAGATCACA...,GCCATGCCCACCAGCGCCCCTCGTGCCAAGGCAGCCAGCTCATTGC...,291.0,GCCTCGCCGCGTGCGCACCCTCATGATCCTGCACAACCAGATCACA...,GCCATGCCCACCAGCGCCCCTCGTGCCAAGGCAGCCAGCTCATTGC...,291.0,0,False,False,False,,,,,False,,False,,False,
4,PODN,ENST00000312553.5,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM4390121,c.1282C>A,p.R428S,428,428,3,C,A,R,S,Substitution,Substitution - Missense,SO:0001583,COSV57012790,1:53544320-53544320,1:53078648-53078648,45922,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.81,0.012,D,,Other,ENST00000312553,ENST00000312553:c.1282C>A,4,CATCCGCAGCCTGGAGTACCTGCTGCTGCACAGCAACCAGCTGCGG...,TTGCGGAAGGCGTCGCGGTGCACCTGCGGGCTGGTGATGCGGTTGT...,291.0,CATCCGCAGCCTGGAGTACCTGCTGCTGCACAGCAACCAGCTGCGG...,TTGCGGAAGGCGTCGCGGTGCACCTGCGGGCTGGTGATGCGGTTGT...,291.0,0,False,False,False,,,,,False,,False,,False,


In [8]:
# First five simulated reads
read_df.head(n=5)

Unnamed: 0,read_id,read_header,read_sequence,read_index,read_strand,reference_header,vcrs_id,vcrs_header,vcrs_variant_type,mutant_read,wt_read,region_included_in_vcrs_reference,noise_added
0,ENST00000312553:c.1898T>G_81rM_0,ENST00000312553:c.1898T>G_81rM_0,GTCACCACACTCTAAGTTGCCTTCAATGTCCAAGACCTGCAGGTGC...,81,r,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,,True,False,True,False
1,ENST00000312553:c.1898T>G_14rM_1,ENST00000312553:c.1898T>G_14rM_1,CTTCCTCCTCCTCCTCCTCTTCCTCCTCCTCCTTTTCCTTCCCCAA...,14,r,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,,True,False,True,False
2,ENST00000312553:c.1898T>G_3rM_2,ENST00000312553:c.1898T>G_3rM_2,TCTTGTTTCCTCTTCCTCCTCCTCCTCCTCTTCCTCCTCCTCCTTT...,3,r,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,,True,False,True,False
3,ENST00000312553:c.1898T>G_35rM_3,ENST00000312553:c.1898T>G_35rM_3,CCTCCTCCTCCTTTTCCTTCCCCAAGCGGCCACGGTCCTTGGAAAT...,35,r,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,,True,False,True,False
4,ENST00000312553:c.1898T>G_31rM_4,ENST00000312553:c.1898T>G_31rM_4,TCTTCCTCCTCCTCCTTTTCCTTCCCCAAGCGGCCACGGTCCTTGG...,31,r,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,,True,False,True,False


### Create the second set of simulated data: non-substitutions, 25 variants selected, 4 reads per alt allele, 6 reads per ref allele --> 25 x (4+6) = 250 reads

In [13]:
# Condition 2: non-substitution variants, skipping any variant already flagged
# as included_in_synthetic_reads by the previous simulation.
filters = [
    'Mutation_Description_CDS:not_equal=Substitution',
    'included_in_synthetic_reads:is_not_true',
]
number_of_variants_to_sample = 25
number_of_reads_per_variant_alt = 4  # reads per alt allele
number_of_reads_per_variant_ref = 6  # reads per ref allele
reads_fastq_out_condition2 = os.path.join(vk_sim_out, "synthetic_reads_condition2.fastq")

simulated_df_dict = vk.sim(
    # inputs: outputs of the previous simulation plus the reference sequences
    variants=variants_updated_df,
    reads_csv_parent=read_df,
    reads_fastq_parent=reads_fastq_out,
    sequences=sequences_transcriptome,
    # which variants to draw, and how many reads for each
    filters=filters,
    number_of_variants_to_sample=number_of_variants_to_sample,
    number_of_reads_per_variant_alt=number_of_reads_per_variant_alt,
    number_of_reads_per_variant_ref=number_of_reads_per_variant_ref,
    with_replacement=with_replacement,
    seed=seed,
    # read construction
    strand=strand,
    k=k,
    w=w,
    read_length=read_length,
    # sequencing noise
    add_noise_sequencing_error=add_noise,
    error_rate=error_rate,
    error_distribution=error_distribution,
    max_errors=max_errors,
    # column names in the variants table
    seq_id_column=seq_id_column,
    var_column=var_column,
    header_column=header_column,
    variant_type_column=variant_type_column,
    # outputs
    reference_out_dir=reference_dir,
    reads_fastq_out=reads_fastq_out_condition2,
    reads_csv_out=reads_csv_out,
    variants_updated_csv_out=variants_updated_csv_out,
    out=vk_sim_out,
)

# Replace the running tables with the versions returned by this simulation.
variants_updated_df = simulated_df_dict['variants']
read_df = simulated_df_dict['read_df']

12:06:28 - INFO - Initial variant report
12:06:28 - INFO - Number of total variants: 5344880; VCRSs: 5344880; unique variants: 5344880; merged variants: 0

12:06:28 - INFO - wt_sequence_read_parent is_not_null None
12:08:23 - INFO - Number of total variants: 5322770 (22110 filtered); VCRSs: 5322770 (22110 filtered); unique variants: 5322770 (22110 filtered); merged variants: 0 (0 filtered)

12:08:23 - INFO - included_in_synthetic_reads is_not_true True
12:09:30 - INFO - Number of total variants: 5322720 (50 filtered); VCRSs: 5322720 (50 filtered); unique variants: 5322720 (50 filtered); merged variants: 0 (0 filtered)

12:09:30 - INFO - mutant_sequence_read_parent is_not_null None
12:10:30 - INFO - Number of total variants: 5322720 (0 filtered); VCRSs: 5322720 (0 filtered); unique variants: 5322720 (0 filtered); merged variants: 0 (0 filtered)

12:10:30 - INFO - Mutation_Description_CDS not_equal substitution
12:11:02 - INFO - Number of total variants: 5322720 (0 filtered); VCRSs: 5322

### Add in random portions from the genome/transcriptome

In [None]:
# GRCh37 primary-assembly genome FASTA
sequences_genome = os.path.join(reference_dir, "Homo_sapiens.GRCh37.dna.primary_assembly.fa")

# One output FASTQ per source of random reads
fastq_output_path_random_transcriptome = os.path.join(
    vk_sim_out, "synthetic_reads_random_transcriptome.fastq"
)
fastq_output_path_random_genome = os.path.join(
    vk_sim_out, "synthetic_reads_random_genome.fastq"
)

# Number of random reads to draw from each source
number_of_random_reads_transcriptome, number_of_random_reads_genome = 100, 50

In [15]:
# Fetch the GRCh37 genome FASTA (Ensembl release 93) with gget, but only when the
# uncompressed file is not already on disk.
if not os.path.exists(sequences_genome):
    !gget ref -w dna -r 93 --out_dir {reference_dir} -d human_grch37
    # The download is a .gz archive; decompress it so that sequences_genome exists.
    !gunzip {sequences_genome}.gz

12:59:49 - INFO - Fetching reference information for homo_sapiens from Ensembl release: 93.
{
    "homo_sapiens": {
        "genome_dna": {
            "ftp": "http://ftp.ensembl.org/pub/grch37/release-93/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz",
            "ensembl_release": 93,
            "release_date": "2015-11-27",
            "release_time": "19:02",
            "bytes": "830M"
        }
    }
}
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  829M  100  829M    0     0  14.2M      0  0:00:58  0:00:58 --:--:-- 10.6M0  12.3M      0  0:01:07  0:00:07  0:01:00 14.8M  14.5M      0  0:00:56  0:00:16  0:00:40 16.3M  0:00:50  0:00:22  0:00:28 22.8M0:25  0:00:25 18.1M 14.2M      0  0:00:58  0:00:57  0:00:01 10.8M


In [16]:
# Append random reads to read_df: first from the transcriptome, then from the genome.
# Order matters because each call receives the read_df returned by the previous one.
random_read_sources = (
    ("transcriptome", sequences_transcriptome, fastq_output_path_random_transcriptome, number_of_random_reads_transcriptome),
    ("genome", sequences_genome, fastq_output_path_random_genome, number_of_random_reads_genome),
)

for source_type, source_fasta, source_fastq_out, n_random_reads in random_read_sources:
    # a source with a count of 0 (or less) is skipped entirely
    if n_random_reads > 0:
        read_df = build_random_genome_read_df(
            reference_fasta_file_path=source_fasta,
            read_df=read_df,
            read_df_out=reads_csv_out,
            fastq_output_path=source_fastq_out,
            fastq_parent_path=reads_fastq_out,
            n=n_random_reads,
            read_length=read_length,
            input_type=source_type,
            strand=strand,
            add_noise_sequencing_error=add_noise,
            seed=seed,
        )

In [17]:
# Last five reads, i.e. the most recently appended random reads
read_df.tail(n=5)

Unnamed: 0,read_id,read_header,read_sequence,read_index,read_strand,reference_header,vcrs_id,vcrs_header,vcrs_variant_type,mutant_read,wt_read,region_included_in_vcrs_reference,noise_added
895,wt_randomfW_row45,GL000200.1:31638_31788_randomfW_row45,AATTTAAATGTGTTTTAATTTTCTCTTACTGTTAGGAAATTGATAT...,31638,f,,,,,False,True,False,False
896,wt_randomfW_row46,GL000221.1:119095_119245_randomfW_row46,TATAATGTCTCATGCAAAATTTAGTTCTTTGTTTCTCAGCTTAAAC...,119095,f,,,,,False,True,False,False
897,wt_randomfW_row47,11:72252017_72252167_randomfW_row47,GGAGACCAGATTGGGGACAGGGAAGTGGAGAACTGGGGATTGTTGA...,72252017,f,,,,,False,True,False,False
898,wt_randomfW_row48,GL000233.1:4879_5029_randomfW_row48,TTTCCAGGTGCCATCTGTTACCCCTTTCCTTGCCCAGGAATGGGAA...,4879,f,,,,,False,True,False,False
899,wt_randomfW_row49,GL000210.1:14213_14363_randomfW_row49,ACCCTCACCTGGTGTCTCTGTGGTGGGGACACCCCCTCACCTGCTG...,14213,f,,,,,False,True,False,False


In [53]:
# Display the combined read table (pandas elides the middle rows in the rendered output)
read_df

Unnamed: 0,read_id,read_header,read_sequence,read_index,read_strand,reference_header,vcrs_id,vcrs_header,vcrs_variant_type,mutant_read,wt_read,region_included_in_vcrs_reference,noise_added
0,ENST00000312553:c.1898T>G_81rM_0,ENST00000312553:c.1898T>G_81rM_0,GTCACCACACTCTAAGTTGCCTTCAATGTCCAAGACCTGCAGGTGC...,81,r,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,,True,False,True,False
1,ENST00000312553:c.1898T>G_14rM_1,ENST00000312553:c.1898T>G_14rM_1,CTTCCTCCTCCTCCTCCTCTTCCTCCTCCTCCTTTTCCTTCCCCAA...,14,r,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,,True,False,True,False
2,ENST00000312553:c.1898T>G_3rM_2,ENST00000312553:c.1898T>G_3rM_2,TCTTGTTTCCTCTTCCTCCTCCTCCTCCTCTTCCTCCTCCTCCTTT...,3,r,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,,True,False,True,False
3,ENST00000312553:c.1898T>G_35rM_3,ENST00000312553:c.1898T>G_35rM_3,CCTCCTCCTCCTTTTCCTTCCCCAAGCGGCCACGGTCCTTGGAAAT...,35,r,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,,True,False,True,False
4,ENST00000312553:c.1898T>G_31rM_4,ENST00000312553:c.1898T>G_31rM_4,TCTTCCTCCTCCTCCTTTTCCTTCCCCAAGCGGCCACGGTCCTTGG...,31,r,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,ENST00000312553:c.1898T>G,,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,wt_randomfW_row45,GL000200.1:31638_31788_randomfW_row45,AATTTAAATGTGTTTTAATTTTCTCTTACTGTTAGGAAATTGATAT...,31638,f,,,,,False,True,False,False
896,wt_randomfW_row46,GL000221.1:119095_119245_randomfW_row46,TATAATGTCTCATGCAAAATTTAGTTCTTTGTTTCTCAGCTTAAAC...,119095,f,,,,,False,True,False,False
897,wt_randomfW_row47,11:72252017_72252167_randomfW_row47,GGAGACCAGATTGGGGACAGGGAAGTGGAGAACTGGGGATTGTTGA...,72252017,f,,,,,False,True,False,False
898,wt_randomfW_row48,GL000233.1:4879_5029_randomfW_row48,TTTCCAGGTGCCATCTGTTACCCCTTTCCTTGCCCAGGAATGGGAA...,4879,f,,,,,False,True,False,False
