<a href="https://colab.research.google.com/github/pachterlab/varseek-examples/blob/main/vk_sim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [vk sim](https://github.com/pachterlab/varseek) demonstration
Create a simulated RNA-seq dataset containing variants of interest with `varseek sim`. Note: This requires a [COSMIC](https://cancer.sanger.ac.uk/cosmic) account.

Written by Joseph Rich.
___

### Install varseek, and import all packages

In [1]:
try:
    import varseek as vk
except ImportError:
    print("varseek not found, installing...")
    !pip install -U -q varseek

In [2]:
import os

import pandas as pd
import gget
import varseek as vk
from varseek.utils import build_random_genome_read_df

### Define important paths and parameters

In [3]:
# --- COSMIC credentials (read from the environment; None when unset) ---
cosmic_email = os.getenv("COSMIC_EMAIL")
cosmic_password = os.getenv("COSMIC_PASSWORD")

# --- simulation parameters (forwarded to the vk.sim calls further down) ---
strand = None
k, w = 59, 54
read_length = 150
seed = 42
with_replacement = False

# sequencing-error noise settings
add_noise = True
error_rate = 0.0001
error_distribution = (0.85, 0.1, 0.05)
max_errors = float("inf")

# column names of the COSMIC table used by the simulation
seq_id_column = "ACCESSION_NUMBER_VERSIONLESS"
var_column = "Mutation CDS"
variant_type_column = "Mutation_Description_CDS"

# --- input locations ---
reference_dir = os.path.join("data", "reference")
sequences_transcriptome = os.path.join(reference_dir, "Homo_sapiens.GRCh37.cds.all.fa")
variants = os.path.join(
    reference_dir,
    "CancerMutationCensus_AllData_Tsv_v101_GRCh37",
    "CancerMutationCensus_AllData_v101_GRCh37.tsv",
)

# --- output locations (everything is written below vk_sim_out) ---
vk_sim_out = os.path.join("data", "vk_sim_out2")
reads_fastq_out = os.path.join(vk_sim_out, "synthetic_reads.fastq")
reads_csv_out = os.path.join(vk_sim_out, "synthetic_reads.csv")
variants_updated_csv_out = os.path.join(
    vk_sim_out,
    "CancerMutationCensus_AllData_v101_GRCh37_with_synthetic_read_info.tsv",
)

### Download the reference coding sequences (GRCh37, Ensembl 93, CDS file)

In [4]:
# Fetch the CDS FASTA only when the decompressed file is not already on disk,
# so re-running the notebook does not download it again.
if not os.path.exists(sequences_transcriptome):
    # Shell escape: `gget ref` for the CDS file (-w cds) of Ensembl release 93 (-r 93),
    # written into reference_dir (--out_dir). Presumably -d triggers the actual download.
    !gget ref -w cds -r 93 --out_dir {reference_dir} -d human_grch37
    # Decompress the .gz archive in place so that sequences_transcriptome exists afterwards.
    !gunzip {sequences_transcriptome}.gz

### Download the COSMIC Cancer Mutation Census file

In [None]:
# Download the COSMIC Cancer Mutation Census table once; later runs reuse the
# file that already sits at `variants`.
variants_already_downloaded = os.path.exists(variants)

if not variants_already_downloaded:
    # Keyword options for gget.cosmic, collected in one place for readability.
    cosmic_download_options = {
        "grch_version": 37,
        "cosmic_version": 101,
        "out": reference_dir,
        "mutation_class": "cancer",
        "download_cosmic": True,
        "gget_mutate": True,
        "email": cosmic_email,
        "password": cosmic_password,
    }
    # The first positional argument is left as None, exactly as in a plain download call.
    gget.cosmic(None, **cosmic_download_options)

### Print the first few lines of the COSMIC file
Note that our sequence IDs are in the column "ACCESSION_NUMBER" (its version-stripped form, "ACCESSION_NUMBER_VERSIONLESS", is what we pass as `seq_id_column`), and our variants are in the column "Mutation CDS".

In [6]:
# Load only the first five records of the tab-separated COSMIC table for a quick look;
# pd.read_table uses a tab separator by default.
df = pd.read_table(variants, nrows=5)
df

Unnamed: 0,GENE_NAME,ACCESSION_NUMBER,ONC_TSG,CGC_TIER,MUTATION_URL,LEGACY_MUTATION_ID,Mutation CDS,Mutation AA,AA_MUT_START,AA_MUT_STOP,SHARED_AA,GENOMIC_WT_ALLELE_SEQ,GENOMIC_MUT_ALLELE_SEQ,AA_WT_ALLELE_SEQ,AA_MUT_ALLELE_SEQ,Mutation_Description_CDS,Mutation Description AA,ONTOLOGY_MUTATION_CODE,GENOMIC_MUTATION_ID,Mutation genome position GRCh37,Mutation genome position GRCh38,COSMIC_SAMPLE_TESTED,COSMIC_SAMPLE_MUTATED,DISEASE,WGS_DISEASE,EXAC_AF,EXAC_AFR_AF,EXAC_AMR_AF,EXAC_ADJ_AF,EXAC_EAS_AF,EXAC_FIN_AF,EXAC_NFE_AF,EXAC_SAS_AF,GNOMAD_EXOMES_AF,GNOMAD_EXOMES_AFR_AF,GNOMAD_EXOMES_AMR_AF,GNOMAD_EXOMES_ASJ_AF,GNOMAD_EXOMES_EAS_AF,GNOMAD_EXOMES_FIN_AF,GNOMAD_EXOMES_NFE_AF,GNOMAD_EXOMES_SAS_AF,GNOMAD_GENOMES_AF,GNOMAD_GENOMES_AFR_AF,GNOMAD_GENOMES_AMI_AF,GNOMAD_GENOMES_AMR_AF,GNOMAD_GENOMES_ASJ_AF,GNOMAD_GENOMES_EAS_AF,GNOMAD_GENOMES_FIN_AF,GNOMAD_GENOMES_MID_AF,GNOMAD_GENOMES_NFE_AF,GNOMAD_GENOMES_SAS_AF,CLINVAR_CLNSIG,CLINVAR_TRAIT,GERP++_RS,MIN_SIFT_SCORE,MIN_SIFT_PRED,DNDS_DISEASE_QVAL_SIG,MUTATION_SIGNIFICANCE_TIER,ACCESSION_NUMBER_VERSIONLESS,header
0,FAF1,ENST00000396153.2,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM9691637,c.1468C>T,p.Q490*,490,490,1,G,A,Q,*,Substitution,Substitution - Nonsense,SO:0001587,COSV105286190,1:51001067-51001067,1:50535395-50535395,49031,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.52,1.0,,,Other,ENST00000396153,ENST00000396153:c.1468C>T
1,FAF1,ENST00000396153.2,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM7986191,c.711G>T,p.W237C,237,237,1,C,A,W,C,Substitution,Substitution - Missense,SO:0001583,COSV100875125,1:51121147-51121147,1:50655475-50655475,49031,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.17,0.0,D,,Other,ENST00000396153,ENST00000396153:c.711G>T
2,FAF1,ENST00000396153.2,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM6063740,c.1548C>T,p.R516=,516,516,2,G,A,R,R,Substitution,Substitution - coding silent,SO:1000057,COSV65638235,1:50957420-50957420,1:50491748-50491748,49031,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,Other,ENST00000396153,ENST00000396153:c.1548C>T
3,FAF1,ENST00000396153.2,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM6657959,c.429del,p.G144Afs*13,144,144,1,T,,G,AGRREMWKTVRS*,Deletion,Deletion - Frameshift,SO:0001589,COSV65641246,1:51210389-51210389,1:50744717-50744717,49031,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,Other,ENST00000396153,ENST00000396153:c.429del
4,FAF1,ENST00000396153.2,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM10419714,c.797G>A,p.G266E,266,266,1,C,T,G,E,Substitution,Substitution - Missense,SO:0001583,COSV107478819,1:51061836-51061836,1:50596164-50596164,49031,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.64,0.248,T,,Other,ENST00000396153,ENST00000396153:c.797G>A


### We make a few small modifications to the variants dataframe
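- strip the version number off of `ACCESSION_NUMBER` (stored in a new `ACCESSION_NUMBER_VERSIONLESS` column)
- create a `header` column for the variants
- rename the `Mutation Description CDS` column to `Mutation_Description_CDS` so that the column name contains no spaces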

In [7]:
# Columns the rest of the notebook relies on. If any of them is missing from the
# 5-row preview in `df`, the full table is loaded, patched, and written back to
# `variants`, so the (expensive) rewrite only happens once.
required_columns = ("ACCESSION_NUMBER_VERSIONLESS", "header", "Mutation_Description_CDS")

if any(column not in df.columns for column in required_columns):
    df = pd.read_csv(variants, sep="\t")

    # Transcript ID without its ".<version>" suffix (e.g. ENST00000396153.2 -> ENST00000396153).
    if "ACCESSION_NUMBER_VERSIONLESS" not in df.columns:
        df["ACCESSION_NUMBER_VERSIONLESS"] = df["ACCESSION_NUMBER"].str.split(".").str[0]

    # Variant header in the form "<versionless transcript ID>:<CDS mutation>".
    # The guard above tests for this column, so it has to be created here;
    # otherwise the table would be re-read and re-written on every run.
    if "header" not in df.columns:
        df["header"] = df["ACCESSION_NUMBER_VERSIONLESS"] + ":" + df["Mutation CDS"]

    # Column name without spaces; rename() returns a new frame, so no in-place mutation is needed.
    if "Mutation_Description_CDS" not in df.columns:
        df = df.rename(columns={"Mutation Description CDS": "Mutation_Description_CDS"})

    df.to_csv(variants, sep="\t", index=False)

    # Release the full table before re-loading just the small preview.
    del df

    df = pd.read_csv(variants, sep="\t", nrows=5)

### Create the first set of simulated data: gene PODN, MIN_SIFT_SCORE<0.4, 50 variants selected, 8 reads per alt allele, 2 reads per ref allele --> 50 x (8+2) = 500 reads

In [None]:
# Condition 1: PODN variants with MIN_SIFT_SCORE below 0.4.
filters = ["GENE_NAME:equal=PODN", "MIN_SIFT_SCORE:less_than=0.4"]

# 50 variants, each with 8 alt-allele reads and 2 ref-allele reads.
number_of_variants_to_sample = 50
number_of_reads_per_variant_alt = 8
number_of_reads_per_variant_ref = 2

# FASTQ that receives only the reads of this condition.
reads_fastq_out_condition1 = os.path.join(vk_sim_out, "synthetic_reads_condition1.fastq")

# All arguments for vk.sim, grouped by purpose.
condition1_sim_kwargs = {
    # inputs
    "variants": variants,
    "sequences": sequences_transcriptome,
    "seq_id_column": seq_id_column,
    "var_column": var_column,
    "variant_type_column": variant_type_column,
    "filters": filters,
    # sampling
    "number_of_variants_to_sample": number_of_variants_to_sample,
    "number_of_reads_per_variant_alt": number_of_reads_per_variant_alt,
    "number_of_reads_per_variant_ref": number_of_reads_per_variant_ref,
    "with_replacement": with_replacement,
    "seed": seed,
    # read construction
    "strand": strand,
    "k": k,
    "w": w,
    "read_length": read_length,
    # sequencing-error noise
    "add_noise_sequencing_error": add_noise,
    "error_rate": error_rate,
    "error_distribution": error_distribution,
    "max_errors": max_errors,
    # outputs
    "reference_out_dir": reference_dir,
    "reads_fastq_parent": reads_fastq_out,
    "reads_fastq_out": reads_fastq_out_condition1,
    "reads_csv_out": reads_csv_out,
    "variants_updated_csv_out": variants_updated_csv_out,
    "out": vk_sim_out,
}

simulated_df_dict = vk.sim(**condition1_sim_kwargs)

# Keep both returned tables; later cells build on them.
variants_updated_df = simulated_df_dict["variants"]
read_df = simulated_df_dict["read_df"]


  variants = pd.read_csv(variants, sep="\t")
11:08:22 - INFO - cannot find mutant sequence read parent
11:08:22 - INFO - running varseek build
11:08:22 - INFO - Using COSMIC email from COSMIC_EMAIL environment variable: jmrich@caltech.edu
11:08:22 - INFO - Using COSMIC password from COSMIC_PASSWORD environment variable
11:09:13 - INFO - Using the seq_id_column:var_column 'ACCESSION_NUMBER_VERSIONLESS:Mutation CDS' columns as the variant header column.
11:13:07 - INFO - Removed 18217 variant kmers with length less than 150...
        5396728 variants correctly recorded (99.58%)
        22600 variants removed (0.42%)
          0 variants missing seq_id or var_column (0.000%)
          86 entries removed due to having a duplicate entry (0.002%)
          0 variants with seq_ID not found in sequences (0.000%)
          368 intronic variants found (0.007%)
          0 posttranslational region variants found (0.000%)
          0 unknown variants found (0.000%)
          0 variants with uncer

Updated variant info has been saved to ./sim_data_df.csv


11:16:13 - INFO - FASTA file containing VCRSs created at ./vcrs.fa.
11:16:13 - INFO - t2g file containing VCRSs created at ./vcrs_t2g.txt.
11:16:13 - INFO - Total runtime for vk build: 7m, 51.42s
  sim_data_df = pd.read_csv(update_df_out)
11:19:32 - INFO - Initial variant report
11:19:32 - INFO - Number of total variants: 5419328; VCRSs: 5419328; unique variants: 5419328; merged variants: 0

11:19:32 - INFO - GENE_NAME equal PODN
11:20:02 - INFO - Number of total variants: 425 (5418903 filtered); VCRSs: 425 (5418903 filtered); unique variants: 425 (5418903 filtered); merged variants: 0 (0 filtered)

11:20:02 - INFO - MIN_SIFT_SCORE less_than 0.4
11:20:02 - INFO - Number of total variants: 246 (179 filtered); VCRSs: 246 (179 filtered); unique variants: 246 (179 filtered); merged variants: 0 (0 filtered)

11:20:02 - INFO - mutant_sequence_read_parent is_not_null None
11:20:02 - INFO - Number of total variants: 246 (0 filtered); VCRSs: 246 (0 filtered); unique variants: 246 (0 filtered); 

### Inspect the output files

In [10]:
# Show the first rows of the variants table returned by vk.sim.
variants_updated_df.head()

Unnamed: 0,GENE_NAME,ACCESSION_NUMBER,ONC_TSG,CGC_TIER,MUTATION_URL,LEGACY_MUTATION_ID,Mutation CDS,Mutation AA,AA_MUT_START,AA_MUT_STOP,SHARED_AA,GENOMIC_WT_ALLELE_SEQ,GENOMIC_MUT_ALLELE_SEQ,AA_WT_ALLELE_SEQ,AA_MUT_ALLELE_SEQ,Mutation_Description_CDS,Mutation Description AA,ONTOLOGY_MUTATION_CODE,GENOMIC_MUTATION_ID,Mutation genome position GRCh37,Mutation genome position GRCh38,COSMIC_SAMPLE_TESTED,COSMIC_SAMPLE_MUTATED,DISEASE,WGS_DISEASE,EXAC_AF,EXAC_AFR_AF,EXAC_AMR_AF,EXAC_ADJ_AF,EXAC_EAS_AF,EXAC_FIN_AF,EXAC_NFE_AF,EXAC_SAS_AF,GNOMAD_EXOMES_AF,GNOMAD_EXOMES_AFR_AF,GNOMAD_EXOMES_AMR_AF,GNOMAD_EXOMES_ASJ_AF,GNOMAD_EXOMES_EAS_AF,GNOMAD_EXOMES_FIN_AF,GNOMAD_EXOMES_NFE_AF,GNOMAD_EXOMES_SAS_AF,GNOMAD_GENOMES_AF,GNOMAD_GENOMES_AFR_AF,GNOMAD_GENOMES_AMI_AF,GNOMAD_GENOMES_AMR_AF,GNOMAD_GENOMES_ASJ_AF,GNOMAD_GENOMES_EAS_AF,GNOMAD_GENOMES_FIN_AF,GNOMAD_GENOMES_MID_AF,GNOMAD_GENOMES_NFE_AF,GNOMAD_GENOMES_SAS_AF,CLINVAR_CLNSIG,CLINVAR_TRAIT,GERP++_RS,MIN_SIFT_SCORE,MIN_SIFT_PRED,DNDS_DISEASE_QVAL_SIG,MUTATION_SIGNIFICANCE_TIER,ACCESSION_NUMBER_VERSIONLESS,header,original_order,mutant_sequence_read_parent,mutant_sequence_read_parent_rc,mutant_sequence_read_parent_length,wt_sequence_read_parent,wt_sequence_read_parent_rc,wt_sequence_read_parent_length,semicolon_count,included_in_synthetic_reads,included_in_synthetic_reads_wt,included_in_synthetic_reads_mutant,list_of_read_starting_indices_wt,list_of_read_starting_indices_mutant,number_of_reads_wt,number_of_reads_mutant,any_noisy_reads_wt,noisy_read_indices_wt,any_noisy_reads_mutant,noisy_read_indices_mutant,any_noisy_reads,tumor_purity
0,FAF1,ENST00000396153.2,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM9691637,c.1468C>T,p.Q490*,490,490,1,G,A,Q,*,Substitution,Substitution - Nonsense,SO:0001587,COSV105286190,1:51001067-51001067,1:50535395-50535395,49031,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.52,1.0,,,Other,ENST00000396153,ENST00000396153:c.1468C>T,0,GACTCAAAAAACGGATCAGTTTCCGCTTTTCCTGATTATTATGGGA...,CGAAACTGTTCTGCCATCTCTCTCTCGTGAGCTTCCCTCTTTGCTC...,291.0,GACTCAAAAAACGGATCAGTTTCCGCTTTTCCTGATTATTATGGGA...,CGAAACTGTTCTGCCATCTCTCTCTCGTGAGCTTCCCTCTTTGCTC...,291.0,0,False,False,False,,,,,False,,False,,False,
1,FAF1,ENST00000396153.2,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM7986191,c.711G>T,p.W237C,237,237,1,C,A,W,C,Substitution,Substitution - Missense,SO:0001583,COSV100875125,1:51121147-51121147,1:50655475-50655475,49031,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.17,0.0,D,,Other,ENST00000396153,ENST00000396153:c.711G>T,1,CATTAAATCAAAACTTCATGCTGATCATCACCCACCGAGAAGTCCA...,GAACATCGGTGATTTGTTCTTCCGACTGTTCCCGGGTCTGTGCAGG...,291.0,CATTAAATCAAAACTTCATGCTGATCATCACCCACCGAGAAGTCCA...,GAACATCGGTGATTTGTTCTTCCGACTGTTCCCGGGTCTGTGCAGG...,291.0,0,False,False,False,,,,,False,,False,,False,
2,FAF1,ENST00000396153.2,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM6063740,c.1548C>T,p.R516=,516,516,2,G,A,R,R,Substitution,Substitution - coding silent,SO:1000057,COSV65638235,1:50957420-50957420,1:50491748-50491748,49031,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,Other,ENST00000396153,ENST00000396153:c.1548C>T,2,AAGGGAACACAACAGTAGATGAGTTAATGATGAGACTCATGGCTGC...,GCTCAGGAGGCAGGGCTTGCTCTAAGGACAGCCGGATGGCCTCACG...,291.0,AAGGGAACACAACAGTAGATGAGTTAATGATGAGACTCATGGCTGC...,GCTCAGGAGGCAGGGCTTGCTCTAAGGACAGCCGGATGGCCTCACG...,291.0,0,False,False,False,,,,,False,,False,,False,
3,FAF1,ENST00000396153.2,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM6657959,c.429del,p.G144Afs*13,144,144,1,T,,G,AGRREMWKTVRS*,Deletion,Deletion - Frameshift,SO:0001589,COSV65641246,1:51210389-51210389,1:50744717-50744717,49031,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,Other,ENST00000396153,ENST00000396153:c.429del,3,AAAGGCAACCTCGGATGCTGGACTTCAGGGTTGAATACAGAGACAG...,GATTTAATGACTCCTGCAGGGCACCAGCATGACTAGATGATGAAGG...,290.0,AAAGGCAACCTCGGATGCTGGACTTCAGGGTTGAATACAGAGACAG...,GATTTAATGACTCCTGCAGGGCACCAGCATGACTAGATGATGAAGG...,291.0,0,False,False,False,,,,,False,,False,,False,
4,FAF1,ENST00000396153.2,,,https://cancer.sanger.ac.uk/cosmic/mutation/ov...,COSM10419714,c.797G>A,p.G266E,266,266,1,C,T,G,E,Substitution,Substitution - Missense,SO:0001583,COSV107478819,1:51061836-51061836,1:50596164-50596164,49031,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.64,0.248,T,,Other,ENST00000396153,ENST00000396153:c.797G>A,4,CAAGAGGTAAAGAGAAATGTGTATGACCTTACAAGTATCCCCGTTC...,TGACGCCATGCCAAATACTTCTCCATCATCCACCCCAAATTCTGTA...,291.0,CAAGAGGTAAAGAGAAATGTGTATGACCTTACAAGTATCCCCGTTC...,TGACGCCATGCCAAATACTTCTCCATCATCCACCCCAAATTCTGTA...,291.0,0,False,False,False,,,,,False,,False,,False,


In [11]:
# Show the first rows of the table of simulated reads returned by vk.sim.
read_df.head()

Unnamed: 0,read_id,read_header,read_sequence,read_index,read_strand,reference_header,vcrs_id,vcrs_header,vcrs_variant_type,mutant_read,wt_read,region_included_in_vcrs_reference,noise_added
0,ENST00000312553:c.1264G>A_28fM_0,ENST00000312553:c.1264G>A_28fM_0,CTGGAGTACCTGCTGCTGCACAGCAACCAGCTGCGGGAGCAGGGCA...,28,f,ENST00000312553:c.1264G>A,ENST00000312553:c.1264G>A,ENST00000312553:c.1264G>A,Substitution,True,False,True,False
1,ENST00000312553:c.1264G>A_6fM_1,ENST00000312553:c.1264G>A_6fM_1,TGTGCTGACCCCCATCCGCAGCCTGGAGTACCTGCTGCTGCACAGC...,6,f,ENST00000312553:c.1264G>A,ENST00000312553:c.1264G>A,ENST00000312553:c.1264G>A,Substitution,True,False,True,False
2,ENST00000312553:c.1264G>A_70fM_2,ENST00000312553:c.1264G>A_70fM_2,GGCATCCACCCACTGGCCTTCCAGGGCCTCAAGCGGTTGCACACGG...,70,f,ENST00000312553:c.1264G>A,ENST00000312553:c.1264G>A,ENST00000312553:c.1264G>A,Substitution,True,False,True,False
3,ENST00000312553:c.1264G>A_62fM_3,ENST00000312553:c.1264G>A_62fM_3,GGGAGCAGGGCATCCACCCACTGGCCTTCCAGGGCCTCAAGCGGTT...,62,f,ENST00000312553:c.1264G>A,ENST00000312553:c.1264G>A,ENST00000312553:c.1264G>A,Substitution,True,False,True,False
4,ENST00000312553:c.1264G>A_57fM_4,ENST00000312553:c.1264G>A_57fM_4,GCTGCGGGAGCAGGGCATCCACCCACTGGCCTTCCAGGGCCTCAAG...,57,f,ENST00000312553:c.1264G>A,ENST00000312553:c.1264G>A,ENST00000312553:c.1264G>A,Substitution,True,False,True,False


### Create the second set of simulated data: non-substitutions, 25 variants selected, 4 reads per alt allele, 6 reads per ref allele --> 25 x (4+6) = 250 reads

In [None]:
# Condition 2: every variant whose CDS description is not "Substitution".
# The second filter presumably skips variants already used for synthetic reads
# in an earlier condition -- confirm against the varseek documentation.
filters = [
    "Mutation_Description_CDS:not_equal=Substitution",
    "included_in_synthetic_reads:is_not_true",
]

# 25 variants, each with 4 alt-allele reads and 6 ref-allele reads.
number_of_variants_to_sample = 25
number_of_reads_per_variant_alt = 4
number_of_reads_per_variant_ref = 6

# FASTQ that receives only the reads of this condition.
reads_fastq_out_condition2 = os.path.join(vk_sim_out, "synthetic_reads_condition2.fastq")

# Lightweight settings only; the DataFrames from the previous run are passed
# directly in the call below.
condition2_sim_settings = {
    # inputs
    "sequences": sequences_transcriptome,
    "seq_id_column": seq_id_column,
    "var_column": var_column,
    "variant_type_column": variant_type_column,
    "filters": filters,
    # sampling
    "number_of_variants_to_sample": number_of_variants_to_sample,
    "number_of_reads_per_variant_alt": number_of_reads_per_variant_alt,
    "number_of_reads_per_variant_ref": number_of_reads_per_variant_ref,
    "with_replacement": with_replacement,
    "seed": seed,
    # read construction
    "strand": strand,
    "k": k,
    "w": w,
    "read_length": read_length,
    # sequencing-error noise
    "add_noise_sequencing_error": add_noise,
    "error_rate": error_rate,
    "error_distribution": error_distribution,
    "max_errors": max_errors,
    # outputs
    "reference_out_dir": reference_dir,
    "reads_fastq_parent": reads_fastq_out,
    "reads_fastq_out": reads_fastq_out_condition2,
    "reads_csv_out": reads_csv_out,
    "variants_updated_csv_out": variants_updated_csv_out,
    "out": vk_sim_out,
}

# Continue from the tables produced by the previous vk.sim run.
simulated_df_dict = vk.sim(
    variants=variants_updated_df,
    reads_csv_parent=read_df,
    **condition2_sim_settings,
)

variants_updated_df = simulated_df_dict["variants"]
read_df = simulated_df_dict["read_df"]

11:27:13 - INFO - Initial variant report
11:27:13 - INFO - Number of total variants: 5419328; VCRSs: 5419328; unique variants: 5419328; merged variants: 0

11:27:13 - INFO - Mutation_Description_CDS not_equal Substitution
11:28:08 - INFO - Number of total variants: 284407 (5134921 filtered); VCRSs: 284407 (5134921 filtered); unique variants: 284407 (5134921 filtered); merged variants: 0 (0 filtered)

11:28:08 - INFO - included_in_synthetic_reads is_not_true True
11:28:11 - INFO - Number of total variants: 284407 (0 filtered); VCRSs: 284407 (0 filtered); unique variants: 284407 (0 filtered); merged variants: 0 (0 filtered)

11:28:11 - INFO - mutant_sequence_read_parent is_not_null None
11:28:16 - INFO - Number of total variants: 275352 (9055 filtered); VCRSs: 275352 (9055 filtered); unique variants: 275352 (9055 filtered); merged variants: 0 (0 filtered)

11:28:16 - INFO - wt_sequence_read_parent is_not_null None
11:28:22 - INFO - Number of total variants: 275352 (0 filtered); VCRSs: 27

### Add reads drawn from random regions of the transcriptome and genome

In [13]:
# Number of random reads to draw from each reference.
number_of_random_reads_transcriptome, number_of_random_reads_genome = 100, 50

# Genome FASTA (GRCh37 primary assembly) used as the source of the random genome reads.
sequences_genome = os.path.join(reference_dir, "Homo_sapiens.GRCh37.dna.primary_assembly.fa")

# One FASTQ per source of random reads, both inside the simulation output folder.
fastq_output_path_random_genome = os.path.join(vk_sim_out, "synthetic_reads_random_genome.fastq")
fastq_output_path_random_transcriptome = os.path.join(vk_sim_out, "synthetic_reads_random_transcriptome.fastq")

In [14]:
# Fetch the genome FASTA only when the decompressed file is not already on disk,
# so re-running the notebook does not download it again.
if not os.path.exists(sequences_genome):
    # Shell escape: `gget ref` for the genome DNA file (-w dna) of Ensembl release 93 (-r 93),
    # written into reference_dir (--out_dir). Presumably -d triggers the actual download.
    !gget ref -w dna -r 93 --out_dir {reference_dir} -d human_grch37
    # Decompress the .gz archive in place so that sequences_genome exists afterwards.
    !gunzip {sequences_genome}.gz

11:33:52 - INFO - Fetching reference information for homo_sapiens from Ensembl release: 93.
{
    "homo_sapiens": {
        "genome_dna": {
            "ftp": "http://ftp.ensembl.org/pub/grch37/release-93/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz",
            "ensembl_release": 93,
            "release_date": "2015-11-27",
            "release_time": "19:02",
            "bytes": "830M"
        }
    }
}
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  829M  100  829M    0     0  2840k      0  0:04:59  0:04:59 --:--:-- 2133k08:09  0:00:01  0:08:08 1733k0  9412k      0  0:01:30  0:00:08  0:01:22 10.4M  7534k      0  0:01:52  0:00:20  0:01:32 4556k:47  0:00:37  0:02:10 1399k1351k 0:03:44  0:00:58  0:02:46 1632k   0  3714k      0  0:03:48  0:01:00  0:02:48 1731k0  3294k      0  0:04:17  0:01:14  0:03:03 1689k   0  3165k      0  0:04:28  0:01:22  0:

In [15]:
# (input_type, reference FASTA, per-source FASTQ, number of reads), processed in
# this order: transcriptome first, then genome.
random_read_sources = (
    ("transcriptome", sequences_transcriptome, fastq_output_path_random_transcriptome, number_of_random_reads_transcriptome),
    ("genome", sequences_genome, fastq_output_path_random_genome, number_of_random_reads_genome),
)

for source_type, source_fasta, source_fastq_out, source_read_count in random_read_sources:
    # A source with a non-positive read count is skipped entirely.
    if source_read_count > 0:
        # Each call receives the current read_df and its return value replaces it.
        read_df = build_random_genome_read_df(
            reference_fasta_file_path=source_fasta,
            read_df=read_df,
            read_df_out=reads_csv_out,
            fastq_output_path=source_fastq_out,
            fastq_parent_path=reads_fastq_out,
            n=source_read_count,
            read_length=read_length,
            input_type=source_type,
            strand=strand,
            add_noise_sequencing_error=add_noise,
            seed=seed,
        )

In [16]:
# Show the last rows of read_df, i.e. the most recently appended reads.
read_df.tail()

Unnamed: 0,read_id,read_header,read_sequence,read_index,read_strand,reference_header,vcrs_id,vcrs_header,vcrs_variant_type,mutant_read,wt_read,region_included_in_vcrs_reference,noise_added
895,wt_genome_randomfW_45,GL000200.1:31638_31788_randomfW_45,AATTTAAATGTGTTTTAATTTTCTCTTACTGTTAGGAAATTGATAT...,31638,f,,,,,False,True,False,False
896,wt_genome_randomfW_46,GL000221.1:119095_119245_randomfW_46,TATAATGTCTCATGCAAAATTTAGTTCTTTGTTTCTCAGCTTAAAC...,119095,f,,,,,False,True,False,False
897,wt_genome_randomfW_47,11:72252017_72252167_randomfW_47,GGAGACCAGATTGGGGACAGGGAAGTGGAGAACTGGGGATTGTTGA...,72252017,f,,,,,False,True,False,False
898,wt_genome_randomfW_48,GL000233.1:4879_5029_randomfW_48,TTTCCAGGTGCCATCTGTTACCCCTTTCCTTGCCCAGGAATGGGAA...,4879,f,,,,,False,True,False,False
899,wt_genome_randomfW_49,GL000210.1:14213_14363_randomfW_49,ACCCTCACCTGGTGTCTCTGTGGTGGGGACACCCCCTCACCTGCTG...,14213,f,,,,,False,True,False,False
