In [1]:
import subprocess
import pysam
import numpy.random as r
import re
import ast
from cyvcf2 import VCF
from tqdm import tqdm
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq


In [4]:
"""
Creates ITERATIONS recombined fasta sequences from the vcf2fasta sequences,
using a per-base recombination rate of RECOMBINATION_CHANCE, for power analysis
"""


args = {}
# properties of the vcf2fasta generated sequences
VCF2FASTA_START = 1
VCF2FASTA_END = 800000
VCF2FASTA_FILEPATH = 'output/vcf2fasta.fasta'

# recombination event probability per base
RECOMBINATION_CHANCE = 0.00004

# number of recombined sequences to generate
ITERATIONS = 5

# optional seed for numpy's global random state; None leaves the draws unseeded
RANDOM_SEED = None

# tsv output filepath containing phase change locations for each sequence
PHASE_CHANGE_TSV = 'phase_changes.tsv'
# fasta output filepath containing all recombined sequences
SEQUENCES_OUTPUT = 'generated_sequences.fasta'


def recombine(haplotype0, haplotype1, phase_changes, start, end):
    """
    Build one recombined sequence from two haplotype strings.

    The sequence starts on haplotype0 and switches to the other haplotype at
    every coordinate listed in phase_changes; the base at a phase change
    coordinate is already taken from the new haplotype.

    Parameters
    ----------
    haplotype0, haplotype1 : str
        Haplotype sequences; the base at string index 0 has coordinate `start`.
    phase_changes : iterable of int
        Coordinates at which to switch haplotype. Duplicates are collapsed and
        coordinates outside the emitted range are ignored.
    start, end : int
        Inclusive coordinate range covered by the output.

    Returns
    -------
    str
        Recombined sequence of at most end - start + 1 bases (shorter if
        either haplotype is shorter).
    """
    n_bases = max(0, min(len(haplotype0), len(haplotype1), end - start + 1))
    haplotypes = (haplotype0[:n_bases], haplotype1[:n_bases])

    # copy whole segments between breakpoints instead of walking base by base
    segments = []
    current = 0
    prev_offset = 0
    for position in sorted(set(phase_changes)):
        offset = position - start
        if not 0 <= offset < n_bases:
            continue
        segments.append(haplotypes[current][prev_offset:offset])
        current = 1 - current
        prev_offset = offset
    segments.append(haplotypes[current][prev_offset:])

    return ''.join(segments)


if RANDOM_SEED is not None:
    r.seed(RANDOM_SEED)

# parse results from vcf2fasta
fasta_file = list(SeqIO.parse(VCF2FASTA_FILEPATH, 'fasta'))
if len(fasta_file) < 2:
    raise ValueError(
        'expected at least two haplotype records in ' + VCF2FASTA_FILEPATH
        + ', found ' + str(len(fasta_file))
    )

# regex to identify correct fasta reference_name
chrom_match = re.search(r'chromosome_[0-9]{1,2}', str(fasta_file[0].id))
if chrom_match is None:
    raise ValueError(
        'could not find a chromosome name in fasta id: ' + str(fasta_file[0].id)
    )
chrom = chrom_match.group()

# the haplotypes are identical for every iteration, so convert them once
haplotype0 = str(fasta_file[0].seq)
haplotype1 = str(fasta_file[1].seq)

# a phase change at coordinate p means base p is the first base taken from the
# other haplotype, so valid coordinates are START + 1 .. END: one per junction
# between adjacent bases
n_junctions = VCF2FASTA_END - VCF2FASTA_START

fastas = []

# tsv for phase changes; the with-block closes it even if an iteration fails
with open(PHASE_CHANGE_TSV, 'w') as phase_change_tsv:

    for iteration_counter in range(1, ITERATIONS + 1):

        events = r.poisson(n_junctions * RECOMBINATION_CHANCE)
        if events > 0:
            draws = VCF2FASTA_START + 1 + r.choice(n_junctions, size=events)
            # tolist() yields plain Python ints so the row is written as
            # '[12, 345]'; the set collapses repeated draws so the row lists
            # exactly the coordinates where a switch is applied
            phase_changes = sorted(set(draws.tolist()))
        else:
            phase_changes = []

        phase_change_tsv.write('sequence' + str(iteration_counter) + '\t' + str(phase_changes) + '\n')

        recomb_record = SeqRecord(
            Seq(recombine(haplotype0, haplotype1, phase_changes,
                          VCF2FASTA_START, VCF2FASTA_END)),
            id=chrom + '_' + str(iteration_counter),
            description=chrom
        )

        fastas.append(recomb_record)

SeqIO.write(fastas, SEQUENCES_OUTPUT, 'fasta')


In [None]:
"""
This cell uses the output from the cell above (SEQUENCES_OUTPUT) and generates read pairs using ART.
It then uses readcomb.filter (not the one in the pypi package, but the filter.py file in this folder) to
generate statistics comparing the number of filtered and unfiltered reads in windows of 1000
"""

import subprocess
from cyvcf2 import VCF
from tqdm import tqdm

# import classification new one I haven't uploaded to pypi
# not good habit so don't do this if you're copying this script
from classification import *


VCF_FILEPATH = 'data/filtered_full.vcf.gz'
vcf = VCF(VCF_FILEPATH)

COVERAGE = 0.25
INSERT = 100
UNFILTERED_OUTPUT = 'unfiltered'
FILTERED_OUTPUT = 'filtered'

# number of independent art_illumina + readcomb replicates
ART_ITERATIONS = 5

# width of each counting window
WINDOW_SIZE = 1000

# windows needed to cover the sequences written to SEQUENCES_OUTPUT
# (ceiling division over the VCF2FASTA_START..VCF2FASTA_END span)
N_WINDOWS = -(-(VCF2FASTA_END - VCF2FASTA_START + 1) // WINDOW_SIZE)


def add_to_window(window_counts, window, column):
    """
    Increment window_counts[window][column], growing the list when needed.

    window_counts is a list of [filtered, unfiltered] pairs. It is extended
    with zeroed pairs if `window` lies past its current end, so a midpoint
    beyond the expected sequence length is counted instead of raising
    IndexError.
    """
    while window >= len(window_counts):
        window_counts.append([0, 0])
    window_counts[window][column] += 1


# the with-block closes the tsv even if a subprocess or classification fails
with open('window_counts.tsv', 'w') as window_counts_tsv:

    for iteration in range(1, ART_ITERATIONS + 1):

        # simulate read pairs from the recombined sequences
        print('Running art_illumina iteration: ' + str(iteration))

        # check=True stops the run if art_illumina fails, rather than going on
        # to read a missing or stale sam file
        subprocess.run(['art_illumina',
                        '-f', str(COVERAGE),
                        '-l', '250',
                        '-ss', 'MSv1',
                        '-i', SEQUENCES_OUTPUT,
                        '-m', str(500 + INSERT),
                        '-s', '1',
                        '-o', UNFILTERED_OUTPUT + str(iteration),
                        '-M', '-p', '-sam', '-na', '-q'],
                       check=True)

        # run readcomb on the sam file generated by art_illumina
        print('Running readcomb iteration: ' + str(iteration))

        subprocess.run(['readcomb-filter',
                        '-b', UNFILTERED_OUTPUT + str(iteration) + '.sam',
                        '-v', VCF_FILEPATH,
                        '-p', '4',
                        '-o', FILTERED_OUTPUT + str(iteration)],
                       check=True)

        # read in the unfiltered and filtered sam files as read pairs
        unfiltered_pairs = pairs_creation(UNFILTERED_OUTPUT + str(iteration) + '.sam', VCF_FILEPATH)
        filtered_pairs = pairs_creation(FILTERED_OUTPUT + str(iteration) + '.sam', VCF_FILEPATH)

        # [[filtered, unfiltered], ...], one entry per window
        window_counts = [[0, 0] for _ in range(N_WINDOWS)]

        # classifying every pair takes quite a long time, so show progress
        for pair in tqdm(filtered_pairs):
            pair.classify(vcf=vcf)
            midpoint = pair.get_midpoint()
            add_to_window(window_counts, int(midpoint // WINDOW_SIZE), 0)

        for pair in tqdm(unfiltered_pairs):
            # unfiltered pairs are not classified, so the midpoint is taken
            # from the alignment coordinates of the two records
            start = pair.rec_1.reference_start
            end = pair.rec_2.reference_start + pair.rec_2.query_alignment_length
            midpoint = int((start + end) / 2)
            add_to_window(window_counts, int(midpoint // WINDOW_SIZE), 1)

        # one row per window: iteration, window start, window end,
        # filtered count, unfiltered count
        for window, (filtered_count, unfiltered_count) in enumerate(window_counts):
            window_counts_tsv.write('\t'.join(map(str, [
                iteration,
                window * WINDOW_SIZE,
                (window + 1) * WINDOW_SIZE,
                filtered_count,
                unfiltered_count
            ])) + '\n')