In [2]:
import subprocess
import pysam
import numpy.random as r
import re
import ast
from tqdm import tqdm
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [77]:
# 1-sequence with coverage of 1
#
# For each of 5 iterations this cell:
#   1. runs analysis.py to create a fasta sequence with phase changes
#      and generate reads with art_illumina,
#   2. reads the phase changes that analysis.py logged,
#   3. runs readcomb-filter on the generated reads,
#   4. sorts and indexes the unfiltered and filtered alignments,
#   5. writes, per sliding window, the number of filtered and unfiltered
#      reads plus the phase changes that fall strictly inside the window.

VCF2FASTA_FILEPATH = 'output/vcf2fasta.fasta'
ANALYSIS_SCRIPT = 'analysis.py'

COVERAGE = 1
PERCENTAGE = 0.0004
BASE_COUNT = 1000000
ANALYSIS_OUTPUT = 'output/unfiltered'
FILTERED_OUTPUT = 'output/filtered'

# windows are WINDOW_SIZE bases wide and start every WINDOW_STEP bases,
# so consecutive windows overlap
WINDOW_STEP = 1000
WINDOW_SIZE = 2000

iteration = 1

# the context manager guarantees the tsv is flushed and closed even if
# one of the steps below raises
with open('output/window_counts.tsv', 'w') as window_counts:

    for _ in range(5):

        unfiltered_prefix = ANALYSIS_OUTPUT + str(iteration)
        filtered_prefix = FILTERED_OUTPUT + str(iteration)

        # run analysis.py to create fasta sequence with phase change
        # and generate reads with art_illumina
        # (numeric settings are converted to str for the command line)
        subprocess.run([str(arg) for arg in [
            'python3', ANALYSIS_SCRIPT,
            '-f', VCF2FASTA_FILEPATH,
            '-e', BASE_COUNT,
            '-c', COVERAGE,
            '-p', PERCENTAGE,
            '-o', unfiltered_prefix]])

        # read last line of output/phase_change_log.txt
        # that analysis.py writes to get list of phase changes
        phase_changes = subprocess.check_output(
            ['tail', '-1', 'output/phase_change_log.txt']).decode('utf-8').strip()

        # literal_eval only accepts Python literals, so the log line is
        # parsed without executing arbitrary code
        phase_changes = ast.literal_eval(phase_changes)

        # run readcomb on output of art_illumina
        subprocess.run(['readcomb-filter',
                        '-b', unfiltered_prefix + '.sam',
                        '-v', '../data/filtered_full.vcf.gz',
                        '-p', '4',
                        '-o', filtered_prefix])

        # sort and index the unfiltered and readcomb-filtered sam files
        for prefix in (unfiltered_prefix, filtered_prefix):
            sorted_bam = prefix + 'sorted.bam'

            # samtools sort writes to stdout here; the output is binary,
            # so the destination file is opened in binary mode
            with open(sorted_bam, 'wb') as f:
                subprocess.call(['samtools', 'sort', prefix + '.sam'], stdout=f)

            subprocess.run(['samtools', 'index', sorted_bam])

        # read in indexed files and fetch reads inside windows
        with pysam.AlignmentFile(unfiltered_prefix + 'sorted.bam', 'rb') as unfiltered, \
                pysam.AlignmentFile(filtered_prefix + 'sorted.bam', 'rb') as filtered:

            for window_start in range(0, BASE_COUNT, WINDOW_STEP):
                window_end = window_start + WINDOW_SIZE

                # count by iterating so the reads are never all held in memory
                filtered_count = sum(
                    1 for _read in filtered.fetch('chromosome_1', window_start, window_end))
                unfiltered_count = sum(
                    1 for _read in unfiltered.fetch('chromosome_1', window_start, window_end))

                # phase changes strictly inside the window (both bounds exclusive)
                window_phase_changes = [
                    change for change in phase_changes
                    if window_start < change < window_end
                ]

                window_counts.write('\t'.join(map(str, [
                    iteration,
                    window_start,
                    window_end,
                    filtered_count,
                    unfiltered_count,
                    window_phase_changes
                ])) + '\n')

        iteration += 1


In [3]:
# create a 400 sequence fasta and corresponding tsv of phase changes
#
# Every generated sequence is built base by base from the two haplotypes
# in the vcf2fasta output, switching haplotype at each randomly drawn
# phase-change offset. The offsets used for each sequence are logged to
# output400/phase_changes.tsv.

args = {}
args['end'] = 1100000
args['start'] = 1000000
args['percent'] = 0.00004
args['fasta'] = 'output/vcf2fasta.fasta'

iteration = 1

# parse results from vcf2fasta
fasta_file = list(SeqIO.parse(args['fasta'], 'fasta'))

# regex to identify correct fasta reference_name
chrom = re.search(r'chromosome_[0-9]{1,2}', str(fasta_file[0].id)).group()

region_length = args['end'] - args['start']

# the haplotype strings do not change between sequences, so build them once
haplotype0 = str(fasta_file[0].seq)
haplotype1 = str(fasta_file[1].seq)

# Only the leading region_length + 1 bases are used (the end bound is
# inclusive); zip stops early if either haplotype is shorter than that.
# NOTE(review): the bases are taken from the beginning of each record, not
# from args['start'] -- confirm the fasta already covers only the region.
base_pairs = list(zip(haplotype0[:region_length + 1],
                      haplotype1[:region_length + 1]))

fastas = []

# tsv for phase changes
with open('output400/phase_changes.tsv', 'w') as phase_change_tsv:

    for _ in range(400):

        events = r.poisson(region_length * args['percent'])

        # offsets are drawn from [0, region_length), i.e. relative to the
        # first base of the generated sequence; tolist() yields plain ints
        # so the logged list does not depend on numpy's scalar repr
        phase_changes = sorted(r.choice(region_length, size=events).tolist())

        phase_change_tsv.write('sequence' + str(iteration) + '\t' + str(phase_changes) + '\n')

        # set gives O(1) membership; a duplicated offset toggles only once
        switch_offsets = set(phase_changes)

        current = 0
        recomb_seq = []

        for offset, base in enumerate(base_pairs):

            # switch haplotype if this offset is a phase change; the
            # comparison uses the relative offset because that is the
            # coordinate system phase_changes was drawn in
            if offset in switch_offsets:
                current = 1 - current
            recomb_seq.append(base[current])

        recomb_record = SeqRecord(
            Seq(''.join(recomb_seq)),
            id=chrom,
            name='sequence_' + str(iteration),
            description=chrom,
            dbxrefs=fasta_file[0].dbxrefs,
            features=fasta_file[0].features,
            annotations=fasta_file[0].annotations,
            letter_annotations=fasta_file[0].letter_annotations
        )

        fastas.append(recomb_record)
        iteration += 1

SeqIO.write(fastas, 'output400/400_generated_sequences.fasta', 'fasta')


In [5]:
# import classification new one I haven't uploaded to pypi
# not good habit so don't do this if you're copying this script
# NOTE(review): the wildcard import is kept because other cells may rely on
# names from classification; this cell itself only uses pairs_creation
from classification import *

# simulate reads from the 400 generated sequences, filter them with
# readcomb, and count read-pair midpoints per window for the filtered
# and unfiltered sets

VCF_FILEPATH = '../data/filtered_full.vcf.gz'
SEQUENCE400 = 'output400/400_generated_sequences.fasta'

COVERAGE = 5
INSERT = 100
ART_OUTPUT = 'output400/unfiltered'
FILTERED_OUTPUT = 'output400/filtered'

# midpoints are binned into MIDPOINT_WINDOW_COUNT consecutive,
# non-overlapping windows of MIDPOINT_WINDOW_SIZE bases
MIDPOINT_WINDOW_SIZE = 1000
MIDPOINT_WINDOW_COUNT = 100

iteration = 1

# the context manager guarantees the tsv is flushed and closed even if
# one of the steps below raises
with open('output400/window_counts.tsv', 'w') as window_counts_tsv:

    for _ in range(1):

        unfiltered_prefix = ART_OUTPUT + str(iteration)
        filtered_prefix = FILTERED_OUTPUT + str(iteration)

        # run art_illumina on the 400 sequences at COVERAGE-fold coverage

        subprocess.run(['art_illumina',
                        '-f', str(COVERAGE),
                        '-l', '250',
                        '-ss', 'MSv1',
                        '-i', SEQUENCE400,
                        '-m', str(500 + INSERT),
                        '-s', '1',
                        '-o', unfiltered_prefix,
                        '-M', '-p', '-sam', '-na', '-q'])

        # run readcomb on 400 sequence output generated by art_illumina

        subprocess.run(['readcomb-filter',
                        '-b', unfiltered_prefix + '.sam',
                        '-v', VCF_FILEPATH,
                        '-p', '4',
                        '-o', filtered_prefix])

        # read in both sam files as read pairs

        unfiltered_pairs = pairs_creation(unfiltered_prefix + '.sam', VCF_FILEPATH)
        filtered_pairs = pairs_creation(filtered_prefix + '.sam', VCF_FILEPATH)

        # call get_midpoint on all of them, this will take quite a long time so use tqdm

        # [[filtered, unfiltered], ...] -- one entry per window
        window_counts = [[0, 0] for _window in range(MIDPOINT_WINDOW_COUNT)]
        skipped = 0

        # column 0 holds the filtered counts, column 1 the unfiltered counts
        for column, pairs in enumerate((filtered_pairs, unfiltered_pairs)):
            for pair in tqdm(pairs):
                window = int(pair.get_midpoint() // MIDPOINT_WINDOW_SIZE)

                # a midpoint outside the windowed range would either raise
                # IndexError or, if negative, silently wrap around into the
                # wrong window; skip it and report the total below instead
                if 0 <= window < MIDPOINT_WINDOW_COUNT:
                    window_counts[window][column] += 1
                else:
                    skipped += 1

        if skipped:
            print('skipped ' + str(skipped) + ' pairs with midpoints outside the windowed range')

        for window_index, (filtered_count, unfiltered_count) in enumerate(window_counts):
            window_counts_tsv.write('\t'.join(map(str, [
                iteration,
                window_index * MIDPOINT_WINDOW_SIZE,
                (window_index + 1) * MIDPOINT_WINDOW_SIZE,
                filtered_count,
                unfiltered_count
            ])) + '\n')

        iteration += 1

100%|██████████| 22/22 [00:00<00:00, 64.85it/s] 
 13%|█▎        | 40890/318475 [01:09<07:52, 587.58it/s]


AssertionError: error loading tabix index for b'../data/filtered_full.vcf.gz'