In [1]:
import pandas as pd
import numpy as np
import pysam   
import collections
import os
from Bio import SeqIO
from cyvcf2 import VCF

# Breakpoint table (tab-separated). Later cells read its columns by position:
# 2 = individual, 3 = chromosome, 4 = left bound, 5 = right bound.
df_breakpoints = pd.read_csv('../../data/references/liu-data-reference/all_break_points.txt', sep='\t')
# Sample lookup table (tab-separated). Later cells use its 'individual' and
# 'Run' columns to find the BAM file ('<Run>.dup.bam') for an individual.
df_lookup = pd.read_csv('../../data/references/liu-data-reference/liu_sample_lookup.tsv', sep='\t')
# Directory prefixes (relative to the notebook) that file names are appended to.
bam_files_path = '../../data/liu-data/bam/'
vcf_files_path = '../../data/liu-data/vcf/'
reference_files_path = '../../data/references/'

In [25]:
# For every breakpoint in all_break_points.txt, look up the individual's BAM
# file in liu_sample_lookup.tsv and dump the reads overlapping the breakpoint
# interval into breakpoint_bams.txt (one str(alignment) per line).

with open('breakpoint_bams.txt', 'w') as f:
    for index, row in df_breakpoints.iterrows():

        # The columns are addressed by position. .iloc makes that explicit;
        # plain row[2] on a label-indexed Series is a deprecated fallback.
        individual = row.iloc[2]
        chromosome = row.iloc[3]
        left_bound = int(row.iloc[4])
        right_bound = int(row.iloc[5])

        # Skip breakpoints whose individual has no usable run id, instead of
        # swallowing every possible error with a bare except.
        runs = df_lookup.loc[df_lookup['individual'] == individual, 'Run']
        if runs.empty or not isinstance(runs.iloc[0], str):
            continue
        run = runs.iloc[0] + '.dup.bam'

        # Close each BAM as soon as its reads are written so handles do not
        # pile up over the whole breakpoint table.
        with pysam.AlignmentFile(bam_files_path + run, 'rb') as alignment_file:
            for alignment in alignment_file.fetch(chromosome, left_bound, right_bound):
                f.write(str(alignment) + '\n')

In [2]:
def check_snps(f_name, chromosome, left_bound, right_bound):
    '''
    Collect the VCF records that fall inside one genomic interval.

    f_name: path of the VCF file handed to cyvcf2's VCF reader
    chromosome: sequence name used in the region query
        ex: chromosome_1
    left_bound/right_bound: interval bounds, formatted into a
        '<chromosome>:<left>-<right>' region string

    Returns a list holding every record the reader yields for that region;
    len() of the result is the SNP count for the interval.
    '''
    reader = VCF(f_name)
    query = '{c}:{l}-{r}'.format(c=chromosome, l=left_bound, r=right_bound)
    return list(reader(query))


In [24]:
# SNP distribution analysis per 10k bases: count the SNPs of chromosome_1 in
# consecutive 10,000-base windows and write one '<from>-<to>: <count>' line
# per window to SNP_10k.txt. The counts are also kept in SNP_count.

bound = 0
SNP_count = []
f_name = vcf_files_path + 'parental_filtered.vcf.gz'

with open('SNP_10k.txt', 'w') as f:
    while True:
        # The helper is named check_snps (lower case); the previous
        # check_SNPs spelling raised NameError.
        # Region strings are 1-based and inclusive on both ends, so query
        # bound+1 .. bound+10000. Asking for bound .. bound+10000 counted a
        # SNP sitting exactly on a window boundary in two windows.
        count = len(check_snps(f_name, 'chromosome_1', bound + 1, bound + 10000))
        SNP_count.append(count)
        f.write('{b1}-{b2}: {c}\n'.format(b1=bound, b2=bound+10000, c=count))
        bound += 10000

        # No sequence length or last SNP position is available here, so stop
        # after five consecutive empty windows (heuristic end-of-chromosome).
        if SNP_count[-5:] == [0, 0, 0, 0, 0]:
            break
    
    

In [29]:
# SNP distribution analysis per breakpoint: for every chromosome_1 breakpoint
# whose individual has a known run, write
# '<run> <chromosome> <left> <right> <snp count>' to SNP_breakpoint.txt.
f_name = vcf_files_path + 'parental_filtered.vcf.gz'

with open('SNP_breakpoint.txt', 'w') as f:
    for index, row in df_breakpoints.iterrows():

        # The columns are addressed by position. .iloc makes that explicit;
        # plain row[2] on a label-indexed Series is a deprecated fallback.
        individual = row.iloc[2]
        chromosome = row.iloc[3]
        left_bound = int(row.iloc[4])
        right_bound = int(row.iloc[5])

        # Skip breakpoints whose individual has no usable run id, instead of
        # swallowing every possible error with a bare except.
        runs = df_lookup.loc[df_lookup['individual'] == individual, 'Run']
        if runs.empty or not isinstance(runs.iloc[0], str):
            continue
        run = runs.iloc[0] + '.dup.bam'

        #parental_filtered.vcf only has chromosome 1
        if chromosome != "chromosome_1":
            continue

        # The helper is named check_snps (lower case); the previous
        # check_SNPs spelling raised NameError.
        count = len(check_snps(f_name, chromosome, left_bound, right_bound))
        f.write('{run} {chrom} {l} {r} {c}\n'.format(run = run, chrom=chromosome, l=left_bound, r=right_bound, c=count))
        


In [3]:
def _copy_reads(source, sink, chromosome, left_bound, right_bound, limit):
    '''Write at most `limit` reads overlapping the region from source to sink; return how many were written.'''
    written = 0
    for alignment in source.fetch(chromosome, left_bound, right_bound):
        if written >= limit:
            break
        sink.write(alignment)
        written += 1
    return written


def recomb_mock(recomb_filepath, chromosome, left_bound, right_bound, lines=200, other_files=None):
    '''
    Build a mock alignment file for one breakpoint region: reads from the
    recombinant individual mixed with reads of the same region taken from
    other individuals. With the nine default background files the recombinant
    sample is one of ten equally capped sources.

    The result is written to 'recomb_mock.bam' in the working directory. The
    file is opened with mode 'wh', i.e. it is SAM text with a header despite
    its extension, and reuses the header of the recombinant file.

    recomb_filepath: full filepath to the bam file that the recombination is on
        ex: bam_files_path + 'SRR5243261.dup.bam'
    chromosome: chromosome that the recombination is on
        ex: chromosome_1
    left_bound/right_bound: the bounds of the recombination
        ex: 3440560, 3441577
    lines: maximum number of reads taken from each file; the mock file holds
        at most lines * (no. of other_files + 1) reads
        Note: a file may have fewer reads in the region than requested
    other_files: bam file names (relative to bam_files_path) used as
        background; defaults to a fixed list of nine runs

    Prints the number of reads copied from each source. Returns None.
    '''

    if other_files is None:
        other_files = ['SRR5243250.dup.bam', 'SRR5243251.dup.bam', 'SRR5243252.dup.bam', 'SRR5243253.dup.bam',
             'SRR5243254.dup.bam', 'SRR5243255.dup.bam', 'SRR5243256.dup.bam', 'SRR5243257.dup.bam', 'SRR5243262.dup.bam']

    # Context managers guarantee the output is flushed and closed before the
    # function returns, and that no input handle is left open.
    with pysam.AlignmentFile(recomb_filepath) as recomb_file, \
            pysam.AlignmentFile('recomb_mock.bam', 'wh', template=recomb_file) as f:

        counter = _copy_reads(recomb_file, f, chromosome, left_bound, right_bound, lines)
        print('recomb, ' +  str(counter))

        for file in other_files:
            with pysam.AlignmentFile(bam_files_path + file) as alignment_file:
                counter = _copy_reads(alignment_file, f, chromosome, left_bound, right_bound, lines)

            print(file, counter)

In [326]:
filepath = bam_files_path + 'SRR5243277.dup.bam'

# recomb_mock opens the BAM itself through pysam, so no separate open() is
# needed here (the old text-mode handle was never used or closed).

# Regions tried earlier:
#recomb_mock(filepath, 'chromosome_1', 3440560, 3441577)
#recomb_mock(filepath, 'chromosome_1', 3440400, 3441700)
recomb_mock(filepath, 'chromosome_1', 1488300, 1497200, lines=2000)

recomb, 1449
SRR5243250.dup.bam 1509
SRR5243251.dup.bam 1623
SRR5243252.dup.bam 1328
SRR5243253.dup.bam 1323
SRR5243254.dup.bam 1368
SRR5243255.dup.bam 1810
SRR5243256.dup.bam 1501
SRR5243257.dup.bam 1449
SRR5243262.dup.bam 1643


In [4]:
def recomb_diagnosis(bam_file_obj, bam=False, mode='no_match', output_filename='recomb_diagnosis'):
    '''
    Label every read of an alignment file by the parental allele it carries
    at each known SNP, and write the reads selected by `mode` to a file.

    Only reads overlapping more than one SNP of parental_filtered.vcf.gz are
    considered. At each SNP the read base is compared with the first base of
    the two genotype strings: '1' = first sample (CC2935), '2' = second
    sample (CC2936), 'N' = neither.

    bam_file_obj: iterable of reads (an opened pysam.AlignmentFile); also
        used as the header template when bam=True
    bam: if True, write the selected reads to output_filename + '.sam'
        (SAM text with header); otherwise write a human readable report to
        output_filename + '.txt'
    mode: 'no_match'     - only reads with at least one 'N' SNP
          'phase_change' - only reads that switch between '1' and '2'
          'all' (or any other value) - every read with more than one SNP
    output_filename: output path without extension

    Returns None. Raises Exception when a SNP returned for a read lies
    before the read's first aligned base.

    NOTE(review): the reference is always the first record of the FASTA
    (assumed to be chromosome 1), whatever record.reference_name is.
    NOTE(review): CIGAR operations other than match (0), insertion (1),
    soft clip (4) and hard clip (5) are only reported with a print, and the
    SNP offset correction for insertions counts soft-clipped bases too.
    Reads with deletions or soft clips may be mislabelled; confirm before
    relying on them.
    '''

    if bam:
        f_obj = pysam.AlignmentFile(output_filename + '.sam', 'wh', template=bam_file_obj)
    else:
        f_obj = open(output_filename + '.txt', 'w')

    # try/finally so the output is flushed and closed even when a read raises
    try:
        if not bam:
            f_obj.write('Key: \nSequence: Start - End \nReference sequence \nPhred Scale Quality \nQuery alignment sequence \n')
            f_obj.write('1: CC2935, 2: CC2936, N: Does not match SNP \n\n')

        # reference segment: the first FASTA record is taken as chromosome 1
        chrom_1 = next(iter(SeqIO.parse(reference_files_path + 'chlamy.5.3.w_organelles_mtMinus.fasta', 'fasta')))

        vcf_path = vcf_files_path + 'parental_filtered.vcf.gz'

        for record in bam_file_obj:

            # reference_start is 0-based while the region query is 1-based
            snps = check_snps(vcf_path, record.reference_name,
                              record.reference_start + 1,
                              record.reference_start + record.query_alignment_length + 1)

            # phase can only be judged with at least two SNPs on the read
            if len(snps) <= 1:
                continue

            cigar_tuples = record.cigartuples

            # aligned part of the read, rebuilt from the CIGAR operations
            segment = ''

            # reference sequence under the read
            ref = chrom_1[record.reference_start:record.reference_start + record.query_alignment_length]

            # whether there is an insertion in the cigar string
            insertion_flag = False

            # 0 is match and sequences are length of 150, so it's a full match
            if cigar_tuples == [(0, 150)]:
                segment = record.query_sequence
            else:
                query_segment = record.query_sequence
                # position in query_segment (which includes soft clipped bases)
                index = 0

                for cigar_tuple in cigar_tuples:
                    operation, length = cigar_tuple

                    # 4 = soft clipping: the bases are in query_sequence but
                    # not aligned, so skip over them
                    if operation == 4:
                        index += length

                    # 5 = hard clipping: the bases are not in query_sequence
                    elif operation == 5:
                        continue

                    # 0 = match: copy the bases
                    elif operation == 0:
                        segment += query_segment[index:index + length]
                        index += length

                    # 1 = insertion: copy the bases and gap the reference
                    elif operation == 1:
                        # The gap belongs where the inserted bases start in
                        # the aligned segment. The query index used before
                        # was already past the insertion and also counted
                        # soft clipped bases.
                        gap_at = len(segment)

                        segment += query_segment[index:index + length]
                        index += length

                        ref = ref[:gap_at] + '-' * length + ref[gap_at:]

                        insertion_flag = True

                    else:
                        print('oops forgot to consider this: ' + str(cigar_tuple))
                        print(cigar_tuples)

            snp_lst = [' '] * record.query_alignment_length

            # True once one of the SNPs matches neither parent
            no_match_flag = False

            # True once the read switches from one parent to the other
            phase_change_flag = False
            previous_strand = None

            for snp in snps:
                # snp.start and record.reference_start are both 0 based
                start = snp.start - record.reference_start

                # shift the offset by the insertions that precede the SNP
                if insertion_flag:

                    current_tuple = 0
                    current_base = 0

                    while current_base < start and current_tuple < len(cigar_tuples):
                        if cigar_tuples[current_tuple][0] == 1:
                            start += cigar_tuples[current_tuple][1]

                        current_base += cigar_tuples[current_tuple][1]
                        current_tuple += 1

                # the region query can return a SNP just before the read
                if start < 0:
                    raise Exception('VCF indexing is off. Check SNP at {}'.format(snp))

                strand1 = snp.gt_bases[0][0]
                strand2 = snp.gt_bases[1][0]

                # SNP lies past the aligned bases; nothing further to label
                if start >= len(segment):
                    break

                if segment[start] == strand1:
                    snp_lst[start] = '1'

                    if previous_strand == 'strand2':
                        phase_change_flag = True
                    previous_strand = 'strand1'

                elif segment[start] == strand2:
                    snp_lst[start] = '2'

                    if previous_strand == 'strand1':
                        phase_change_flag = True
                    previous_strand = 'strand2'

                else:
                    snp_lst[start] = 'N'
                    no_match_flag = True

            # we only want no_match and we didn't find a no_match
            if mode == 'no_match' and not no_match_flag:
                continue

            elif mode == 'phase_change' and not phase_change_flag:
                continue

            elif bam:
                f_obj.write(record)

            else:
                snp_str = ''.join(snp_lst)

                # Phred+33 encoding; only needed for the text report
                qualities_str = ''.join(chr(quality + 33) for quality in record.query_alignment_qualities)

                f_obj.write('Sequence: {start} - {end} \n'.format(start=record.reference_start,
                                                               end=record.reference_start+record.query_alignment_length))

                f_obj.write(str(cigar_tuples) + '\n')

                f_obj.write(str(ref.seq) + '\n' + qualities_str + '\n' + segment + '\n' + snp_str + '\n \n')
    finally:
        f_obj.close()


In [5]:
# 'recomb_mock.bam' is written by recomb_mock with mode 'wh' (SAM text with a
# header), hence mode 'r'. The with-block closes the input once the
# diagnosis has consumed it.
with pysam.AlignmentFile('recomb_mock.bam', 'r') as bam_file_obj:
    recomb_diagnosis(bam_file_obj, bam=False, mode='phase_change')

