In [4]:
import pandas as pd
import numpy as np
import pysam   
import collections
from cyvcf2 import VCF

df_breakpoints = pd.read_csv('../../../data/references/liu-data-reference/all_break_points.txt', sep='\t')
df_lookup = pd.read_csv('../../../data/references/liu-data-reference/liu_sample_lookup.tsv', sep='\t')
bam_files_path = '../../../data/liu-data/bam/'
vcf_files_path = '../../../data/liu-data/vcf/'

In [25]:
'''
Grab reads from all_break_points.txt, look up their corresponding bam file in liu_sample_lookup.tsv
and write the reads into a text file
'''

with open('breakpoint_bams.txt', 'w') as f:
    for index, row in df_breakpoints.iterrows():
        
        individual = row[2]
        chromosome = row[3]
        left_bound = int(row[4])
        right_bound = int(row[5])
        
        try:
            run = df_lookup[df_lookup['individual'] == individual].iloc[0]['Run'] + '.dup.bam'
        except:
            continue

        alignment_file = pysam.AlignmentFile(bam_files_path + run, 'rb')
        for alignment in alignment_file.fetch(chromosome, left_bound, right_bound):
            f.write(str(alignment)+'\n')

In [84]:
'''
Using cyvcf, go through each of the breakpoints and check the number of SNPs per segment.
Return the SNP locations in a list.
'''

def check_SNPs(f_name, chromosome, left_bound, right_bound):
    vcf_in = VCF(f_name)
    region = '{c}:{l}-{r}'.format(c=chromosome, l=left_bound, r=right_bound)
    records = [rec for rec in vcf_in(region)]
    return records


In [24]:
'''
SNP distribution analysis per 10k bases
'''

bound = 0
SNP_count = []
f_name = vcf_files_path + 'parental_filtered.vcf.gz'

with open('SNP_10k.txt', 'w') as f:
    while True:
        count = len(check_SNPs(f_name, 'chromosome_1', bound, bound + 10000))
        SNP_count.append(count)
        f.write('{b1}-{b2}: {c}\n'.format(b1=bound, b2=bound+10000, c=count))
        bound += 10000
        
        #didn't find a way to get the length of sequence or last SNP position, so this is my hack
        if SNP_count[-5:] == [0, 0, 0, 0, 0]:
            break
    
    

In [29]:
'''
SNP distribution analysis per breakpoint
'''
f_name = vcf_files_path + 'parental_filtered.vcf.gz'

with open('SNP_breakpoint.txt', 'w') as f:
    for index, row in df_breakpoints.iterrows():
        
        individual = row[2]
        chromosome = row[3]
        left_bound = int(row[4])
        right_bound = int(row[5])
        
        try:
            run = df_lookup[df_lookup['individual'] == individual].iloc[0]['Run'] + '.dup.bam'
        except:
            continue
        
        #parental_filtered.vcf only has chromosome 1
        if chromosome != "chromosome_1":
            continue
        
        count = len(check_SNPs(f_name, chromosome, left_bound, right_bound))
        f.write('{run} {chrom} {l} {r} {c}\n'.format(run = run, chrom=chromosome, l=left_bound, r=right_bound, c=count))
        


In [151]:
'''
Create bam file using first region of all_break_points.txt. 10% of reads should be of that section and the
rest should be from other individuals without the recombination there

recomb_filepath: full filepath to the bam file that the recombination is on
    ex: bam_file_path + 'SRR5243261.dup.bam'
chromsome: chromosome that the recombination is on
    ex: chromsome_1
left_bound/right_bound: the bounds of the recombination
    ex: 3440560, 3441577
lines: lines from each files, total number of lines in mock file is lines * (no. of other_files + 1)
    Note: there may not be enough reads for your given amount of lines
other_files: default is set in function, can also be user defined

'''

def recomb_mock(recomb_filepath, chromosome, left_bound, right_bound, lines=200, other_files=None):
    
    if other_files == None:
        other_files = ['SRR5243250.dup.bam', 'SRR5243251.dup.bam', 'SRR5243252.dup.bam', 'SRR5243253.dup.bam', 
             'SRR5243254.dup.bam', 'SRR5243255.dup.bam', 'SRR5243256.dup.bam', 'SRR5243257.dup.bam', 'SRR5243262.dup.bam']
    
    recomb_file = pysam.AlignmentFile(recomb_filepath)

    f = pysam.AlignmentFile('recomb_mock.bam', 'wh', template=recomb_file)

    counter = 0

    for alignment in recomb_file.fetch(chromosome, left_bound, right_bound):
        if counter >= lines:
            break
        f.write(alignment)
        counter += 1
        
    print('recomb, ' +  str(counter))
    
    for file in other_files:
        alignment_file = pysam.AlignmentFile(bam_files_path + file)

        counter = 0

        for alignment in alignment_file.fetch(chromosome, left_bound, right_bound):

            # write 200 lines from each file
            if counter >= lines:
                break

            f.write(alignment)
            counter += 1
        
        print(file, counter)

In [172]:
filepath = bam_files_path + 'SRR5243261.dup.bam'

#recomb_mock(filepath, 'chromosome_1', 3440560, 3441577)
recomb_mock(filepath, 'chromosome_1', 3440400, 3441700)

recomb, 84
SRR5243250.dup.bam 46
SRR5243251.dup.bam 98
SRR5243252.dup.bam 70
SRR5243253.dup.bam 68
SRR5243254.dup.bam 63
SRR5243255.dup.bam 71
SRR5243256.dup.bam 29
SRR5243257.dup.bam 93
SRR5243262.dup.bam 88


In [170]:
'''
Given a bam file object, return the percentage of recombination in the bam files
'''

def recomb_percentage(bam_file_obj):
    
    recomb = 0
    no_match = 0
    total = 0
    SNP_count = 0
    skip_counter = 0
    
    for record in bam_file_obj:
        
        total += 1
        if total % 100 == 0:
            print('recomb: {r}, no match: {n}, sequences: {t}, SNPs: {s}'.format(r=recomb, n=no_match, 
                                                                                 t=total, s=SNP_count))
    
        SNPs = check_SNPs(vcf_files_path + 'parental_filtered.vcf.gz', record.reference_name, 
                          record.reference_start,
                          record.reference_start + record.query_alignment_length)
        
        # double list comp to see if the first of all the cigartuples are 0 or 8
        # 0 is match, 8 is point mismatch
        if not all([n in [0, 8] for n in [t[0] for t in record.cigartuples]]):
            skip_counter += 1
            continue
        
        if len(SNPs) > 1:
            previous = 'not set'
            segment = record.query_sequence
            
            for SNP in SNPs:
                start = SNP.start - record.reference_start
                SNP_count += 1
                
                strand1 = SNP.gt_bases[0][0]
                strand2 = SNP.gt_bases[1][0]
                
                if segment[start] == strand1:
                    if previous == 'not set':
                        previous = 'sample1'
                    elif previous == 'sample2':
                        recomb += 1
                        break                    
                elif segment[start] == strand2:
                    if previous == 'not set':
                        previous = 'sample2'
                    elif previous == 'sample1':
                        recomb += 1
                        break       
                else:
                    print(record.query_alignment_qualities[start])
                    no_match += 1
                
    print(skip_counter)
    print('recomb: {r}, no match: {n}, sequences: {t}, SNPs: {s}'.format(r=recomb, n=no_match, 
                                                                                 t=total, s=SNP_count))
    return (recomb/total, no_match/total)

In [171]:
bam_file_obj = pysam.AlignmentFile('recomb_mock.bam', 'r')

recomb_percentage(bam_file_obj)



32
recomb: 0, no match: 1, sequences: 100, SNPs: 131
32
8
recomb: 0, no match: 3, sequences: 200, SNPs: 391
41
recomb: 0, no match: 4, sequences: 300, SNPs: 733
32
recomb: 0, no match: 5, sequences: 400, SNPs: 877
113
recomb: 0, no match: 5, sequences: 458, SNPs: 1053


(0.0, 0.010917030567685589)