### Algorithm 1: Locate-Termination

In [1]:
from scipy.stats import poisson
import pysam
import pandas as pd
import numpy as np

In [2]:
# Poisson inverse cumulative distribution function
poisson.ppf(0.01, 20)

10.0

In [15]:
# Load the BED file
gene_names = []
gene_loc = []
for line in open( "mm10_final.bed" ):
    fields = line.split( "\t" )
    chrom = fields[0]
    loc1 = fields[1]
    loc2 = fields[2]
    strand = fields[4][1:-1]
    loc = [chrom, loc1, loc2, strand]
    gene_loc.append(loc)
    gene_names.append(fields[3])

In [12]:
samfile = pysam.AlignmentFile("end_P2_E01_529_mono_01.sorted.bam", "rb")

In [16]:
gene_loc[0:20]

[['1 ', ' 3214481 ', ' 3671498 ', '-'],
 ['1 ', ' 3648310 ', ' 3658904 ', '-'],
 ['1 ', ' 4343506 ', ' 4360314 ', '-'],
 ['1 ', ' 4490927 ', ' 4497354 ', '-'],
 ['1 ', ' 4773199 ', ' 4785726 ', '-'],
 ['1 ', ' 4807892 ', ' 4846735 ', '+'],
 ['1 ', ' 4857693 ', ' 4897909 ', '+'],
 ['1 ', ' 4909575 ', ' 5070285 ', '-'],
 ['1 ', ' 5083085 ', ' 5162549 ', '+'],
 ['1 ', ' 5588492 ', ' 5606133 ', '+'],
 ['1 ', ' 5913706 ', ' 5917398 ', '-'],
 ['1 ', ' 6209865 ', ' 6215293 ', '-'],
 ['1 ', ' 6214661 ', ' 6276104 ', '+'],
 ['1 ', ' 6359330 ', ' 6394731 ', '+'],
 ['1 ', ' 6730050 ', ' 6860940 ', '+'],
 ['1 ', ' 6692281 ', ' 6692305 ', '+'],
 ['1 ', ' 7088919 ', ' 7173628 ', '+'],
 ['1 ', ' 7349405 ', ' 7397869 ', '-'],
 ['1 ', ' 8028518 ', ' 8028555 ', '-'],
 ['1 ', ' 8179496 ', ' 8179520 ', '-']]

In [17]:
# for read in samfile.fetch('1', 30000000, 70000000):
#      print(read)

# samfile.count('1', start=30000000, stop=70000000)
hed = samfile.head(1)
# print(hed.next_reference_start())
for h in hed:
    print(h.get_reference_positions())

[4845673, 4845674, 4845675, 4845676, 4845677, 4845678, 4845679, 4845680, 4845681, 4845682, 4845683, 4845684, 4845685, 4845686, 4845687, 4845688, 4845689, 4845690, 4845691, 4845692, 4845693, 4845694, 4845695, 4845696, 4845697, 4845698, 4845699, 4845700, 4845701, 4845702, 4845703, 4845704, 4845705, 4845706, 4845707, 4845708, 4845709, 4845710, 4845711, 4845712, 4845713, 4845714, 4845715, 4845716, 4845717, 4845718, 4845719, 4845720, 4845721, 4845722, 4845723, 4845724, 4845725, 4845726, 4845727, 4845728, 4845729, 4845730, 4845731, 4845732, 4845733, 4845734, 4845735, 4845736, 4845737, 4845738, 4845739, 4845740, 4845741, 4845742, 4845743, 4845744, 4845745, 4845746, 4845747]


In [64]:
# Read count of a transcript (1st)
first_loc = 0
hed = samfile.head(1)
for h in hed:
    first_loc = h.get_reference_positions()[0]
first_loc

4845673

In [19]:
# Read count of any transcript (having chr and start transcript)
chrom = '1'
samfile.count(chrom, start=first_loc, end=first_loc+1000)

43

In [13]:
# Example of algorithm over one transcript (calculating window counts)
first_loc = 4845673
# first_loc = 5149991
chrom = '1'
l_g = 1000
N = samfile.count(chrom, start=first_loc, end=first_loc+l_g)
windows = []
threshold = 0.01
s_t = 75
s_w = 100
l = 0
n_w = (l_g - s_w)/s_t + 1
N_w = N * s_w / l_g 
m = poisson.ppf(threshold, N_w)
start_wi = first_loc
for i in range(int(n_w)):
    count = samfile.count(chrom, start=start_wi, end=start_wi+s_w)
    windows.append(count)
    start_wi += s_t
windows

[11, 13, 9, 3, 3, 3, 5, 6, 8, 12, 9, 10, 5]

In [29]:
N_w

4.3

In [30]:
poisson.ppf(0.01, 4.3)

0.0

In [24]:
# Example of algorithm output over one transcript (with introns)
first_loc = 5149991
chrom = '1'
strand = '+'
N = samfile.count(chrom, start=first_loc, end=first_loc+l_g)
threshold = 0.01
s_t = 75
l_g = 1000
s_w = 100
l = 0
if strand == '+':
    l = l_g
n_w = (l_g - s_w)/s_t + 1
N_w = N * s_w / l_g 
m = poisson.ppf(threshold, N_w)
start_wi = 0

# Count reads for each window and check quantile condition
for i in range(int(n_w)):
    start_loc = start_wi + first_loc
    count = samfile.count(chrom, start=start_loc, end=start_loc+s_w)
    if count <= m:
        l = start_wi
    start_wi += s_t
l

1000

In [82]:
# Finding introns
samfile.find_introns((read for read in samfile.fetch(chrom, start=3000000, end=5500000)))

Counter({(5150061, 5162104): 5})

### New Approach
Union all exons start window from end cut at 1000 bp from end

In [2]:
utr = pd.read_csv('mm10_3utr.bed', delimiter='\t')
bed = pd.read_csv('mm10_final.bed', delimiter='\t', converters={'CHROM': str.strip, 'NAME': str.strip, 'STRAND': str.strip})

In [3]:
bed

Unnamed: 0,CHROM,LOC1,LOC2,NAME,STRAND
0,1,3214481,3671498,Xkr4,-
1,1,3648310,3658904,AK149000,-
2,1,4343506,4360314,Rp1,-
3,1,4490927,4497354,Sox17,-
4,1,4773199,4785726,Mrpl15,-
5,1,4807892,4846735,Lypla1,+
6,1,4857693,4897909,Tcea1,+
7,1,4909575,5070285,Rgs20,-
8,1,5083085,5162549,Atp6v1h,+
9,1,5588492,5606133,Oprk1,+


In [11]:
samfile = pysam.AlignmentFile("P2_E01_529_mono_01.sorted.bam", "rb")
chrom = '10'
# next_loc = 4845673
# next_loc = 5149991
# next_loc = 80796115
threshold = 0.01
s_t = 75
s_w = 100
l_g = 1000
windows = []

In [21]:
# First find the gene associated to the beginning of the new transcript, then select all exons for that gene
loc = 16688488
gene = bed[(bed['CHROM'] == chrom) & (bed['LOC1'] <= loc) & (bed['LOC2'] >= loc)]
gene_name = gene['NAME'].iloc[0]
gene_exons = utr[utr['EXON'].str.contains(gene_name)]

In [22]:
gene_exons

Unnamed: 0,CHROM,LOC1,LOC2,EXON,STRAND
172,1,16687702,16687702,AK040785_uc056ych.1,-


In [51]:
strand = gene_exons['STRAND'].iloc[0]
strand

'+'

In [11]:
# Case strand == '+', window list
l = 0
first_loc = 164080785
chrom = '1'
strand = '+'
end = gene_exons['LOC2'].iloc[-1]
start = end - 1000
n_w = (l_g - s_w)/s_t + 1
N_w = N * s_w / l_g 
m = poisson.ppf(threshold, N_w)
start_wi = end
for i in range(int(n_w)):
    count = samfile.count('1', start=start_wi-s_w, end=start_wi)
    windows.append(count)
    start_wi -= s_t
windows

NameError: name 'l_g' is not defined

In [25]:
# Case strand == '+'
l = 0
end = gene_exons['LOC2'].iloc[-1]
start = end - 1000
N = samfile.count(chrom, start=start, end=end)
n_w = (l_g - s_w)/s_t + 1
N_w = N * s_w / l_g 
m = poisson.ppf(threshold, N_w)
loc_wi = end
start_wi = 0
prev_mid = l + (s_w/2)
for i in range(int(n_w)):
    count = samfile.count('1', start=loc_wi-s_w, end=loc_wi)
    print('Count: {}'.format(count))
    if count <= m:
        prev_mid = start_wi + (s_w/2)
        start_wi += s_t
        print('Moving start to: {}'.format(start_wi))
    else:
        new_mid = start_wi + (s_w/2)
        l = int(np.ceil((prev_mid + new_mid) / 2))
        print('Prev: {}, New: {}, l = {}'.format(prev_mid, new_mid, l))
        break
    loc_wi -= s_t
l

Count: 0
Moving start to: 75
Count: 0
Moving start to: 150
Count: 0
Moving start to: 225
Count: 0
Moving start to: 300
Count: 0
Moving start to: 375
Count: 0
Moving start to: 450
Count: 0
Moving start to: 525
Count: 0
Moving start to: 600
Count: 0
Moving start to: 675
Count: 0
Moving start to: 750
Count: 0
Moving start to: 825
Count: 0
Moving start to: 900
Count: 0
Moving start to: 975


0

In [2]:
def locate_termination(samfile, chrom, strand, gene_exons):
    
    # Initialize params
    threshold = 0.01
    s_t = 75
    s_w = 100
    l_g = 1000
    n_w = (l_g - s_w)/s_t + 1
    
    if strand == '-':
        start = gene_exons['LOC1'].iloc[0]
        end = start + 1000
        N = samfile.count(chrom, start=start, end=end)
        N_w = N * s_w / l_g 
        m = poisson.ppf(threshold, N_w)

    else:
        end = gene_exons['LOC2'].iloc[-1]
        start = end - 1000
        N = samfile.count(chrom, start=start, end=end)
        N_w = N * s_w / l_g 
        m = poisson.ppf(threshold, N_w)

    return slide_window(samfile, chrom, start, end, strand, n_w, m, s_w, s_t)

In [9]:
def slide_window(samfile, chrom, start, end, strand, n_w, m, s_w, s_t):
    
    # Init params
    l = 0
    start_wi = 0
    
    # Move sliding window according to strand
    if strand == '-':
        loc_wi = start
        prev_mid = l + (s_w/2)
        
        for i in range(int(n_w)):
            count = samfile.count(chrom, start=loc_wi, end=loc_wi+s_w)
            if count <= m:
                prev_mid = start_wi + (s_w/2)
                start_wi += s_t
            else:
                new_mid = start_wi + (s_w/2)
                l = int(np.ceil((prev_mid + new_mid) / 2))
                return l
            loc_wi += s_t
    else:
        loc_wi = end
        prev_mid = l + (s_w/2)
        
        for i in range(int(n_w)):
            count = samfile.count(chrom, start=loc_wi-s_w, end=loc_wi)
            if count <= m:
                prev_mid = start_wi + (s_w/2)
                start_wi += s_t
            else:
                new_mid = start_wi + (s_w/2)
                l = int(np.ceil((prev_mid + new_mid) / 2))
                return l
            loc_wi -= s_t
            
    return l

In [4]:
def get_gene_info(bed, utr, chrom, next_loc):
    try:
        gene = bed[(bed['CHROM'] == chrom) & (bed['LOC1'] <= next_loc) & (bed['LOC2'] >= next_loc)]
        gene_name = gene['NAME'].iloc[0]
        return gene_name, utr[utr['EXON'].str.contains(gene_name)]
    except IndexError:
        return None, None

In [5]:
def get_gene_end(bed, loc):
    gene = bed[(bed['CHROM'] == chrom) & (bed['LOC1'] <= loc) & (bed['LOC2'] >= loc)]
    return gene['LOC2'].iloc[0]

In [32]:
samfile = pysam.AlignmentFile("P2_E01_529_mono_01.sorted.bam", "rb")
# next_loc = 4845673
# next_loc = 5149991
# next_loc = 80796115
next_locs = [4845673, 5149991]
chrom = '1'
strand = '+'
for next_loc in next_locs:
    gene_exons = get_gene_exons(bed, utr, chrom, next_loc)
    print(locate_termination(samfile, chrom, strand, gene_exons))

50
88


In [45]:
cs = []
for i in range(1,20):
    cs.append(str(i))

In [46]:
cs

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19']

In [20]:
%%time

# Load samfile
samfile = pysam.AlignmentFile("P2_E01_529_mono_01.sorted.bam", "rb")

# Load mm10 bed files
utr = pd.read_csv('mm10_3utr.bed', delimiter='\t')
bed = pd.read_csv('mm10_final.bed', delimiter='\t', converters={'CHROM': str.strip, 'NAME': str.strip, 'STRAND': str.strip})

chromosomes = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17', '18', '19', 'X', 'Y']
# chromosomes = ['1']
border = 0
term_bed = bed
term_bed['TERM'] = 0
for chrom in chromosomes:
    print('CHROM: {}'.format(chrom))
    for read in samfile.fetch(chrom):
        start_position = read.get_reference_positions()[0]
        if start_position <= border: # Skip read if in region already accounted for
            continue
        gene_name, gene_exons = get_gene_info(bed, utr, chrom, start_position)
        if gene_exons is not None:
            print('Working start: {} passed'.format(start_position))
            strand = gene_exons['STRAND'].iloc[0]
            term_location = locate_termination(samfile, chrom, strand, gene_exons)
            term_bed.loc[term_bed['NAME'] == gene_name, 'TERM'] = term_location
            border = get_gene_end(bed, start_position)
            print('Strand: {} Loc: {} New Border: {}'.format(strand, term_location, border))
        else:
            continue

Working start: 4807916 passed
Strand: + Loc: 50 New Border: 4846735
Working start: 5143770 passed
Strand: + Loc: 88 New Border: 5162549
Working start: 7136376 passed
Strand: + Loc: 0 New Border: 7173628
Working start: 8639087 passed
Strand: - Loc: 0 New Border: 9299877
Working start: 9745980 passed
Strand: - Loc: 0 New Border: 9748382
Working start: 9777603 passed
Strand: + Loc: 0 New Border: 9791922
Working start: 9798203 passed
Strand: + Loc: 0 New Border: 9902568
Working start: 9996270 passed
Strand: - Loc: 0 New Border: 10009136
Working start: 10024944 passed
Strand: - Loc: 313 New Border: 10038159
Working start: 11535683 passed
Strand: + Loc: 0 New Border: 11975902
Working start: 13279086 passed
Strand: - Loc: 0 New Border: 13374083
Working start: 13564710 passed
Strand: - Loc: 50 New Border: 13589910
Working start: 13641492 passed
Strand: - Loc: 0 New Border: 13660509
Working start: 16101301 passed
Strand: - Loc: 50 New Border: 16104433
Working start: 16570779 passed
Strand: - Lo

Working start: 99790542 passed
Strand: + Loc: 0 New Border: 100485942
Working start: 105606644 passed
Strand: - Loc: 0 New Border: 105663676
Working start: 105701412 passed
Strand: + Loc: 313 New Border: 105755131
Working start: 105991591 passed
Strand: + Loc: 0 New Border: 106034079
Working start: 106660963 passed
Strand: - Loc: 0 New Border: 106714290
Working start: 106733959 passed
Strand: - Loc: 0 New Border: 106759742
Working start: 106782390 passed
Strand: - Loc: 0 New Border: 106796725
Working start: 107529225 passed
Strand: + Loc: 0 New Border: 107549271
Working start: 118300706 passed
Strand: - Loc: 0 New Border: 118311132
Working start: 118321871 passed
Strand: + Loc: 238 New Border: 118333831
Working start: 119470377 passed
Strand: - Loc: 88 New Border: 119504782
Working start: 120113297 passed
Strand: - Loc: 50 New Border: 120120919
Working start: 120227791 passed
Strand: - Loc: 50 New Border: 120265280
Working start: 125392922 passed
Strand: - Loc: 88 New Border: 125435727

Working start: 173857459 passed
Strand: - Loc: 50 New Border: 173880187
Working start: 173931077 passed
Strand: - Loc: 50 New Border: 173942492
Working start: 174014999 passed
Strand: - Loc: 50 New Border: 174031755
Working start: 175978298 passed
Strand: - Loc: 0 New Border: 176275312
Working start: 176756138 passed
Strand: - Loc: 0 New Border: 176807124
Working start: 176814893 passed
Strand: + Loc: 0 New Border: 177020432
Working start: 177025198 passed
Strand: - Loc: 0 New Border: 177248767
Working start: 178319152 passed
Strand: + Loc: 50 New Border: 178322693
Working start: 178329116 passed
Strand: - Loc: 50 New Border: 178337784
Working start: 179549391 passed
Strand: + Loc: 0 New Border: 179627473
Working start: 180729774 passed
Strand: + Loc: 0 New Border: 180754204
Working start: 180802568 passed
Strand: - Loc: 50 New Border: 180813605
Working start: 180851161 passed
Strand: + Loc: 50 New Border: 180868114
Working start: 180904322 passed
Strand: + Loc: 50 New Border: 18090808

In [19]:
%%time
term_bed[term_bed['CHROM'] == '1']

CPU times: user 8.29 ms, sys: 1 µs, total: 8.29 ms
Wall time: 7.49 ms


Unnamed: 0,CHROM,LOC1,LOC2,NAME,STRAND,TERM
0,1,3214481,3671498,Xkr4,-,0
1,1,3648310,3658904,AK149000,-,0
2,1,4343506,4360314,Rp1,-,0
3,1,4490927,4497354,Sox17,-,0
4,1,4773199,4785726,Mrpl15,-,0
5,1,4807892,4846735,Lypla1,+,50
6,1,4857693,4897909,Tcea1,+,0
7,1,4909575,5070285,Rgs20,-,0
8,1,5083085,5162549,Atp6v1h,+,88
9,1,5588492,5606133,Oprk1,+,0


Unnamed: 0,CHROM,LOC1,LOC2,EXON,STRAND
2704,1,164079696,164080785,Sell_uc007dhy.2,+
2705,1,164079696,164080785,Sell_uc011wuv.1,+
2706,1,164073211,164080785,Sell_uc011wuw.1,+


In [62]:
term_bed[(term_bed['CHROM'] == '1') & (term_bed['NAME'] == 'Sell')]

Unnamed: 0,CHROM,LOC1,LOC2,NAME,STRAND,TERM
1289,1,164062075,164080785,Sell,+,163


In [49]:
gene_name, gene_exons = get_gene_info(bed, utr, chrom, start_position)

In [44]:
bed['TERM'] = 0
t = bed[bed['NAME'] == 'Xkr4'].iloc[0]
bed.loc[bed['NAME'] == 'Xkr4', 'TERM'] = 1
bed

Unnamed: 0,CHROM,LOC1,LOC2,NAME,STRAND,TERM
0,1,3214481,3671498,Xkr4,-,1
1,1,3648310,3658904,AK149000,-,0
2,1,4343506,4360314,Rp1,-,0
3,1,4490927,4497354,Sox17,-,0
4,1,4773199,4785726,Mrpl15,-,0
5,1,4807892,4846735,Lypla1,+,0
6,1,4857693,4897909,Tcea1,+,0
7,1,4909575,5070285,Rgs20,-,0
8,1,5083085,5162549,Atp6v1h,+,0
9,1,5588492,5606133,Oprk1,+,0


In [None]:
df.loc[df[<some_column_name>] == <condition>, <another_column_name>] = <value_to_add>

to deal with introns: 

while length of transcript is < 1000: join exons and use to count

job tomorrow: keep debugging algorithm and deal with introns