In [1]:
from pairtools._parse import ends_do_overlap, pairs_do_overlap, rescue_complex_walk

In [2]:
def report_simple_pairsam(algn1, algn2, add_columns=('pos5', 'pos3')):
    """Format a pair of alignments as one space-separated, pairsam-like line.

    Columns, in order: a '.' placeholder in the read ID position, chrom1,
    pos1, chrom2, pos2, strand1, strand2, the pair type (the 'type' values
    of the two alignments concatenated) and then, for every name in
    `add_columns`, that field of `algn1` followed by that field of `algn2`.

    Parameters
    ----------
    algn1, algn2 : dict
        Alignments; must contain the keys 'chrom', 'pos', 'strand', 'type'.
    add_columns : iterable of str, optional
        Extra per-alignment fields to append. A field that is missing from
        an alignment is reported as an empty string.

    Returns
    -------
    str
        The columns joined by single spaces.
    """
    # The default for `add_columns` is a tuple (not a list) so that the
    # shared default object can never be mutated between calls.
    cols = [
        '.',
        algn1['chrom'],
        str(algn1['pos']),
        algn2['chrom'],
        str(algn2['pos']),
        algn1['strand'],
        algn2['strand'],
        algn1['type'] + algn2['type'],
    ]

    for col in add_columns:
        cols.extend((str(algn1.get(col, '')), str(algn2.get(col, ''))))

    return ' '.join(cols)

In [3]:
# Settings shared by all test cases below: both values are passed to
# ends_do_overlap and rescue_complex_walk; allowed_offset also goes to
# pairs_do_overlap.
max_molecule_size, allowed_offset = 500, 0

### Test case 1

<img src="TestCase1.png" width="70%" />

In [4]:
# First read: three '+' strand alignments on chr1, chr2 and chr3.
algns1 = [
    dict(chrom='chr1', pos=100, pos5=100, pos3=150, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=200, pos3=250, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=300, pos3=350, strand='+', is_mapped=True, is_unique=True),
]
# Second read: two '-' strand alignments on chr3 and chr2.
algns2 = [
    dict(chrom='chr3', pos=300, pos5=400, pos3=300, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=250, pos3=200, strand='-', is_mapped=True, is_unique=True),
]

In [5]:
# Last alignment of each read (algns1: chr3 '+', algns2: chr2 '-'): the
# expected result is 0 (presumably "no overlap" -- confirm in pairtools._parse).
assert ends_do_overlap(algns1[-1], algns2[-1], max_molecule_size, allowed_offset)==0

In [6]:
# Second-to-last alignment of algns1 (chr2 '+', 200..250) against the last
# alignment of algns2 (chr2 '-', 250..200): the expected result is 1.
assert ends_do_overlap(algns1[-2], algns2[-1], max_molecule_size, allowed_offset)==1

In [7]:
# Last two alignments of algns1 (chr2, chr3) against the two alignments of
# algns2 (chr3, chr2): the expected result is 1.
assert pairs_do_overlap((algns1[-2], algns1[-1]), (algns2[-2], algns2[-1]), allowed_offset)==1

In [8]:
# Reporting format (space-separated columns):
# readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type pos51 pos52 pos31 pos32
# Each yielded item has four values; the last two are bound to throwaway
# names so that the input lists `algns1`/`algns2` are not overwritten and
# the cell gives the same result when it is re-run.
for algn1, algn2, _rest1, _rest2 in rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset):
    print(report_simple_pairsam(algn1, algn2))

. chr2 200 chr3 300 + + JJ 200 300 250 350
. chr1 100 chr2 200 + + JJ 100 200 150 250


### Test case 1 inverted

Let's swap the forward and reverse reads.

In [9]:
# Test case 1 with the two reads exchanged: the three '+' strand alignments
# now form the second read ...
algns2 = [
    dict(chrom='chr1', pos=100, pos5=100, pos3=150, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=200, pos3=250, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=300, pos3=350, strand='+', is_mapped=True, is_unique=True),
]
# ... and the two '-' strand alignments form the first read.
algns1 = [
    dict(chrom='chr3', pos=300, pos5=400, pos3=300, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=250, pos3=200, strand='-', is_mapped=True, is_unique=True),
]

In [10]:
# Last alignment of each read (algns1: chr2 '-', algns2: chr3 '+'): the
# expected result is 0 (presumably "no overlap" -- confirm in pairtools._parse).
assert ends_do_overlap(algns1[-1], algns2[-1], max_molecule_size, allowed_offset)==0

In [11]:
# Second-to-last alignment of algns1 (chr3 '-', 400..300) against the last
# alignment of algns2 (chr3 '+', 300..350): the expected result is 1.
assert ends_do_overlap(algns1[-2], algns2[-1], max_molecule_size, allowed_offset)==1

In [12]:
# The two alignments of algns1 (chr3, chr2) against the last two alignments
# of algns2 (chr2, chr3): the expected result is 1.
assert pairs_do_overlap((algns1[-2], algns1[-1]), (algns2[-2], algns2[-1]), allowed_offset)==1

In [13]:
# Reporting format (space-separated columns):
# readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type pos51 pos52 pos31 pos32
# Each yielded item has four values; the last two are bound to throwaway
# names so that the input lists `algns1`/`algns2` are not overwritten and
# the cell gives the same result when it is re-run.
for algn1, algn2, _rest1, _rest2 in rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset):
    print(report_simple_pairsam(algn1, algn2))

. chr3 300 chr2 200 - - JJ 400 250 300 200
. chr1 100 chr2 200 + + JJ 100 200 150 250


### Test case 2

<img src="TestCase2.png" width="70%" />

In [14]:
# First read: four '+' strand alignments on chr1..chr4.
algns1 = [
    dict(chrom='chr1', pos=100, pos5=100, pos3=150, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=200, pos3=250, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=300, pos3=350, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr4', pos=400, pos5=400, pos3=450, strand='+', is_mapped=True, is_unique=True),
]
# Second read: three '-' strand alignments on chr4, chr3 and chr2.
algns2 = [
    dict(chrom='chr4', pos=400, pos5=500, pos3=400, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=350, pos3=300, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=250, pos3=200, strand='-', is_mapped=True, is_unique=True),
]

In [15]:
# Compare the last alignment of algns2 (chr2 '-') with the alignments of
# algns1, walking back from its end: chr4 and chr3 are expected to give 0,
# chr2 is expected to give 1.
assert ends_do_overlap(algns1[-1], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-2], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-3], algns2[-1], max_molecule_size, allowed_offset)==1

In [16]:
# Last pair of algns2 (chr3, chr2) against consecutive pairs of algns1:
# (chr3, chr4) is expected to give 0, (chr2, chr3) is expected to give 1.
assert pairs_do_overlap((algns1[-2], algns1[-1]), (algns2[-2], algns2[-1]), allowed_offset)==0
assert pairs_do_overlap((algns1[-3], algns1[-2]), (algns2[-2], algns2[-1]), allowed_offset)==1

In [17]:
# Reporting format (space-separated columns):
# readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type pos51 pos52 pos31 pos32
# Each yielded item has four values; the last two are bound to throwaway
# names so that the input lists `algns1`/`algns2` are not overwritten and
# the cell gives the same result when it is re-run.
for algn1, algn2, _rest1, _rest2 in rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset):
    print(report_simple_pairsam(algn1, algn2))

. chr3 300 chr4 400 + + JJ 300 400 350 450
. chr2 200 chr3 300 + + JJ 200 300 250 350
. chr1 100 chr2 200 + + JJ 100 200 150 250


### Test case 2 inverted

Let's swap the forward and reverse reads.

In [18]:
# Test case 2 with the two reads exchanged: the four '+' strand alignments
# now form the second read ...
algns2 = [
    dict(chrom='chr1', pos=100, pos5=100, pos3=150, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=200, pos3=250, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=300, pos3=350, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr4', pos=400, pos5=400, pos3=450, strand='+', is_mapped=True, is_unique=True),
]
# ... and the three '-' strand alignments form the first read.
algns1 = [
    dict(chrom='chr4', pos=400, pos5=500, pos3=400, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=350, pos3=300, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=250, pos3=200, strand='-', is_mapped=True, is_unique=True),
]

In [19]:
# Compare the last alignment of algns2 (chr4 '+') with the alignments of
# algns1, walking back from its end: chr2 and chr3 are expected to give 0,
# chr4 is expected to give 1.
assert ends_do_overlap(algns1[-1], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-2], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-3], algns2[-1], max_molecule_size, allowed_offset)==1

In [20]:
# Last pair of algns2 (chr3, chr4) against consecutive pairs of algns1:
# (chr3, chr2) is expected to give 0, (chr4, chr3) is expected to give 1.
assert pairs_do_overlap((algns1[-2], algns1[-1]), (algns2[-2], algns2[-1]), allowed_offset)==0
assert pairs_do_overlap((algns1[-3], algns1[-2]), (algns2[-2], algns2[-1]), allowed_offset)==1

In [21]:
# Reporting format (space-separated columns):
# readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type pos51 pos52 pos31 pos32
# Each yielded item has four values; the last two are bound to throwaway
# names so that the input lists `algns1`/`algns2` are not overwritten and
# the cell gives the same result when it is re-run.
for algn1, algn2, _rest1, _rest2 in rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset):
    print(report_simple_pairsam(algn1, algn2))

. chr3 300 chr2 200 - - JJ 350 250 300 200
. chr4 400 chr3 300 - - JJ 500 350 400 300
. chr1 100 chr2 200 + + JJ 100 200 150 250


### Test case 2.a

Strands mixed


<img src="TestCase2a.png" width="70%" />

In [22]:
# First read: chr1..chr4 with alternating strands (+, -, +, -).
algns1 = [
    dict(chrom='chr1', pos=100, pos5=100, pos3=150, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=250, pos3=200, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=300, pos3=350, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr4', pos=400, pos5=450, pos3=400, strand='-', is_mapped=True, is_unique=True),
]
# Second read: chr4, chr3, chr2 with strands +, -, +.
algns2 = [
    dict(chrom='chr4', pos=400, pos5=400, pos3=450, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=350, pos3=300, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=200, pos3=250, strand='+', is_mapped=True, is_unique=True),
]

In [23]:
# Compare the last alignment of algns2 (chr2 '+') with the alignments of
# algns1, walking back from its end: chr4 and chr3 are expected to give 0,
# chr2 ('-') is expected to give 1.
assert ends_do_overlap(algns1[-1], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-2], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-3], algns2[-1], max_molecule_size, allowed_offset)==1

In [24]:
# Last pair of algns2 (chr3, chr2) against consecutive pairs of algns1:
# (chr3, chr4) is expected to give 0, (chr2, chr3) is expected to give 1.
assert pairs_do_overlap((algns1[-2], algns1[-1]), (algns2[-2], algns2[-1]), allowed_offset)==0
assert pairs_do_overlap((algns1[-3], algns1[-2]), (algns2[-2], algns2[-1]), allowed_offset)==1

In [25]:
# Reporting format (space-separated columns):
# readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type pos51 pos52 pos31 pos32
# Each yielded item has four values; the last two are bound to throwaway
# names so that the input lists `algns1`/`algns2` are not overwritten and
# the cell gives the same result when it is re-run.
for algn1, algn2, _rest1, _rest2 in rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset):
    print(report_simple_pairsam(algn1, algn2))

. chr3 300 chr4 400 + - JJ 300 450 350 400
. chr2 200 chr3 300 - + JJ 250 300 200 350
. chr1 100 chr2 200 + - JJ 100 250 150 200


### Test case 3

Not an overlap (a walk with a mismatch at the end of the forward read).


<img src="TestCase3.png" width="70%" />

In [26]:
# First read: chr1, chr2, chr3 on '+', followed by a chr5 '-' alignment
# that has no counterpart in the second read.
algns1 = [
    dict(chrom='chr1', pos=100, pos5=100, pos3=150, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=200, pos3=250, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=300, pos3=350, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr5', pos=500, pos5=550, pos3=500, strand='-', is_mapped=True, is_unique=True),
]
# Second read: three '-' strand alignments on chr4, chr3 and chr2.
algns2 = [
    dict(chrom='chr4', pos=400, pos5=500, pos3=400, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=350, pos3=300, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=250, pos3=200, strand='-', is_mapped=True, is_unique=True),
]

In [27]:
# Compare the last alignment of algns2 (chr2 '-') with the alignments of
# algns1, walking back from its end: chr5 and chr3 are expected to give 0,
# chr2 is expected to give 1.
assert ends_do_overlap(algns1[-1], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-2], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-3], algns2[-1], max_molecule_size, allowed_offset)==1

In [28]:
# Last pair of algns2 (chr3, chr2) against consecutive pairs of algns1:
# (chr3, chr5) is expected to give 0, (chr2, chr3) is expected to give 1.
assert pairs_do_overlap((algns1[-2], algns1[-1]), (algns2[-2], algns2[-1]), allowed_offset)==0
assert pairs_do_overlap((algns1[-3], algns1[-2]), (algns2[-2], algns2[-1]), allowed_offset)==1

In [29]:
# Reporting format (space-separated columns):
# readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type pos51 pos52 pos31 pos32
# Each yielded item has four values; the last two are bound to throwaway
# names so that the input lists `algns1`/`algns2` are not overwritten and
# the cell gives the same result when it is re-run.
for algn1, algn2, _rest1, _rest2 in rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset):
    print(report_simple_pairsam(algn1, algn2))

. chr5 500 chr2 200 - - PP 550 250 500 200
. chr3 300 chr5 500 + - JJ 300 550 350 500
. chr2 200 chr3 300 + + JJ 200 300 250 350
. chr1 100 chr2 200 + + JJ 100 200 150 250
. chr3 300 chr2 200 - - JJ 350 250 300 200
. chr4 400 chr3 300 - - JJ 500 350 400 300


### Test case 4

Mismapped chimeras are treated as a match. There is no need to report too many pairs with mismatches.

<img src="TestCase4.png" width="70%" />

In [30]:
# First read: chr1, chr2, chr3 on '+', then one unmapped segment
# (chrom '!', all positions 0, is_mapped False).
algns1 = [
    dict(chrom='chr1', pos=100, pos5=100, pos3=150, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=200, pos3=250, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=300, pos3=350, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='!', pos=0, pos5=0, pos3=0, strand='-', is_mapped=False, is_unique=True),
]
# Second read: one unmapped segment, then chr3 and chr2 on '-'.
algns2 = [
    dict(chrom='!', pos=0, pos5=0, pos3=0, strand='-', is_mapped=False, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=350, pos3=300, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='chr2', pos=200, pos5=250, pos3=200, strand='-', is_mapped=True, is_unique=True),
]

In [31]:
# Compare the last alignment of algns2 (chr2 '-') with the alignments of
# algns1, walking back from its end: the unmapped segment and chr3 are
# expected to give 0, chr2 is expected to give 1.
assert ends_do_overlap(algns1[-1], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-2], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-3], algns2[-1], max_molecule_size, allowed_offset)==1

In [32]:
# Last pair of algns2 (chr3, chr2) against consecutive pairs of algns1:
# (chr3, unmapped) is expected to give 0, (chr2, chr3) is expected to give 1.
assert pairs_do_overlap((algns1[-2], algns1[-1]), (algns2[-2], algns2[-1]), allowed_offset)==0
assert pairs_do_overlap((algns1[-3], algns1[-2]), (algns2[-2], algns2[-1]), allowed_offset)==1

In [33]:
# Reporting format (space-separated columns):
# readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type pos51 pos52 pos31 pos32
# Each yielded item has four values; the last two are bound to throwaway
# names so that the input lists `algns1`/`algns2` are not overwritten and
# the cell gives the same result when it is re-run.
for algn1, algn2, _rest1, _rest2 in rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset):
    print(report_simple_pairsam(algn1, algn2))

. chr3 300 ! 0 + - JN 300 0 350 0
. chr2 200 chr3 300 + + JJ 200 300 250 350
. chr1 100 chr2 200 + + JJ 100 200 150 250


### Test case 4.a

Mismapped chimeras are treated as a match. What if we introduce more of them?


<img src="TestCase5.png" width="70%" />

In [34]:
# First read: chr1 '+', unmapped, chr3 '+', unmapped
# (unmapped segments have chrom '!', all positions 0, is_mapped False).
algns1 = [
    dict(chrom='chr1', pos=100, pos5=100, pos3=150, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='!', pos=0, pos5=0, pos3=0, strand='-', is_mapped=False, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=300, pos3=350, strand='+', is_mapped=True, is_unique=True),
    dict(chrom='!', pos=0, pos5=0, pos3=0, strand='-', is_mapped=False, is_unique=True),
]
# Second read: unmapped, chr3 '-', unmapped.
algns2 = [
    dict(chrom='!', pos=0, pos5=0, pos3=0, strand='-', is_mapped=False, is_unique=True),
    dict(chrom='chr3', pos=300, pos5=350, pos3=300, strand='-', is_mapped=True, is_unique=True),
    dict(chrom='!', pos=0, pos5=0, pos3=0, strand='-', is_mapped=False, is_unique=True),
]

In [35]:
# Compare the last alignment of algns2 (an unmapped segment) with the
# alignments of algns1, walking back from its end: the two unmapped segments
# of algns1 are expected to give 1, the mapped chr3 alignment 0.
assert ends_do_overlap(algns1[-1], algns2[-1], max_molecule_size, allowed_offset)==1 # Note this difference
assert ends_do_overlap(algns1[-2], algns2[-1], max_molecule_size, allowed_offset)==0
assert ends_do_overlap(algns1[-3], algns2[-1], max_molecule_size, allowed_offset)==1

In [36]:
# Last pair of algns2 (chr3, unmapped) against consecutive pairs of algns1:
# (chr3, unmapped) is expected to give 0, (unmapped, chr3) is expected to give 1.
assert pairs_do_overlap((algns1[-2], algns1[-1]), (algns2[-2], algns2[-1]), allowed_offset)==0
assert pairs_do_overlap((algns1[-3], algns1[-2]), (algns2[-2], algns2[-1]), allowed_offset)==1

In [37]:
# Reporting format (space-separated columns):
# readID chrom1 pos1 chrom2 pos2 strand1 strand2 pair_type pos51 pos52 pos31 pos32
# Each yielded item has four values; the last two are bound to throwaway
# names so that the input lists `algns1`/`algns2` are not overwritten and
# the cell gives the same result when it is re-run.
for algn1, algn2, _rest1, _rest2 in rescue_complex_walk(algns1, algns2, max_molecule_size, allowed_offset):
    print(report_simple_pairsam(algn1, algn2))

. chr3 300 ! 0 + - JN 300 0 350 0
. ! 0 chr3 300 - + NJ 0 300 0 350
. chr1 100 ! 0 + - JN 100 0 150 0
