# Finding primers for each experiment
> Verifying we have good primers

Document: https://docs.google.com/spreadsheets/d/18iF8xlzB_0yauaFV0MjEDHWyrzET0WU6bxJWvUq9Ork/edit#gid=0

In [1]:
import sys
sys.path.append('/home/phil/aptr')
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from aptr.database import RnaDB
from aptr.string_operations import rc, key, primers

In [35]:
# Mothers
# In the paper they say they used 515FB and 926R
# Source: https://www.biorxiv.org/content/biorxiv/early/2021/02/21/2021.02.20.432118.full.pdf)

p1 = primers["515FB"]
p2 = primers["926R"]

# 515FB: 5’-GTGYCAGCMGCCGCGGTAA-3’ and 926R: 5’-CCGYCAATTYMTTTRAGTTT-3’

print(f"P1: {p1}, P2: {p2}")

# Search p1 for threshold
for i in range(len(p1)):
    db = RnaDB(
        left_primer=p1[-i:],
        right_primer="",
        silent=True,
    )
    print(i, p1[-i:], len(db.db)) # Actually all 18 are good

# Search p2 for threshold
for i in range(len(p2)):
    db = RnaDB(
        left_primer=p1,
        right_primer=p2[:i],
        silent=True,
    )
    print(i, p2[:i], len(db.db)) # Drops off after 6

# 18 + 19 with RC on p2 = 1448 sequences
# 18 + 6 without RC on p2 = 3262 sequences

db_mothers = RnaDB(
    left_primer=p1,
    right_primer=p2[:6],
)
print("MOTHERS:")
print("P1: ", p1)
print("P2: ", p2[:6])

P1: GTGYCAGCMGCCGCGGTAA, P2: CCGYCAATTYMTTTRAGTTT
0 GTGYCAGCMGCCGCGGTAA 3714
1 A 5058
2 AA 5050
3 TAA 5041
4 GTAA 4894
5 GGTAA 4499
6 CGGTAA 3909
7 GCGGTAA 3796
8 CGCGGTAA 3722
9 CCGCGGTAA 3721
10 GCCGCGGTAA 3717
11 MGCCGCGGTAA 3717
12 CMGCCGCGGTAA 3717
13 GCMGCCGCGGTAA 3717
14 AGCMGCCGCGGTAA 3715
15 CAGCMGCCGCGGTAA 3715
16 YCAGCMGCCGCGGTAA 3714
17 GYCAGCMGCCGCGGTAA 3714
18 TGYCAGCMGCCGCGGTAA 3714
0  3714
1 C 3699
2 CC 3680
3 CCG 3579
4 CCGY 3537
5 CCGYC 3393
6 CCGYCA 3262
7 CCGYCAA 190
8 CCGYCAAT 0
9 CCGYCAATT 0
10 CCGYCAATTY 0
11 CCGYCAATTYM 0
12 CCGYCAATTYMT 0
13 CCGYCAATTYMTT 0
14 CCGYCAATTYMTTT 0
15 CCGYCAATTYMTTTR 0
16 CCGYCAATTYMTTTRA 0
17 CCGYCAATTYMTTTRAG 0
18 CCGYCAATTYMTTTRAGT 0
19 CCGYCAATTYMTTTRAGTT 0
0.9080560420315237 sequences remain after trimming
0.4080560420315236 sequences remain after filtering
MOTHERS:
P1:  GTGYCAGCMGCCGCGGTAA
P2:  CCGYCA


In [34]:
# Newlacto

p1 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGCCTACGGGNGGCWGCAG"
p2 = "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAGGACTACHVGGGTATCTAATCC"

print(f"P1: {p1}, P2: {p2}")

# Search p1 for threshold
# for i in range(len(p1)):
#     db = RnaDB(
#         left_primer=p1[-i:],
#         right_primer="",
#         silent=True,
#     )
#     # print(i, p1[-i:], len(db.db)) # Drops off at 17

# # Search p2 for threshold
# for i in range(len(p2)):
#     db = RnaDB(
#         left_primer=p1[-17:],
#         right_primer=rc(p2)[:i],
#         silent=True,
#     )
#     print(i, p2[:i], len(db.db)) # Drops off at 22

db_newlacto = RnaDB(
    left_primer=p1[-17:],
    right_primer=rc(p2)[:22],
)
print("MOTHERS:")
print("P1: ", p1[-17:])
print("P2: ", rc(p2)[:22])

P1: TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGCCTACGGGNGGCWGCAG, P2: GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAGGACTACHVGGGTATCTAATCC
0.9036777583187391 sequences remain after trimming
0.28546409807355516 sequences remain after filtering
MOTHERS:
P1:  CCTACGGGNGGCWGCAG
P2:  ggattagatacccbdgtagtcc


In [43]:
# Mariana

p1 = "AGAGTTTGATCMTGGCTCAG"
p2 = "GGTTACCTTGTTACGACTT"

print(f"P1: {p1}, P2: {p2}")

# Search p1 for threshold
for i in range(len(p1)):
    db = RnaDB(
        left_primer=p1[-i:],
        right_primer="",
        silent=True,
    )
    print(i, p1[-i:], len(db.db)) # Whole seq is good

# Search p2 for threshold
for i in range(len(p2)):
    db = RnaDB(
        left_primer=p1,
        right_primer=rc(p2)[:i],
        silent=True,
    )
    print(i, p2[:i], len(db.db)) # Drops off at 16

db_mariana = RnaDB(
    left_primer=p1,
    right_primer=rc(p2)[:16],
)
print("MARIANA:")
print("P1: ", p1)
print("P2: ", rc(p2)[:16]) # May also be possible to not use filters

P1: AGAGTTTGATCMTGGCTCAG, P2: GGTTACCTTGTTACGACTT
0 AGAGTTTGATCMTGGCTCAG 2807
1 G 5059
2 AG 5044
3 CAG 4971
4 TCAG 4819
5 CTCAG 4245
6 GCTCAG 3997
7 GGCTCAG 3842
8 TGGCTCAG 3777
9 MTGGCTCAG 3666
10 CMTGGCTCAG 3605
11 TCMTGGCTCAG 3559
12 ATCMTGGCTCAG 3546
13 GATCMTGGCTCAG 3537
14 TGATCMTGGCTCAG 3440
15 TTGATCMTGGCTCAG 3389
16 TTTGATCMTGGCTCAG 3363
17 GTTTGATCMTGGCTCAG 3278
18 AGTTTGATCMTGGCTCAG 3166
19 GAGTTTGATCMTGGCTCAG 3121
0.4602201651238429 sequences remain after trimming
0.27395546659994996 sequences remain after filtering
MARIANA:
P1:  AGAGTTTGATCMTGGCTCAG
P2:  aagtcgtaacaaggta


In [54]:
# Ascites

p1 = "AGRGTTTGATYNTGGCTCAG"
p2 = "TASGGHTACCTTGTTASGACTT"

print(f"P1: {p1}, P2: {p2}")

# # Search p1 for threshold
# for i in range(len(p1)):
#     db = RnaDB(
#         left_primer=p1[-i:],
#         right_primer="",
#         silent=True,
#     )
#     print(i, p1[-i:], len(db.db)) # Whole seq is good

# # Search p2 for threshold
# for i in range(len(p2)):
#     db = RnaDB(
#         left_primer=p1,
#         right_primer=rc(p2)[:i],
#         silent=True,
#     )
#     print(i, p2[:i], len(db.db)) # Whole seq is good

db_ascites = RnaDB(
    left_primer=p1,
    right_primer=rc(p2),
)
print("Med. length:", np.median([len(x) for x in db_ascites.db["16s_sequence"]]))
print("ASCITES:")
print("P1: ", p1)
print("P2: ", rc(p2)) # May also be possible to not use filters

P1: AGRGTTTGATYNTGGCTCAG, P2: TASGGHTACCTTGTTASGACTT
0.4328246184638479 sequences remain after trimming
0.25281461095821867 sequences remain after filtering
Med. length: 1504.0
ASCITES:
P1:  AGRGTTTGATYNTGGCTCAG
P2:  aagtcstaacaaggtadccsta


In [57]:
# Mexico

p1 = "AGRGTTTGATYMTGGCTCAG"
p2 = "GGYTACCTTGTTACGACTT"

print(f"P1: {p1}, P2: {p2}")

# # Search p1 for threshold
# for i in range(len(p1)):
#     db = RnaDB(
#         left_primer=p1[-i:],
#         right_primer="",
#         silent=True,
#     )
#     print(i, p1[-i:], len(db.db)) # Whole seq is good

# Search p2 for threshold
for i in range(len(p2)):
    db = RnaDB(
        left_primer=p1,
        right_primer=rc(p2)[:i],
        silent=True,
    )
    print(i, p2[:i], len(db.db)) # Whole seq is good

db_mexico = RnaDB(
    left_primer=p1,
    right_primer=rc(p2),
)
print("Med. length:", np.median([len(x) for x in db_mexico.db["16s_sequence"]]))
print("MEXICO:")
print("P1: ", p1)
print("P2: ", rc(p2)) # May also be possible to not use filters

P1: AGRGTTTGATYMTGGCTCAG, P2: GGYTACCTTGTTACGACTT
0  2847
1 G 2847
2 GG 2814
3 GGY 2777
4 GGYT 2675
5 GGYTA 2660
6 GGYTAC 2520
7 GGYTACC 2291
8 GGYTACCT 2266
9 GGYTACCTT 2264
10 GGYTACCTTG 2264
11 GGYTACCTTGT 2263
12 GGYTACCTTGTT 2263
13 GGYTACCTTGTTA 2257
14 GGYTACCTTGTTAC 2255
15 GGYTACCTTGTTACG 2222
16 GGYTACCTTGTTACGA 2222
17 GGYTACCTTGTTACGAC 2140
18 GGYTACCTTGTTACGACT 2123
0.43782837127845886 sequences remain after trimming
0.2593194896172129 sequences remain after filtering
Med. length: 1503.0
MEXICO:
P1:  AGRGTTTGATYMTGGCTCAG
P2:  aagtcgtaacaaggtarcc


In [12]:
# Preterm

p1 = "ACTCCTACGGGAGGCAGCAG"
p2 = "GGACTACHVGGGTWTCTAAT"

print(f"P1: {p1}, P2: {p2}")

# # Search p1 for threshold
# for i in range(len(p1)):
#     db = RnaDB(
#         left_primer=p1[-i:],
#         right_primer="",
#         silent=True,
#     )
#     print(i, p1[-i:], len(db.db)) # Whole seq is good

# # Search p2 for threshold
# for i in range(len(p2)):
#     db = RnaDB(
#         left_primer=p1,
#         right_primer=rc(p2)[:i],
#         silent=True,
#     )
#     print(i, p2[:i], len(db.db)) # Whole seq is good

db_preterm = RnaDB(
    left_primer=p1,
    right_primer=rc(p2),
)
print("Med. length:", np.median([len(x) for x in db_preterm.db["16s_sequence"]]))
print("PRETERM:")
print("P1: ", p1)
print("P2: ", rc(p2)) # May also be possible to not use filters

P1: ACTCCTACGGGAGGCAGCAG, P2: GGACTACHVGGGTWTCTAAT
0.8901676257192894 sequences remain after trimming
0.2829622216662497 sequences remain after filtering
Med. length: 469.0
PRETERM:
P1:  ACTCCTACGGGAGGCAGCAG
P2:  attagawacccbdgtagtcc


In [17]:
# Athletes

p1 = primers["27F"]
p2 = primers["1492R (s)"]

print(f"P1: {p1}, P2: {p2}")

# Search p1 for threshold
for i in range(len(p1)):
    db = RnaDB(
        left_primer=p1[-i:],
        right_primer="",
        silent=True,
    )
    print(i, p1[-i:], len(db.db)) # Whole seq is good

# Search p2 for threshold
for i in range(len(p2)):
    db = RnaDB(
        left_primer=p1,
        right_primer=rc(p2)[:i],
        silent=True,
    )
    print(i, p2[:i], len(db.db)) # Whole seq is good

db_athletes = RnaDB(
    left_primer=p1,
    right_primer=rc(p2),
)
print("Med. length:", np.median([len(x) for x in db_athletes.db["16s_sequence"]]))
print("ATHLETES:")
print("P1: ", p1)
print("P2: ", rc(p2)) # May also be possible to not use filters

P1: AGAGTTTGATCMTGGCTCAG, P2: ACCTTGTTACGACTT
0 AGAGTTTGATCMTGGCTCAG 2807
1 G 5059
2 AG 5044
3 CAG 4971
4 TCAG 4819
5 CTCAG 4245
6 GCTCAG 3997
7 GGCTCAG 3842
8 TGGCTCAG 3777
9 MTGGCTCAG 3666
10 CMTGGCTCAG 3605
11 TCMTGGCTCAG 3559
12 ATCMTGGCTCAG 3546
13 GATCMTGGCTCAG 3537
14 TGATCMTGGCTCAG 3440
15 TTGATCMTGGCTCAG 3389
16 TTTGATCMTGGCTCAG 3363
17 GTTTGATCMTGGCTCAG 3278
18 AGTTTGATCMTGGCTCAG 3166
19 GAGTTTGATCMTGGCTCAG 3121
0  2807
1 A 2807
2 AC 2776
3 ACC 2739
4 ACCT 2637
5 ACCTT 2622
6 ACCTTG 2482
7 ACCTTGT 2259
8 ACCTTGTT 2234
9 ACCTTGTTA 2232
10 ACCTTGTTAC 2232
11 ACCTTGTTACG 2231
12 ACCTTGTTACGA 2231
13 ACCTTGTTACGAC 2225
14 ACCTTGTTACGACT 2223
0.46034525894420814 sequences remain after trimming
0.27395546659994996 sequences remain after filtering
Med. length: 1499.0
ATHLETES:
P1:  AGAGTTTGATCMTGGCTCAG
P2:  aagtcgtaacaaggt


In [12]:
# Canine

p1 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGCCTACGGGNGGCWGCAG"
p2 = "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAGGACTACHVGGGTATCTAAT"

print(f"P1: {p1}, P2: {p2}")

# # Search p1 for threshold
# for i in range(len(p1)):
#     db = RnaDB(
#         path_to_16s="../../data/allSSU.tsv",
#         path_to_dnaA="../../data/allDnaA.tsv",
#         left_primer=p1[-i:],
#         right_primer="",
#         silent=True,
#     )
#     print(i, p1[-i:], len(db.db)) # Stops at 17

# # Search p2 for threshold
# for i in range(len(p2)):
#     db = RnaDB(
#         path_to_16s="../../data/allSSU.tsv",
#         path_to_dnaA="../../data/allDnaA.tsv",
#         left_primer=p1[-17:],
#         right_primer=rc(p2)[:i],
#         silent=True,
#     )
#     print(i, p2[:i], len(db.db)) # Stops at 20

db_athletes = RnaDB(
    path_to_16s="../../data/allSSU.tsv",
    path_to_dnaA="../../data/allDnaA.tsv",
    left_primer=p1[-17:],
    right_primer=rc(p2)[:20],
)
print("Med. length:", np.median([len(x) for x in db_athletes.db["16s_sequence"]]))
print("ATHLETES:")
print("P1: ", p1[-17:])
print("P2: ", rc(p2)[:20]) # May also be possible to not use filters

P1: TCGTCGGCAGCGTCAGATGTGTATAAGAGACAGCCTACGGGNGGCWGCAG, P2: GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAGGACTACHVGGGTATCTAAT
0.9038028521391043 sequences remain after trimming
0.28646484863647736 sequences remain after filtering
Med. length: 466.0
ATHLETES:
P1:  CCTACGGGNGGCWGCAG
P2:  attagatacccbdgtagtcc


In [10]:
# DIABIMMUNE

p1 = "GTGCCAGCMGCCGCGGTAA"
p2 = "GGACTACHVGGGTWTCTAA"

print(f"P1: {p1}, P2: {p2}")

# # Search p1 for threshold
# for i in range(len(p1)):
#     db = RnaDB(
#         left_primer=p1[-i:],
#         right_primer="",
#         silent=True,
#     )
#     print(i, p1[-i:], len(db.db)) # Stops at 17

# # Search p2 for threshold
# for i in range(len(p2)):
#     db = RnaDB(
#         left_primer=p1,
#         right_primer=rc(p2)[:i],
#         silent=True,
#     )
#     print(i, p2[:i], len(db.db)) # Stops at 20

db_diabimmune = RnaDB(
    left_primer=p1,
    right_primer=rc(p2),
)
print("Med. length:", np.median([len(x) for x in db_diabimmune.db["16s_sequence"]]))
print("DIABIMMUNE:")
print("P1: ", p1[-17:])
print("P2: ", rc(p2)[:20]) # May also be possible to not use filters

P1: GTGCCAGCMGCCGCGGTAA, P2: GGACTACHVGGGTWTCTAA
0.9074305729296973 sequences remain after trimming
0.13197398048536402 sequences remain after filtering
Med. length: 292.0
DIABIMMUNE:
P1:  GCCAGCMGCCGCGGTAA
P2:  ttagawacccbdgtagtcc
