In [1]:
#!pip install biopython
#!pip install blosum

In [2]:
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import uuid
from Bio.Seq import Seq
import blosum as bl
import re

In [3]:
seqs = SeqIO.to_dict(SeqIO.parse("test/ab.fa", 'fasta'))
str(seqs['QBZ81690.1'].seq)

'MGQANTTIKGYSAVSQDALFATYGITEGDYQAKASAAIERIRAMPEGYASPEDRAAAINAIRTGTCGDDTELLTRVRAALDRWQRDCASTGHALKSKVV'

In [4]:
#for line in open("VOGs/same_vog_seq_pairs.tsv"):

#data = line.rstrip().split()
data = ['', 'QBZ81690.1', '', 'AGO83968.2']

seq1 = SeqRecord(seqs[data[1]].seq,
                 id=seqs[data[1]].id)
seq2 = SeqRecord(seqs[data[3]].seq,
                 id=seqs[data[3]].id)
SeqIO.write(seq1, f"seq_1.fasta", "fasta")
SeqIO.write(seq2, f"seq_2.fasta", "fasta")

str(seq1.seq)

'MGQANTTIKGYSAVSQDALFATYGITEGDYQAKASAAIERIRAMPEGYASPEDRAAAINAIRTGTCGDDTELLTRVRAALDRWQRDCASTGHALKSKVV'

In [5]:
output = NcbiblastpCommandline(query=f"seq_1.fasta", subject=f"seq_2.fasta", outfmt="6 qstart qseq qend sstart sseq send")()[0]
if len(output) < 2:
    output = f"{seqs[data[1]].id}\t{seqs[data[3]].id}\t NO_HIT"
#output = output.split("\n")
print(output)

29	DYQAKASAAIERIRAMPEGYASPEDRAAAINAIRTGTCGDDTELLTRVRAALDRWQRDCAS	89	41	EYAELKEKTIRIIVALPTQYLSADKGRAAIEAIRSGEPGREA-LYDEVWDAAMAYSRDIAA	100
16	QDALF	20	80	REALY	84



In [6]:
output2 = NcbiblastpCommandline(query=f"seq_1.fasta", subject=f"seq_2.fasta", outfmt="6 qstart sstart btop qend send")()[0]
print(output2)

29	41	DE1QAAEKLAKSEAKAT1ERRI1RV1ML1ETGQ1AL1PAEDDKRGAR3NE3TS1TECP1DRDETAE-1LYTDRE1RWAD1LADMRAWYQS2CI1SA	89	100
16	80	QRDE2FY	20	84



In [7]:
output3 = NcbiblastpCommandline(query=f"seq_1.fasta", subject=f"seq_2.fasta", outfmt=12)()[0]
print(output3)

{
  "Seq_annot": {
    "desc": [
      {
        "user": {
          "type": {
            "str": "Hist Seqalign"
          },
          "data": [
            {
              "label": {
                "str": "Hist Seqalign"
              },
              "data": {
                "bool": true
              }
            }
          ]
        }
      },
      {
        "user": {
          "type": {
            "str": "Blast Type"
          },
          "data": [
            {
              "label": {
                "str": "blastp"
              },
              "data": {
                "int": 2
              }
            }
          ]
        }
      },
      {
        "user": {
          "type": {
            "str": "Blast Database Title"
          },
          "data": [
            {
              "label": {
                "str": "n/a"
              },
              "data": {
                "bool": false
              }
            }
          ]
        }
      }
    ],
    "dat

In [8]:
#Identify the positive matches by looking directly in the score matrix? https://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt
blosumDict = dict(bl.BLOSUM(62))

def Blosum(a, b):
    key = str(a) + str(b)
    return blosumDict[key]

Blosum("A", "N")

-2.0

In [9]:
def GetIndicesFromBlastTrace(start1, end1, start2, end2, btop):
    start1 = int(start1)
    end1 = int(end1)
    start2 = int(start2)
    end2 = int(end2)
    
    chars = btop
    for c in ['0','1','2','3','4','5','6','7','8','9','0']:
        chars = chars.replace(c, "++")
    
    assert(len(chars)%2 == 0)
    
    s1 = chars[0:-1:2]
    s2 = chars[1:-1:2]
    
    i1 = [i for i in range(start1, end1)]
    i2 = [i for i in range(start2, end2)]
    
    gaps1 = [i for i, c in enumerate(s1) if c == "-"]
    gaps2 = [i for i, c in enumerate(s2) if c == "-"]
    
    for gap in gaps1:
        i1.insert(gap, -1)
    
    for gap in gaps2:
        i2.insert(gap, -1)
        
    trace = list(zip(s1, i1, s2, i2))
    
    return trace


GetIndicesFromBlastTrace(2,9,1,7,"ab1aba-ababab")

[('a', 2, 'b', 1),
 ('+', 3, '+', 2),
 ('a', 4, 'b', 3),
 ('a', 5, '-', -1),
 ('a', 6, 'b', 4),
 ('a', 7, 'b', 5)]

In [10]:
def GetMatchesFromBlastTrace(start1, end1, start2, end2, btop):
    trace = GetIndicesFromBlastTrace(start1, end1, start2, end2, btop)
    
    matches = [(i,j) for (a,i,b,j) in trace if (a != "-" and b != "-")]
    
    return matches

GetMatchesFromBlastTrace(2,9,1,7,"ab1aba-ababab")

[(2, 1), (3, 2), (4, 3), (6, 4), (7, 5)]

In [11]:
def GetBlastPairs(seq1, seq2):
    output = NcbiblastpCommandline(query=seq1, subject=seq2, outfmt="6 qstart qend sstart send btop")()[0]
    outlines = output.splitlines()
    outlists = [re.split(r'\t+', line) for line in outlines]
    
    indices = [GetMatchesFromBlastTrace(*fasta) for fasta in outlists]
    
    return indices
    
blastPairs = GetBlastPairs(f"seq_1.fasta", f"seq_2.fasta")
blastPairs

[[(29, 41),
  (30, 42),
  (31, 43),
  (32, 44),
  (33, 45),
  (34, 46),
  (35, 47),
  (36, 48),
  (37, 49),
  (38, 50),
  (39, 51),
  (40, 52),
  (41, 53),
  (42, 54),
  (43, 55),
  (44, 56),
  (45, 57),
  (46, 58),
  (47, 59),
  (48, 60),
  (49, 61),
  (50, 62),
  (51, 63),
  (52, 64),
  (53, 65),
  (54, 66),
  (55, 67),
  (56, 68),
  (57, 69),
  (58, 70),
  (59, 71),
  (60, 72),
  (61, 73),
  (62, 74),
  (63, 75),
  (64, 76),
  (65, 77),
  (66, 78),
  (68, 79),
  (69, 80),
  (70, 81),
  (71, 82),
  (72, 83),
  (73, 84),
  (74, 85),
  (75, 86),
  (76, 87),
  (77, 88),
  (78, 89),
  (79, 90),
  (80, 91),
  (81, 92),
  (82, 93),
  (83, 94)],
 [(16, 80), (17, 81), (18, 82)]]

In [12]:
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

intersection([(1,1), (2,2), (3,3)], [(1,2), (3,3)])

[(3, 3)]

In [13]:
mahdiPairs = [(0, 0),
 (9, 13),
 (10, 14),
 (12, 15),
 (13, 22),
 (14, 25),
 (26, 28),
 (29, 41),
 (32, 47),
 (37, 49),
 (39, 50),
 (40, 51),
 (42, 54),
 (44, 56),
 (46, 58),
 (47, 59),
 (49, 61),
 (52, 63),
 (53, 65),
 (54, 67),
 (55, 67),
 (56, 68),
 (57, 69),
 (58, 70),
 (59, 71),
 (60, 72),
 (61, 73),
 (62, 74),
 (63, 75),
 (64, 76),
 (65, 77),
 (66, 78),
 (72, 82),
 (75, 86),
 (79, 87),
 (80, 88),
 (82, 93),
 (83, 94),
 (84, 95),
 (85, 96),
 (86, 97),
 (87, 98),
 (90, 100),
 (92, 102),
 (94, 104),
 (96, 105)]

intersection(blastPairs[0], mahdiPairs)

[(29, 41),
 (37, 49),
 (42, 54),
 (44, 56),
 (46, 58),
 (47, 59),
 (49, 61),
 (53, 65),
 (55, 67),
 (56, 68),
 (57, 69),
 (58, 70),
 (59, 71),
 (60, 72),
 (61, 73),
 (62, 74),
 (63, 75),
 (64, 76),
 (65, 77),
 (66, 78),
 (75, 86),
 (82, 93),
 (83, 94)]

In [14]:
intersection(blastPairs[1], mahdiPairs)

[]