In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp39-cp39-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.84-cp39-cp39-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   --- ------------------------------------ 0.3/2.8 MB ? eta -:--:--
   ------- -------------------------------- 0.5/2.8 MB 1.1 MB/s eta 0:00:03
   ----------- ---------------------------- 0.8/2.8 MB 1.1 MB/s eta 0:00:02
   --------------- ------------------------ 1.0/2.8 MB 1.1 MB/s eta 0:00:02
   ------------------ --------------------- 1.3/2.8 MB 1.1 MB/s eta 0:00:02
   ---------------------- ----------------- 1.6/2.8 MB 1.1 MB/s eta 0:00:02
   -------------------------- ------------- 1.8/2.8 MB 1.2 MB/s eta 0:00:01
   ------------------------------ --------- 2.1/2.8 MB 1.1 MB/s eta 0:00:01
   --------------------------------- ------ 2.4/2.8 MB 1.1 MB/s eta 0:00:01
   -------------------------------

In [2]:
from Bio.Align import PairwiseAligner


In [3]:
# Example sequences
seq1 = "ACGGGT"
seq2 = "ACG"

# Create a PairwiseAligner instance
aligner = PairwiseAligner()
aligner.mode = "global"
aligner.match_score = 1  # Matches score +1
aligner.mismatch_score = 0  # Mismatches score 0
aligner.open_gap_score = 0  # Gap opening score
aligner.extend_gap_score = 0  # Gap extension score

# Perform the alignment
alignments = aligner.align(seq1, seq2)

print(type(alignments))

# Get the first alignment (can iterate over alignments if needed)
alignment = alignments[0]
print("1st Alignment:")
print(alignment)

print("All Alignments:")
for i, current_align in enumerate(alignments):
    print(alignments[i])
    print("Score:", alignments[i].score)


<class 'Bio.Align.PairwiseAlignments'>
1st Alignment:
target            0 ACGGGT 6
                  0 |||--- 6
query             0 ACG--- 3

All Alignments:
target            0 ACGGGT 6
                  0 |||--- 6
query             0 ACG--- 3

Score: 3.0
target            0 ACGGGT 6
                  0 ||-|-- 6
query             0 AC-G-- 3

Score: 3.0
target            0 ACGGGT 6
                  0 ||--|- 6
query             0 AC--G- 3

Score: 3.0


In [4]:
print(alignments[0][0], alignments[0][1])
print(alignments[1][0], alignments[1][1])
print(alignments[2][0], alignments[2][1])

ACGGGT ACG---
ACGGGT AC-G--
ACGGGT AC--G-


Using other scorings:

In [5]:
# Create a PairwiseAligner instance
aligner = PairwiseAligner()
aligner.mode = "global"
aligner.match_score = 2  # Matches score
aligner.mismatch_score = -1  # Mismatches score
aligner.open_gap_score = -0.5 # Gap opening score
aligner.extend_gap_score = -0.1  # Gap extension score

#  Perform the alignment
alignments = aligner.align(seq1, seq2)

print("All Alignments:")
for i, current_align in enumerate(alignments):
    print(alignments[i], "Score: {:.2}".format(alignments[i].score))

All Alignments:
target            0 ACGGGT 6
                  0 |||--- 6
query             0 ACG--- 3
 Score: 5.3


explaining the score:
score: 2 (match) + 2 (match) + 2 (match)  -0.5 (gap open) -0.1 (gap extend)  -0.1 (gap extend)
score = 2 + 2 + 2 - 0.5 - 0.1 - 0.1

**Note:** depending on the penalties, a gap in one sequence may be followed by
a gap in the other sequence. If you don’t like this behaviour,
increase the gap-open penalty:

In [None]:
# Create a PairwiseAligner instance
aligner = PairwiseAligner()
aligner.mode = "global"
aligner.match_score = 5  # Matches score +1
aligner.mismatch_score = -4  # Mismatches score 0
aligner.open_gap_score = -1 # Gap opening score
aligner.extend_gap_score = -.1  # Gap extension score

#  Perform the alignment
alignments = aligner.align("A", "T")

print("All Alignments:")
for i, current_align in enumerate(alignments):
    print(alignments[i])
    print("Score: ", alignments[i].score)

All Alignments:
target            0 -A 1
                  0 -- 2
query             0 T- 1

Score:  -2.0
target            0 A- 1
                  0 -- 2
query             0 -T 1

Score:  -2.0


In [6]:
# here it will prefer a mismatch over opening a gap
aligner = PairwiseAligner()
aligner.mode = "global"
aligner.match_score = 5  # Matches score +1
aligner.mismatch_score = -4  # Mismatches score 0
aligner.open_gap_score = -3 # Gap opening score
aligner.extend_gap_score = -.1  # Gap extension score

#  Perform the alignment
alignments = aligner.align("A", "T")

print("All Alignments:")
for i, current_align in enumerate(alignments):
    print(alignments[i])
    print("Score: ", alignments[i].score)

All Alignments:
target            0 A 1
                  0 . 1
query             0 T 1

Score:  -4.0


In [23]:
from Bio import SeqIO


def get_higest_score(match_score, mismatch_score, open_gap_score, 
                     extend_gap_score, seq1, seq2):
    # Create a PairwiseAligner instance
    aligner = PairwiseAligner()
    aligner.mode = "local"
    aligner.match_score = match_score  # Matches score +1
    aligner.mismatch_score = mismatch_score  # Mismatches score 0
    aligner.open_gap_score = open_gap_score # Gap opening score
    aligner.extend_gap_score = extend_gap_score  # Gap extension score
    
    #  Perform the alignment
    alignments = aligner.align(seq1, seq2)
    
    print("All Alignments:")
    max = alignments[0].score
    return max      
        
my_records = []
records = SeqIO.parse("sequences_dna.fasta", "fasta")
for record in records:
    my_records.append(record)


match_score, mismatch_score, open_gap_score, extend_gap_score = 1, 0, 0, 0  
match_score, mismatch_score, open_gap_score, extend_gap_score = 5, 4, -1, -0.1
match_score, mismatch_score, open_gap_score, extend_gap_score = 5, 4, -3, -0.1

baboonon_to_cat = get_higest_score(match_score, mismatch_score, open_gap_score,                                      extend_gap_score, my_records[0].seq, my_records[1].seq)
print(f"Baboon to cat highest score: {baboonon_to_cat}")
baboonon_to_macaque = get_higest_score(match_score, mismatch_score, open_gap_score,                                     extend_gap_score, my_records[0].seq, my_records[2].seq)
print(f"Baboon to macaque highest score: {baboonon_to_macaque}")
cat_to_macaque = get_higest_score(match_score, mismatch_score, open_gap_score,                                      extend_gap_score, my_records[1].seq, my_records[2].seq)   
print(f"Cat to macaque highest score: {cat_to_macaque}")



All Alignments:
Baboon to cat highest score: 6938.600000000002
All Alignments:
Baboon to macaque highest score: 7074.799999999999
All Alignments:
Cat to macaque highest score: 6908.500000000002


In [3]:
from Bio import SeqIO
from Bio.Align import PairwiseAligner


def count_matches(seq1, seq2):
    return sum(1 for a, b in zip(seq1, seq2) if a == b)

def get_highest_score(match_score, mismatch_score, open_gap_score, extend_gap_score, seq1, seq2):
    # Create a PairwiseAligner instance
    aligner = PairwiseAligner()
    aligner.mode = "local"
    aligner.match_score = match_score
    aligner.mismatch_score = mismatch_score
    aligner.open_gap_score = open_gap_score
    aligner.extend_gap_score = extend_gap_score

    # Perform the alignment
    alignments = aligner.align(seq1, seq2)
    return alignments[0]

def run_on_file(file_path):
    # Read sequences from the FASTA file
    records = list(SeqIO.parse(file_path, "fasta"))
    
    # Define scoring parameters
    match_score, mismatch_score, open_gap_score, extend_gap_score = 5, -4, -3, -0.1
    # match_score, mismatch_score, open_gap_score, extend_gap_score = 1, 0, 0, 0 
    
    # Compare each pair of sequences
    for i in range(len(records)):
        for j in range(i + 1, len(records)):
            alignment = get_highest_score(match_score, mismatch_score,open_gap_score, extend_gap_score, records[i].seq, records[j].seq)
            # print(alignment)
            matches = count_matches(alignment[0], alignment[1])
            # print(f"matches: {matches}")
            score =  matches / alignment.shape[1] * 100 
            # Normalize the score
            print(f"{records[i].description} to {records[j].description} highest score: {score} %")
   
run_on_file("sequences_dna.fasta")
run_on_file("all.fasta")

 baboon to  cat highest score: 80.0 %
 baboon to  macaque highest score: 80.45830681094844 %
 cat to  macaque highest score: 74.82866043613707 %
AF451972.1 Pongo pygmaeus Puti mitochondrial D-loop, partial sequence to AF176731.1 Pan troglodytes verus isolate MABEL mitochondrial D-loop, partial sequence highest score: 64.321608040201 %
AF451972.1 Pongo pygmaeus Puti mitochondrial D-loop, partial sequence to X90314.1 H.sapiens mitochondrial DNA for D-loop (isolate WG+ice37+B) highest score: 66.58163265306123 %
AF176731.1 Pan troglodytes verus isolate MABEL mitochondrial D-loop, partial sequence to X90314.1 H.sapiens mitochondrial DNA for D-loop (isolate WG+ice37+B) highest score: 76.566757493188 %
