In [3]:
!pip install biopython

from Bio import SeqIO, Align
import numpy as np

# Load the first FASTA record of each genome file and keep only the
# first 100 characters of its sequence.
with open('BVBRC_genome_sequence.fasta', 'r') as f:
    first_record = next(SeqIO.parse(f, 'fasta'))
    seq1 = str(first_record.seq)[:100]

with open('BVBRC_genome_sequence_HOS52.fasta', 'r') as f:
    first_record = next(SeqIO.parse(f, 'fasta'))
    seq2 = str(first_record.seq)[:100]

# 1. BIOPYTHON IMPLEMENTATION
separator = "=" * 50
print(separator)
print("BIOPYTHON IMPLEMENTATION")
print(separator)

# Local-mode aligner scored with match +2, mismatch -1, gap -2.
aligner = Align.PairwiseAligner()
aligner.mode = 'local'
aligner.match_score = 2
aligner.mismatch_score = -1
aligner.gap_score = -2

# Report only the first alignment the aligner yields (if it yields any).
alignments = aligner.align(seq1, seq2)
for alignment in alignments:
    print(alignment)
    print(f"Score: {alignment.score}")
    break

# 2. MANUAL IMPLEMENTATION
# Section banner, preceded by a blank line to separate it from earlier output.
rule = "=" * 50
print("\n" + rule)
print("MANUAL IMPLEMENTATION")
print(rule)

def smith_waterman(seq1, seq2, match=2, mismatch=-1, gap=-2):
    """Compute one best-scoring local alignment of two strings.

    Fills a Smith-Waterman scoring matrix with a linear gap penalty, then
    traces back from the highest-scoring cell until a zero cell or a
    matrix border is reached.

    Args:
        seq1: First sequence; aligned along the matrix rows.
        seq2: Second sequence; aligned along the matrix columns.
        match: Score added when two characters are equal.
        mismatch: Score added when two characters differ.
        gap: Score added for each gap position.

    Returns:
        A tuple ``(align1, align2, score)``. ``align1`` and ``align2`` are
        the aligned substrings of ``seq1`` and ``seq2`` with ``"-"`` marking
        gaps, and ``score`` is the alignment score as a ``float``. When no
        cell scores above zero (including empty input) the result is
        ``("", "", 0.0)``.
    """
    n = len(seq1)
    m = len(seq2)

    # Row 0 and column 0 are never written and stay at zero.
    score_matrix = np.zeros((n + 1, m + 1))

    max_score = 0.0
    max_pos = (0, 0)

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            substitution = match if seq1[i - 1] == seq2[j - 1] else mismatch
            diagonal = score_matrix[i - 1, j - 1] + substitution
            up = score_matrix[i - 1, j] + gap
            left = score_matrix[i, j - 1] + gap

            # Local alignment: a cell score never drops below zero.
            cell = max(0, diagonal, up, left)
            score_matrix[i, j] = cell

            # Strict comparison keeps the first maximum met in row-major
            # order when several cells tie.
            if cell > max_score:
                max_score = cell
                max_pos = (i, j)

    # The traceback walks from the end of the alignment to its start, so
    # characters are collected in reverse and joined once at the end; this
    # avoids rebuilding the whole string on every step.
    reversed1 = []
    reversed2 = []
    i, j = max_pos

    while i > 0 and j > 0 and score_matrix[i, j] > 0:
        current = score_matrix[i, j]
        substitution = match if seq1[i - 1] == seq2[j - 1] else mismatch

        # Move preference on ties: diagonal, then up, then left.
        if current == score_matrix[i - 1, j - 1] + substitution:
            reversed1.append(seq1[i - 1])
            reversed2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif current == score_matrix[i - 1, j] + gap:
            reversed1.append(seq1[i - 1])
            reversed2.append("-")
            i -= 1
        else:
            reversed1.append("-")
            reversed2.append(seq2[j - 1])
            j -= 1

    align1 = "".join(reversed(reversed1))
    align2 = "".join(reversed(reversed2))

    # Always hand back a plain float, whether or not anything aligned.
    return align1, align2, float(max_score)

# Align the two loaded sequences with the manual implementation and
# print both aligned strings followed by the score.
align1, align2, score = smith_waterman(seq1, seq2)

print("Local Alignment:", align1, align2, sep="\n")
print(f"\nScore: {score}")

BIOPYTHON IMPLEMENTATION
target            0 ccaacgacgaccca-ccgccgttttgat--gtg-gaacg-gcctgtcaagtgg--ccga-
                  0 |||.-|.||||.||-||.||....-|||--||.-|.|-|-|||---|||.|.|--||||-
query             2 ccag-gccgacacaaccacccaca-gatcagtcagta-gagcc---caattcgtaccgaa

target           52 ttagtg--cttgttgcctcggggttgtttggggtttctggctttgatccg 100
                 60 ||.|.|--|||-||.|.||----|-|.|.|.|.|..|..|||.-||-||| 110
query            56 tttgggggctt-ttacgtc----t-gctcgcgctacccagcta-ga-ccg  98

Score: 49.0

MANUAL IMPLEMENTATION
Local Alignment:
ccaacgacgac-ccaccgccgttttgat--gt-ggaacggcctgtcaagt-g-gccg-a-tt-agtgcttgttgcctcggggttgtttggggtttctggctttgatccg
cc-aggccgacacaaccacc-cacagatcagtcagtagagcc---caattcgtaccgaatttgggggctt-ttacgtc-----tgctcgcgctacccagc-taga-ccg

Score: 49.0
