# In-class activity: Dynamic programming

TODO:

## Random sequences

We will use two example sequences.

In [1]:
# Two example input strings over the characters A/C/G/T; they are passed to
# needleman_wunsch() further down as the pair of sequences to align.
seq1 = "GCATGTAGGCGCTGGACTCGCTAGTAGTACAATGGCCGCCTCAGTGATGCGCGTAACCTAGTACGATGCCTAGTGAATT"
seq2 = "GGCGATAAGTTAAATTGTGTCAAGGGATGTCTTCGGAGTTCGAGCAACTGCATACCCCCAGTTAACGTCGTCC"

## Needleman–Wunsch Alignment


### Initialize matrix

In [2]:
def initialize_matrix(seq1, seq2, gap):
    """
    Build the starting score grid for a global alignment.

    The grid has len(seq1) + 1 rows and len(seq2) + 1 columns. The first
    row and first column hold cumulative gap penalties (index * gap); the
    top-left cell and every interior cell start at 0.

    Parameters:
        seq1: The first sequence (one grid row per character, plus one).
        seq2: The second sequence (one grid column per character, plus one).
        gap: Penalty for a gap.

    Returns a list of row lists.
    """
    row_count = len(seq1) + 1
    col_count = len(seq2) + 1

    # Each row is its own list so later writes to one row never leak into another.
    grid = [[0] * col_count for _ in range(row_count)]

    # Left border: a prefix of seq1 aligned entirely against gaps.
    for r in range(1, row_count):
        grid[r][0] = r * gap

    # Top border: a prefix of seq2 aligned entirely against gaps.
    for c in range(1, col_count):
        grid[0][c] = c * gap

    return grid

### Fill matrix

In [3]:
def fill_matrix(seq1, seq2, match, mismatch, gap, matrix):
    """
    Populate the interior of the score grid in place (Needleman-Wunsch recurrence).

    Every cell (i, j) with i >= 1 and j >= 1 is set to the best of:
      * the diagonal neighbour plus match/mismatch for seq1[i-1] vs seq2[j-1],
      * the neighbour above plus the gap penalty,
      * the neighbour to the left plus the gap penalty.

    Parameters:
        seq1: The first sequence (indexes grid rows).
        seq2: The second sequence (indexes grid columns).
        match: Score awarded for a match.
        mismatch: Penalty for a mismatch.
        gap: Penalty for a gap.
        matrix: Grid whose first row and column are already initialised;
            it is modified in place and nothing is returned.
    """
    for row, char1 in enumerate(seq1, start=1):
        above = matrix[row - 1]
        current = matrix[row]
        for col, char2 in enumerate(seq2, start=1):
            pair_score = match if char1 == char2 else mismatch
            # Candidates are listed diagonal, up, left; on ties max() keeps
            # the earliest one.
            current[col] = max(
                above[col - 1] + pair_score,
                above[col] + gap,
                current[col - 1] + gap,
            )

### Traceback

In [4]:
def traceback(seq1, seq2, match, mismatch, gap, matrix, i, j):
    """
    Enumerate every optimal alignment that ends at cell (i, j).

    Starting from (i, j), follow each predecessor cell whose score plus the
    cost of the step equals matrix[i][j], recursing until (0, 0) is reached.

    Parameters:
        seq1: The first sequence.
        seq2: The second sequence.
        match: Score awarded for a match.
        mismatch: Penalty for a mismatch.
        gap: Penalty for a gap.
        matrix: The filled score grid.
        i: Row index of the cell to trace back from.
        j: Column index of the cell to trace back from.

    Returns a list of tuples (aligned_seq1, aligned_seq2); gaps are written
    as "-".
    """
    # Reaching the origin means there is nothing left to align.
    if i == 0 and j == 0:
        return [("", "")]

    here = matrix[i][j]

    # Collect the predecessor cells consistent with the score at (i, j),
    # together with the column each one contributes to the alignment.
    # Order matters for the output order: diagonal, then up, then left.
    steps = []

    if i > 0 and j > 0:
        pair_score = match if seq1[i - 1] == seq2[j - 1] else mismatch
        if here == matrix[i - 1][j - 1] + pair_score:
            # Diagonal: seq1[i-1] is aligned with seq2[j-1].
            steps.append((i - 1, j - 1, seq1[i - 1], seq2[j - 1]))

    if i > 0 and here == matrix[i - 1][j] + gap:
        # Up: seq1[i-1] is aligned with a gap.
        steps.append((i - 1, j, seq1[i - 1], "-"))

    if j > 0 and here == matrix[i][j - 1] + gap:
        # Left: a gap is aligned with seq2[j-1].
        steps.append((i, j - 1, "-", seq2[j - 1]))

    results = []
    for prev_i, prev_j, col1, col2 in steps:
        prefixes = traceback(seq1, seq2, match, mismatch, gap, matrix, prev_i, prev_j)
        for prefix1, prefix2 in prefixes:
            # Extend each shorter alignment with this step's column.
            results.append((prefix1 + col1, prefix2 + col2))

    return results

### Putting it together


In [5]:
def remove_duplicates(alignments):
    """
    Return the alignments with repeats dropped, keeping first-seen order.

    Parameters:
        alignments: Iterable of hashable items (e.g. tuples of two strings).

    Returns a new list; the input is not modified.
    """
    # dict keys are unique and keep insertion order, so the first occurrence
    # of each alignment survives in its original position.
    return list(dict.fromkeys(alignments))

In [6]:
def needleman_wunsch(seq1, seq2, match=1, mismatch=-1, gap=-2):
    """
    Globally align two sequences with the Needleman–Wunsch algorithm.

    Builds the score grid, fills it, then traces back from the bottom-right
    cell to collect the optimal alignments.

    Parameters:
        seq1: The first sequence.
        seq2: The second sequence.
        match: Score awarded for a match.
        mismatch: Penalty for a mismatch.
        gap: Penalty for a gap.

    Returns a tuple (alignments, score): the de-duplicated list of
    (aligned_seq1, aligned_seq2) pairs and the value of the bottom-right
    grid cell.
    """
    last_row, last_col = len(seq1), len(seq2)

    scores = initialize_matrix(seq1, seq2, gap)
    fill_matrix(seq1, seq2, match, mismatch, gap, scores)  # fills scores in place

    found = traceback(seq1, seq2, match, mismatch, gap, scores, last_row, last_col)
    return remove_duplicates(found), scores[last_row][last_col]

In [10]:
# Align the two example sequences with needleman_wunsch using its default
# match/mismatch/gap scores.
alignments, alignment_score = needleman_wunsch(seq1, seq2)

print("=== Needleman-Wunsch alignments ===\n")
print(f"Alignment Score: {alignment_score}\n")

# Print only the first five alignments (numbered from 1); the full list is
# kept in `alignments`.
for idx, (aligned1, aligned2) in enumerate(alignments[:5], start=1):
    print(f"Alignment {idx}:")
    print(f"  Seq1 Alignment:  {aligned1}")
    print(f"  Seq2 Alignment:  {aligned2}")
    print()

=== Needleman-Wunsch alignments ===

Alignment Score: -20

Alignment 1:
  Seq1 Alignment:  GCATGTAGGCGCTGGACTCGCTAGTAGTACAATGGCCG-CCTCAGTGATGCGCGTAACCTAGTACGATGCCTAGTGAA--T--T--
  Seq2 Alignment:  G-GCG-ATAAG-TTAAAT---T-GT-GT-CAAGGGATGTCTTCGGAG-TTCGAGCAA-CT-GCA-TACCCCCAGTTAACGTCGTCC

Alignment 2:
  Seq1 Alignment:  GCATGTAGGCGCTGGACTCGCTAGTAGTACAATGGCCG-CCTCAGTGATGCGCGTAACCTAGTACGATGCCTAGTGAA--T--T--
  Seq2 Alignment:  GG-CG-ATAAG-TTAAAT---T-GT-GT-CAAGGGATGTCTTCGGAG-TTCGAGCAA-CT-GCA-TACCCCCAGTTAACGTCGTCC

Alignment 3:
  Seq1 Alignment:  GCATGTAGGCGCTGGACTCGCTAGTAGTACAATGGCCG-CCTCAGTGATGCGCGTAACCTAGTACGATGCCTAGTGAA--T--T--
  Seq2 Alignment:  GGC-G-ATAAG-TTAAAT---T-GT-GT-CAAGGGATGTCTTCGGAG-TTCGAGCAA-CT-GCA-TACCCCCAGTTAACGTCGTCC

Alignment 4:
  Seq1 Alignment:  GCATGTAGGCGCTGGACTCGCTAGTAGTACAATGGCCG-CCTCAGTGATGCGCGTAACCTAGTACGATGCCTAGTGAA--T--T--
  Seq2 Alignment:  GGCGATA--AG-TTAAAT---T-GT-GT-CAAGGGATGTCTTCGGAG-TTCGAGCAA-CT-GCA-TACCCCCAGTTAACGTCGTCC

Alignment 5:
  Seq1 Alignment:  GCATG