## Implementation of the Needleman-Wunsch Algorithm

The following code takes a FASTA file containing the two protein sequences to be aligned.

In [None]:
def read_fasta(file_path):
    """Parse a FASTA file into a dict mapping each header to its sequence.

    A line starting with '>' opens a new record; the text after the '>'
    (whitespace-stripped) becomes the dict key. Every following non-header
    line is stripped and appended to that record's sequence. Blank lines
    are ignored. If the same header occurs twice, the later record
    replaces the earlier one.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A dict of header -> concatenated sequence string, in file order.

    Raises:
        ValueError: If sequence data appears before the first '>' header.
        OSError: If the file cannot be opened.
    """
    # Collect the pieces of each record in a list and join once at the end,
    # rather than growing a string with repeated '+='.
    fragments = {}
    current_sequence = None

    with open(file_path, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines carry no data; skipping them also keeps a
                # leading empty line from being treated as orphan sequence.
                continue
            if line.startswith('>'):
                current_sequence = line[1:]
                fragments[current_sequence] = []
            elif current_sequence is None:
                raise ValueError(
                    f"{file_path}: line {line_number}: sequence data found "
                    "before the first '>' header"
                )
            else:
                fragments[current_sequence].append(line)

    return {name: ''.join(parts) for name, parts in fragments.items()}

def initialize_matrix(rows, cols):
    """Return a rows x cols grid of zeros as a list of separate row lists."""
    grid = []
    for _ in range(rows):
        # A fresh list per row, so writing to one row never affects another.
        grid.append([0] * cols)
    return grid

def needleman_wunsch(seq1, seq2, gap_penalty=-2, match_score=1, mismatch_penalty=-1):
    """Globally align two sequences with the Needleman-Wunsch algorithm.

    Fills a (len(seq1) + 1) x (len(seq2) + 1) dynamic-programming matrix,
    prints it to stdout, and traces back from the bottom-right cell to
    build one optimal alignment. When several moves explain a cell's
    score, the diagonal (match/mismatch) is preferred, then "up" (gap in
    seq2), then "left" (gap in seq1).

    Args:
        seq1: First sequence; indexes the matrix rows.
        seq2: Second sequence; indexes the matrix columns.
        gap_penalty: Score added for aligning a residue against a gap.
        match_score: Score added when two aligned residues are equal.
        mismatch_penalty: Score added when two aligned residues differ.

    Returns:
        A tuple (aligned_seq1, aligned_seq2) of equal-length strings in
        which '-' marks a gap.
    """
    len_seq1 = len(seq1)
    len_seq2 = len(seq2)

    # Scoring matrix; each row is its own list.
    score_matrix = [[0] * (len_seq2 + 1) for _ in range(len_seq1 + 1)]

    # First column / first row: a prefix aligned entirely against gaps.
    for i in range(len_seq1 + 1):
        score_matrix[i][0] = i * gap_penalty

    for j in range(len_seq2 + 1):
        score_matrix[0][j] = j * gap_penalty

    # Fill in the scoring matrix.
    for i in range(1, len_seq1 + 1):
        for j in range(1, len_seq2 + 1):
            substitution = match_score if seq1[i - 1] == seq2[j - 1] else mismatch_penalty
            match = score_matrix[i - 1][j - 1] + substitution
            delete = score_matrix[i - 1][j] + gap_penalty
            insert = score_matrix[i][j - 1] + gap_penalty

            score_matrix[i][j] = max(match, delete, insert)

    # Print the scoring matrix
    print("Scoring Matrix:")
    for row in score_matrix:
        print(row)

    # Traceback to find the alignment. Columns are collected back-to-front
    # in lists and reversed once at the end, which avoids rebuilding the
    # alignment strings on every step.
    reversed_seq1 = []
    reversed_seq2 = []
    i, j = len_seq1, len_seq2

    while i > 0 or j > 0:
        current_score = score_matrix[i][j]

        if i > 0 and j > 0:
            substitution = match_score if seq1[i - 1] == seq2[j - 1] else mismatch_penalty
            from_diagonal = current_score == score_matrix[i - 1][j - 1] + substitution
        else:
            from_diagonal = False

        if from_diagonal:
            reversed_seq1.append(seq1[i - 1])
            reversed_seq2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif i > 0 and (j == 0 or current_score == score_matrix[i - 1][j] + gap_penalty):
            # On the first column (j == 0) "up" is the only legal move, so
            # take it without comparing scores: the border was filled as
            # i * gap_penalty, which for float penalties need not equal
            # score_matrix[i - 1][0] + gap_penalty exactly.
            reversed_seq1.append(seq1[i - 1])
            reversed_seq2.append('-')
            i -= 1
        else:
            # Either on the first row (i == 0), or an interior cell whose
            # score came from the left neighbour; j > 0 holds in both cases.
            reversed_seq1.append('-')
            reversed_seq2.append(seq2[j - 1])
            j -= 1

    aligned_seq1 = ''.join(reversed(reversed_seq1))
    aligned_seq2 = ''.join(reversed(reversed_seq2))

    return aligned_seq1, aligned_seq2

if __name__ == "__main__":
    # Example usage: align the first two records of a FASTA file located in
    # the current working directory and print the resulting alignment.
    fasta_file = "protein_sequences.fasta"
    sequences = read_fasta(fasta_file)

    # Fail with a clear message instead of an opaque tuple-unpacking error
    # when the file holds fewer than two records.
    if len(sequences) < 2:
        raise SystemExit(
            f"{fasta_file} must contain at least two sequences; "
            f"found {len(sequences)}"
        )

    # Only the first two records are aligned; any others are ignored.
    seq1_name, seq2_name = list(sequences)[:2]
    seq1 = sequences[seq1_name]
    seq2 = sequences[seq2_name]

    aligned_seq1, aligned_seq2 = needleman_wunsch(seq1, seq2)

    print(f"Alignment for {seq1_name} and {seq2_name}:")
    print(aligned_seq1)
    print(aligned_seq2)
