In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

%matplotlib inline

**Question 1:** Implement this global alignment model. 

In [36]:
# use the examples from the hint as the actual and observed read
actual_sequence = 'TCACCTACCGCGGTCGGCGCGTCTTCGGCCCG'
observed_read = 'ACACCACCGCGGGTGGCGCGCCCCCGGCTCCG'

# set up score matrix
score_matrix = np.array([[1.85, -2.91, -2.91, -2.91], # A
                     [-2.91, 1.85, -2.91, -2.91], # C
                     [-2.91, -2.91, 1.85, -2.91], # G
                     [0, 0, 0, 0]]) # T

# set up a dictionary to reference the four nucleotides
nts = {"A": 0, "C": 1, "G": 2, "T": 3}

Below is my function for the global alignment model, which takes in an actual sequence, an observed read, and a gap score. It initializes a matrix of zeros, then initializes the first column and first row with the gap_score * position for T's that occur at the start of the sequence; otherwise, the values are set to negative infinity at the first instance of a non-T read. Then, the function iterates through the pair and builds the current optimal alignment of prefix X1..Xi to Y1..Yj. The function then returns the optimal global alignment score.

In [37]:
def align_global(actual, read, gap_score):

    L = len(actual) # X
    M = len(read) # Y

    # initialization of alignment matrix
    S = np.zeros((L+1,M+1))

    # first position (top left) is 0
    S[0,0] = 0

    # actual (first column of matrix S)
    for position in range(1, L+1):
        base = actual[position-1]

        # penalize by gap_score if there is a T
        if base == 'T':
            S[position, 0] = gap_score * position
        else:
            S[position, 0] = -np.inf
            # set the rest of the column to negative infinity after the first non-'T' nucleotide
            S[position+1:, 0] = -np.inf
            break

    # read (first row of matrix S)
    for position in range(1, M+1):
        base = read[position-1]

        # penalize by gap_score if there is a T
        if base == 'T':
            S[0, position] = gap_score * position
        else:
            S[0, position] = -np.inf
            # set the rest of the row to negative infinity after the first non-'T' nucleotide
            S[0, position+1:] = -np.inf
            break
            
    for i in range(1, L+1):
        for j in range(1, M+1):
            # insertions allowed only if read[j-1] is 'T'
            if read[j-1] == 'T':
                insert = S[i, j-1] + gap_score
            else:
                insert = S[i, j-1]-np.inf  # disallow insertion

            # deletions allowed only if actual[i-1] is 'T'
            if actual[i-1] == 'T':
                delete = S[i-1, j] + gap_score
            else:
                delete = S[i-1, j]-np.inf  # disallow deletion

            # diagonals: match/mismatch case
            x = nts[actual[i-1]]
            y = nts[read[j-1]]
            match = S[i-1, j-1] + score_matrix[x][y]

            # update the score in the matrix
            S[i, j] = max(insert, delete, match)

    return S[L, M]

In [38]:
# test our gap score alignment function on the hint sequences
align_global(actual_sequence, observed_read, -5)

np.float64(33.34000000000001)

This matches our score of 33.34 from the problem set page.

**Question 2:** Check that the -5 gap score seems reasonable.

In [39]:
import random
nucleotides = ['A', 'C', 'G', 'T']

In [40]:
error_process = {
    'A': {'A': 0.9, 'C': 0.03, 'G': 0.03, 'T': 0.03},
    'C': {'A': 0.03, 'C': 0.9, 'G': 0.03, 'T': 0.03},
    'G': {'A': 0.03, 'C': 0.03, 'G': 0.9, 'T': 0.03},
    'T': {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
}

# function to use the probabilistic model error and generate a simulated read
def sim_read(actual):
    # initialize empty list for our observed read
    read = []

    # implement the probabilistic error model
    for base in actual:
        if base == 'T' and random.random() < 0.03125:  # 3.125% deletion chance
            # 'T' deletion
            continue
        if random.random() < 0.03125: # 3.125% insertion chance
            # append the 'T' to the list 
            read.append('T')
        # utilize the given error process matrix to weight the random generator
        else:
            observed_base = random.choices(population=['A', 'C', 'G', 'T'],
                                           # input weights
                                           weights=[error_process[base]['A'], error_process[base]['C'], 
                                           error_process[base]['G'], error_process[base]['T']])[0]
            # append the generated base to the list 
            read.append(observed_base)

    read_pos = ''.join(read)

    # ensure that the reads are all 32 nucleotides long before returning the final read
    # this utilizes the same error process as above!
    if len(read_pos) < 32:
        nts_left = 32 - len(read_pos)
        for i in range(1, nts_left):
            if random.random() < 0.03125: 
                read.append('T')
            else:
                observed_base = random.choices(population=['A', 'C', 'G', 'T'],
                                               # input weights
                                               weights=[error_process[base]['A'], error_process[base]['C'], 
                                               error_process[base]['G'], error_process[base]['T']])[0]
                read.append(observed_base)
    read_pos = ''.join(read)
    return read_pos

In [41]:
# initialize different gap score choices
gap_score_choices = [0, -2, -4, -5, -6, -8, -10]

In [42]:
# initialize an empty list to store the mean global optimal alignment scores for a given gap score
positive_means = []

# length of the sequence
length = 32

# iterate through the gap score choices
for gap_score in gap_score_choices:
    # initialize an empty list to store all of the global optimal alignment scores for a given gap score
    positive_control_scores = []

    # generate 2000 positive simulated pairs
    for i in range(2000):
        # generate our actual positive control sequence (randomly)
        actual_pos = ''.join(random.choices(nucleotides, k = length))

        # generate a read using the probabilistic error model
        read_pos = sim_read(actual_pos)

        # calculate the optimal global alignment score between the actual sequence and the observed read
        score = align_global(actual_pos, read_pos, gap_score)

        positive_control_scores.append(score)
    
    # take the average of all of the global alignment scores foor a given gap score
    mean_score = np.mean(positive_control_scores)

    # store the mean in 'positive_means'
    positive_means.append(mean_score)

In [43]:
# negative control alignment score averages
negative_means = []

# length of the sequence
length = 32

# iterate through the gap score choices
for gap_score in gap_score_choices:

    # initialize an empty list to store all of the global optimal alignment scores for a given gap score
    negative_control_scores = []

    # generate 2000 negative simulated pairs, randomly 
    for i in range(2000):
        neg_1 = ''.join(random.choices(nucleotides, k = length))
        neg_2 = ''.join(random.choices(nucleotides, k = length))

        # calculate the global alignment score between the pair of negative controls
        score = align_global(neg_1, neg_2, gap_score)
        negative_control_scores.append(score)
    mean_score = np.mean(negative_control_scores)
    negative_means.append(mean_score)

In [44]:
# create a dataframe in order to view all of the information side by side
# columns: gap score, mean of the positive controls, mean of the negative controls, difference between the means
df = pd.DataFrame(columns=['gap_score', 'positive_control_mean', 'negative_control_mean', 'difference'])
df['gap_score'] = gap_score_choices
df['negative_control_mean'] = negative_means
df['positive_control_mean'] = positive_means
df['difference'] = df['positive_control_mean'] - df['negative_control_mean'] 
df

Unnamed: 0,gap_score,positive_control_mean,negative_control_mean,difference
0,0,31.97697,-16.801055,48.778025
1,-2,30.336655,-28.4202,58.756855
2,-4,29.560085,-34.99485,64.554935
3,-5,29.553245,-36.12004,65.673285
4,-6,29.35763,-37.61076,66.96839
5,-8,28.4874,-39.735245,68.222645
6,-10,28.071125,-40.645115,68.71624


**Answer:** We want to aim for a significant difference in the means between the positive and negative controls. We would not want to choose -10 as a gap score penalty, because this is a harsh penalty; if we are too harsh on an insertion, we are going to be very strict on alignment and therefore will accrue too many mismatch penalties. If we are too soft on alignment, we won't accumulate enough penalties to get to a score. This changes with the probability of insertion/deletion given: for instance, we would likely be more forgiving in allowing insertions or deletions if there is a higher probability of them occurring. Therefore, -5 would be the best choice.

**Question 3:** Implement the alignment model for genome scanning.

In [28]:
# alignment function with modified recursion and tracking best score's position
def align_local(actual, read, gap_score):
    
    L = len(actual)  
    M = len(read)    

    # initialize the alignment matrix
    S = np.zeros((L+1, M+1))

    # first position (top left) is 0, and we allow alignment to start anywhere (local alignment)
    S[1:, 0] = 0     # change initialization to allow alignment to start anywhere along 'actual'
    S[0, 1:] = 0     # change initialization to allow alignment to start anywhere along 'read'

    # initialize 'best_score' variable to track the best score and its position
    best_score = 0
    best_pos = (0, 0)  # initialize coordinates for the position where the best score is found

    # recursion step to fill the matrix
    for i in range(1, L+1):
        for j in range(1, M+1):
            # insertions allowed only if read[j-1] is 'T'
            if read[j-1] == 'T':
                insert = S[i, j-1] + gap_score
            else:
                insert = S[i, j-1]-np.inf  # disallow insertion

            # deletions allowed only if actual[i-1] is 'T'
            if actual[i-1] == 'T':
                delete = S[i-1, j] + gap_score
            else:
                delete = S[i-1, j]-np.inf  # disallow deletion

            # diagonals: match/mismatch case
            x = nts[actual[i-1]]
            y = nts[read[j-1]]
            match = S[i-1, j-1] + score_matrix[x][y]

            # update the score in the matrix; allow alignment to start anywhere (hence max with 0)
            S[i, j] = max(0, insert, delete, match)

            # update best score and its position
            if S[i, j] > best_score:
                best_score = S[i, j]
                best_pos = (i, j)

    # termination: the best score, and the position of the best score
    return best_score, best_pos

**Question 4:** Demonstrate proof of principle.

In [29]:
# function to convert FASTA file to a string
def converter(file_name):

    # read and open the file
    with open(file_name, 'r') as file:
        sequence = ''
        for line in file:

            # skip over the sequence name
            if line.startswith('>'):
                continue

            # strip whitespace/linebreaks from every line and then add to the sequence string
            else:
                sequence += line.strip()

    # return the cleaned, isolated DNA sequence
    return sequence

In [30]:
# convert the three files and store them in their respective names
citius = converter('Citius.txt')
tpa4 = converter('TPA4.txt')
vulture = converter('Vulture.txt')

# set our example read and gap score
example_read = 'GCAGCGGAGGAACCAGACATACTGACGAGCCGT'
gap_score = -5

Below, we use the <code>align_local</code> function to search each genome with the 'GCAGCGGAGGAACCAGACATACTGACGAGCCGT' example read and return the best score and position of the best score in each of the three genomes.

In [31]:
align_local(citius, example_read, gap_score)

(np.float64(33.10000000000002), (23901, 32))

In [13]:
align_local(tpa4, example_read, gap_score)

(np.float64(21.140000000000004), (5810, 18))

In [14]:
align_local(vulture, example_read, gap_score)

(np.float64(17.439999999999998), (34565, 18))

**Answer:** It appears that Citius is the genome that the read 'GCAGCGGAGGAACCAGACATACTGACGAGCCGT' came from, which we can conclude after seeing that it has the highest best alignment score (33.1) from the Citius genome. The end position of the read is (23901, 32).

In [47]:
%load_ext watermark
%watermark -v -m -p numpy,matplotlib,seaborn,pandas,jupyter

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Python implementation: CPython
Python version       : 3.12.5
IPython version      : 8.27.0

numpy     : 2.1.1
matplotlib: 3.9.2
seaborn   : 0.13.2
pandas    : 2.2.2
jupyter   : 1.1.1

Compiler    : Clang 16.0.6 
OS          : Darwin
Release     : 21.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 4
Architecture: 64bit

