# Practical 2

In [32]:
import numpy as np
import pandas as pd
from collections import Counter
pd.set_option('display.max_columns', None)

def read_sequences(path: str) -> list[str]:
    with open(path, "r") as f:
        seqs = f.read()
    seqs = seqs.replace("\n", "").split(">")[1:]
    seqs = [x.split("bp")[1] for x in seqs]

    # check that all sequences have same length
    n_nucleotides = len(seqs[0])
    for s in seqs:
        if len(s) != n_nucleotides:
            raise Exception("Sequences have different lengths")
    
    return seqs

seqs = read_sequences("handins/handin2/sequences.fa")
seqs

['GTCGACTGCACTCGCCCCCACGAGAGAACAGTATTTAAGGAGCTGCGAAGGTCCAAGTCACCGATTATTGTCTCAGTGCAGTTGTCAGTTGCAGTTCAGCAGACGGGCTAACGAGTACTTGCATCTCTTCAAATTTACTTAATTGATCAAGTAAGTAGCAAAAGGGCACCCAATTAAAGGAAATTCTTGTTTAATTGAATTTATTATGCAAGTGCGGAAATAAAATGACAGTATTAAATAGTAAATATTTTGTAAAATCATATATAATCAAATTTATTCAATCAGAACTAATTCAAGCTGTCACAAGTAGTGCGAACTCAATTAATTGGCATCGAATTAAAATTTGGAGTCCTGTGCCGCATATTCGTCTTGGAAAATCACCTGTTAGTTAACTTCTAAAAATAGGAATTTTAACATAACTCGTCCCTGTTAATCGGCGCCGTGCCTTCGTTAGCTATCTCAAAAGCGAGCGCGTGCAGACGAGCAGTAATTTTCCAAGCATCAGGCATAGTTGGGCATAAATTATAAACATACAAACC..................................GAATACTAATATAGAAAAAGCTTTGCCGGTACAAAATCCCAAACAAAAACAAACCGTGTGTGCCGAAAAATAAA.....................................AATAAACCATAAACTAGGCAGCGCTGCCGTCGCCGGCTGAGCAGCCTGCGTACATAGCCGAGATCGCGTAACGGTAGATAATGAAAAGCTCTACGTAACCGAAGCTTCTGCTGTACGGATCTTCCTATAAATACGGGGCCGACACGAACTGGAAACCAACAACTAACGGAGCCCTCTTCCAATTGAAACAGATCGAAAGAGCCTGCTAAAGCAAAAAAGAAGTCACCATGTCGTTTACTTTGACCAACAAGAACGTGATTTTCGTTGCCGGTCTGGGAGGCATTGGTCTGGACACCAGCAAGGAGCTGCTCAAG

The dots (.) are nucleotides unable to be determined. These can be disregarded.
There is also some N's in there? what to do about those?

## 1.
(40 points) Given a file sequences.fa of genomic sequences, write a Python script to extract the segregating sites
from the sequences into a binary matrix. Give the code of your Python script as your answer to this question,
using the LATEX package listings.

In [33]:
# map each character in each sequence into a integer representation
nucleotide_mapping = {"A": 0, "C": 1, "G": 2, "T": 3, ".": 4, "N": 5}
mapped_seqs = []

for s in seqs:
    s_mapped = [nucleotide_mapping[x] for x in [x for x in s]]
    mapped_seqs.append(s_mapped)
mapped_seqs = np.array(mapped_seqs)

# filter out unused columns because of "."
filtered_seqs = []
for col in range(mapped_seqs.shape[1]):
    col_data = mapped_seqs[:, col]
    if (4 in col_data) or (5 in col_data):
        continue
    else:
        filtered_seqs.append(col_data)

filtered_seqs = np.array(filtered_seqs)
filtered_seqs

array([[2, 2, 2, ..., 2, 2, 2],
       [3, 3, 3, ..., 3, 3, 3],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [3, 3, 3, ..., 3, 3, 3],
       [3, 3, 0, ..., 3, 3, 3]])

In [34]:
base_seq = []
mutant_seq = []
for c in filtered_seqs:
    counter = Counter(c)
    base_seq.append(max(counter, key=counter.get))
    mutant_seq.append(min(counter, key=counter.get))

print("Using the mapping above, the sequences are:")
print(f"Base Sequence: {base_seq}\nMutant Sequence: {mutant_seq}")

Using the mapping above, the sequences are:
Base Sequence: [2, 3, 1, 2, 0, 1, 3, 2, 1, 0, 1, 3, 1, 2, 1, 1, 1, 1, 1, 0, 1, 2, 0, 2, 0, 2, 0, 0, 1, 0, 2, 3, 0, 3, 3, 3, 0, 0, 2, 2, 0, 2, 1, 3, 2, 1, 2, 0, 0, 2, 2, 3, 1, 1, 0, 0, 2, 3, 1, 0, 1, 1, 1, 0, 3, 3, 0, 3, 3, 2, 3, 1, 3, 1, 0, 2, 3, 2, 1, 0, 2, 3, 3, 2, 3, 1, 0, 2, 3, 3, 2, 1, 0, 2, 3, 3, 1, 0, 2, 1, 0, 2, 0, 1, 2, 2, 2, 1, 3, 0, 0, 1, 2, 0, 2, 3, 0, 1, 3, 3, 2, 1, 0, 3, 1, 3, 1, 3, 3, 1, 0, 0, 0, 3, 3, 3, 0, 1, 3, 3, 0, 0, 3, 3, 2, 0, 3, 1, 0, 0, 2, 3, 0, 0, 2, 3, 0, 2, 1, 0, 0, 0, 0, 2, 2, 2, 1, 0, 1, 1, 1, 0, 0, 3, 3, 0, 0, 0, 2, 2, 0, 0, 0, 3, 3, 1, 3, 3, 2, 3, 3, 3, 0, 0, 3, 3, 2, 0, 0, 3, 3, 3, 0, 3, 3, 0, 3, 2, 1, 0, 0, 2, 3, 2, 1, 2, 2, 0, 0, 0, 3, 0, 0, 0, 0, 3, 2, 0, 1, 0, 2, 3, 0, 3, 3, 0, 0, 0, 3, 0, 2, 3, 0, 0, 0, 3, 0, 3, 3, 3, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 3, 0, 3, 0, 3, 0, 0, 3, 1, 0, 0, 0, 3, 3, 3, 0, 3, 3, 1, 0, 0, 3, 1, 0, 2, 0, 0, 1, 3, 0, 0, 3, 3, 1, 0, 0, 2, 1, 3, 2, 3, 1, 0, 1, 0, 0, 2, 3, 0, 2, 3, 2, 1, 2

In [35]:
# create matrix representation:
final_matrix = []

for i in range(len(seqs)):
    return_l = []
    for v, base in zip(filtered_seqs[:, i], base_seq):
        if v != base:
            return_l.append(1) # segregating site
        else:
            return_l.append(0)
        
    final_matrix.append(return_l)

final_matrix = np.array(np.array(final_matrix))

## 2.
(5 points) How many genomic sequences are there?

In [36]:
len(seqs)

11

## 3.
(5 points) How many segregating sites do they have?

In [37]:
np.sum(final_matrix == 1)

118

## 4. 
(40 points) Given a file sequences.fa of genomic sequences, write a Python script to determine whether there is a perfect phylogeny for the segregating sites of the sequences.

In [40]:
def check_perfect_phylogeny(matrix: np.array) -> bool:
    rows, cols = matrix.shape
    
    # set to hold the positions of ones in the matrix
    O = set()
    
    L = np.zeros((rows, cols))  # Initialize to zeros
    
    # Add positions of ones in the matrix to the set O 
    for r in range(rows):
        for c in range(cols):
            if matrix[r][c] == 1:
                O.add((r, c))
    
    # Calculate the L matrix based on the positions in O
    for i, j in O:
        for k in range(j):
            if (i, k) in O:
                L[i, j] = k+1
    
    
    L_ = np.zeros(cols)
    
     # For each column, find the largest value in L for that column among rows that are in set O
    for j in range(cols):
        largest = 0
        for i in range(rows):
            if (i, j) in O and L[i, j] > largest:
                largest = L[i, j]
        L_[j] = largest
    for i, j in O:
        if L[i, j] != L_[j]:
            return False
            
    return True

In [39]:
check_perfect_phylogeny(final_matrix)

False

## 5. What is the running time of your script, as a function of the number n of genomic sequences and the number m of segregating sites?

WORST CASES:

constructing O (first loop): n*m

calculating L (second loop): n*m^2

calculating L_ (third loop): n*m

final check (fourth loop): n*m

so the running time of the function is O(n * m^2)

## 6. What is the best possible running time of an algorithm to solve the perfect phylogeny problem?

By sorting the columns in decreasing order and removing repeated columns you can get the running time to O(nm).