In [1]:
!pip install biopython



In [2]:
from Bio import SeqIO
import numpy as np
from collections import Counter

In [3]:
def read_fasta(file_path):
    sequences = []
    for record in SeqIO.parse(file_path, "fasta"):
        sequences.append(str(record.seq))
    return sequences


In [4]:
s = read_fasta('./sequences.fa')

In [5]:
filter_indices = []

for i in range(len(s[0])):
        
        # Create a list of all characters in the ith column across all sequences
        col_seq = [seq[i] for seq in s]
        
        
        if '.' in col_seq or 'N' in col_seq:
            
            filter_indices.append(i)
new_seq = ''
for seq in enumerate(s):
    for index in range(len(seq[1])):
        if index not in filter_indices:
            new_seq += seq[1][index]
    s[seq[0]] = new_seq
    new_seq = []

## write a Python script to extract the segregating sites from the sequences into a binary matrix

In [6]:
def make_base_seq(sequences):
    seg_sites = []
    base_seq = ''
    
    # Loop through every column
    for i in range(len(sequences[0])):
        
        # Create a list of all characters in the ith column across all sequences
        col_seq = [seq[i] for seq in sequences]
        
        # Get the most frequent characters in ith column
        most_frequent = max(set(col_seq), key = col_seq.count)
        
        
        # Add this most frequent character to the base sequence
        base_seq += most_frequent
    
    return base_seq

In [7]:
def seg_sites_matrix(base_seq, sequences):
    matrix = []
    
    
    
    # Loop through every sequence
    for seq in sequences:
        row = []
        # Loop through each character in a sequence
        for i in range(len(seq)):
            
            # If the character is the same as the character in the base sequence, append 0 else 1
            if seq[i] == base_seq[i]:
                row.append(0)
            else:
                row.append(1)
        matrix.append(row)
    return np.array(matrix)

In [8]:
base_seq = make_base_seq(s)

## How many genomic sequences are there?

In [9]:
len(s)

11

## How many segregating sites do they have?

In [10]:
binary_matrix = seg_sites_matrix(base_seq, s)
binary_matrix.sum()

118

## write a Python script to determine whether there is a perfect phylogeny for the segregating sites of the sequences.

In [11]:
def check_perfect_phylogeny(matrix):
    rows, cols = matrix.shape
    
    # set to hold the positions of ones in the matrix
    O = set()
    
    L = np.zeros((rows, cols))  # Initialize to zeros
    
    # Add positions of ones in the matrix to the set O 
    for r in range(rows):
        for c in range(cols):
            if matrix[r][c] == 1:
                O.add((r, c))
    
    # Calculate the L matrix based on the positions in O
    for i, j in O:
        for k in range(j):
            if (i, k) in O:
                L[i, j] = k+1
    
    
    L_ = np.zeros(cols)
    
     # For each column, find the largest value in L for that column among rows that are in set O
    for j in range(cols):
        largest = 0
        for i in range(rows):
            if (i, j) in O and L[i, j] > largest:
                largest = L[i, j]
        L_[j] = largest
    for i, j in O:
        if L[i, j] != L_[j]:
            return False
            
    return True

In [12]:
check_perfect_phylogeny(binary_matrix)

False

## What is the running time of your script, as a function of the number n of genomic sequences and the number m of segregating sites?

WORST CASES:

constructing O (first loop): n*m

calculating L (second loop): n*m^2

calculating L_ (third loop): n*m

final check (fourth loop): n*m

so the running time of the function is O(n * m^2)

## What is the best possible running time of an algorithm to solve the perfect phylogeny problem?

By sorting the columns in decreasing order and removing repeated columns you can get the running time to O(nm).