# Practical 2

In [79]:
import numpy as np
import pandas as pd
from collections import Counter
pd.set_option('display.max_columns', None)

def read_sequences(path: str) -> list[str]:
    """Read the sequences of a FASTA file and check that they are aligned.

    Every line starting with ">" begins a new record; all following lines up
    to the next header are concatenated (surrounding whitespace stripped) to
    form that record's sequence. Header text is discarded as a whole line, so
    it does not need to end in "bp". Lines before the first header are ignored.

    Args:
        path: Location of the FASTA file.

    Returns:
        One string per record, in file order. An empty list if the file
        contains no records.

    Raises:
        ValueError: If the sequences do not all have the same length.
    """
    records = []
    with open(path, "r") as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith(">"):
                # New record; the header text itself is not needed.
                records.append([])
            elif line and records:
                records[-1].append(line)

    seqs = ["".join(parts) for parts in records]

    # Downstream code works column by column, so every sequence must have
    # the same number of sites.
    if len({len(s) for s in seqs}) > 1:
        raise ValueError("Sequences have different lengths")

    return seqs

# Load the sequences from the FASTA file; the bare `seqs` expression on the
# last line makes the notebook display the loaded list.
seqs = read_sequences("handins/handin2/sequences.fa")
seqs

['GTCGACTGCACTCGCCCCCACGAGAGAACAGTATTTAAGGAGCTGCGAAGGTCCAAGTCACCGATTATTGTCTCAGTGCAGTTGTCAGTTGCAGTTCAGCAGACGGGCTAACGAGTACTTGCATCTCTTCAAATTTACTTAATTGATCAAGTAAGTAGCAAAAGGGCACCCAATTAAAGGAAATTCTTGTTTAATTGAATTTATTATGCAAGTGCGGAAATAAAATGACAGTATTAAATAGTAAATATTTTGTAAAATCATATATAATCAAATTTATTCAATCAGAACTAATTCAAGCTGTCACAAGTAGTGCGAACTCAATTAATTGGCATCGAATTAAAATTTGGAGTCCTGTGCCGCATATTCGTCTTGGAAAATCACCTGTTAGTTAACTTCTAAAAATAGGAATTTTAACATAACTCGTCCCTGTTAATCGGCGCCGTGCCTTCGTTAGCTATCTCAAAAGCGAGCGCGTGCAGACGAGCAGTAATTTTCCAAGCATCAGGCATAGTTGGGCATAAATTATAAACATACAAACC..................................GAATACTAATATAGAAAAAGCTTTGCCGGTACAAAATCCCAAACAAAAACAAACCGTGTGTGCCGAAAAATAAA.....................................AATAAACCATAAACTAGGCAGCGCTGCCGTCGCCGGCTGAGCAGCCTGCGTACATAGCCGAGATCGCGTAACGGTAGATAATGAAAAGCTCTACGTAACCGAAGCTTCTGCTGTACGGATCTTCCTATAAATACGGGGCCGACACGAACTGGAAACCAACAACTAACGGAGCCCTCTTCCAATTGAAACAGATCGAAAGAGCCTGCTAAAGCAAAAAAGAAGTCACCATGTCGTTTACTTTGACCAACAAGAACGTGATTTTCGTTGCCGGTCTGGGAGGCATTGGTCTGGACACCAGCAAGGAGCTGCTCAAG

In [85]:
# Encode every nucleotide character as a small integer so the sequences can
# be handled as a 2-D NumPy array: one row per sequence, one column per site.
nucleotide_mapping = {"A": 0, "C": 1, "G": 2, "T": 3, ".": 4, "N": 5}
gap_code = nucleotide_mapping["."]

# A character that is missing from the mapping raises KeyError here.
mapped_seqs = np.array([[nucleotide_mapping[ch] for ch in s] for s in seqs])

# Keep only the sites (columns) in which no sequence has a "." gap.
# NOTE: sites containing "N" (code 5) are still kept; extend the mask with
# nucleotide_mapping["N"] if those should be dropped as well.
keep_site = ~(mapped_seqs == gap_code).any(axis=0)

# Transposed on purpose: each ROW of filtered_seqs is one retained site and
# holds that site's code for every sequence.
filtered_seqs = mapped_seqs.T[keep_site]
filtered_seqs

array([[2, 2, 2, ..., 2, 2, 2],
       [3, 3, 3, ..., 3, 3, 3],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [3, 3, 5, ..., 3, 3, 3],
       [2, 2, 5, ..., 2, 2, 2],
       [2, 2, 5, ..., 2, 2, 2]])

In [86]:
# Tally how often each nucleotide code occurs at every retained site
# (each row of filtered_seqs is one site).
site_counts = [Counter(site) for site in filtered_seqs]

# Base = most frequent code at the site, mutant = least frequent one.
# Ties go to the code encountered first; a site with a single distinct
# code therefore gets the same value in both lists.
base_seq = [max(tally, key=tally.get) for tally in site_counts]
mutant_seq = [min(tally, key=tally.get) for tally in site_counts]

print(f"Base Sequence: {base_seq}\nMutant Sequence: {mutant_seq}")
    

Base Sequence: [2, 3, 1, 2, 0, 1, 3, 2, 1, 0, 1, 3, 1, 2, 1, 1, 1, 1, 1, 0, 1, 2, 0, 2, 0, 2, 0, 0, 1, 0, 2, 3, 0, 3, 3, 3, 0, 0, 2, 2, 0, 2, 1, 3, 2, 1, 2, 0, 0, 2, 2, 3, 1, 1, 0, 0, 2, 3, 1, 0, 1, 1, 1, 0, 3, 3, 0, 3, 3, 2, 3, 1, 3, 1, 0, 2, 3, 2, 1, 0, 2, 3, 3, 2, 3, 1, 0, 2, 3, 3, 2, 1, 0, 2, 3, 3, 1, 0, 2, 1, 0, 2, 0, 1, 2, 2, 2, 1, 3, 0, 0, 1, 2, 0, 2, 3, 0, 1, 3, 3, 2, 1, 0, 3, 1, 3, 1, 3, 3, 1, 0, 0, 0, 3, 3, 3, 0, 1, 3, 3, 0, 0, 3, 3, 2, 0, 3, 1, 0, 0, 2, 3, 0, 0, 2, 3, 0, 2, 1, 0, 0, 0, 0, 2, 2, 2, 1, 0, 1, 1, 1, 0, 0, 3, 3, 0, 0, 0, 2, 2, 0, 0, 0, 3, 3, 1, 3, 3, 2, 3, 3, 3, 0, 0, 3, 3, 2, 0, 0, 3, 3, 3, 0, 3, 3, 0, 3, 2, 1, 0, 0, 2, 3, 2, 1, 2, 2, 0, 0, 0, 3, 0, 0, 0, 0, 3, 2, 0, 1, 0, 2, 3, 0, 3, 3, 0, 0, 0, 3, 0, 2, 3, 0, 0, 0, 3, 0, 3, 3, 3, 3, 2, 3, 0, 0, 0, 0, 3, 1, 0, 3, 0, 3, 0, 3, 0, 0, 3, 1, 0, 0, 0, 3, 3, 3, 0, 3, 3, 1, 0, 0, 3, 1, 0, 2, 0, 0, 1, 3, 0, 0, 3, 3, 1, 0, 0, 2, 1, 3, 2, 3, 1, 0, 1, 0, 0, 2, 3, 0, 2, 3, 2, 1, 2, 0, 0, 1, 3, 1, 0, 0, 3, 3, 0, 0, 3, 3, 2, 

In [None]:
def extract_base_and_mutant_seq(
    sequences: "np.ndarray | list[str]", gap="."
) -> tuple[list, list]:
    """Return the most and least common symbol of every gap-free site.

    Works on a list of equal-length strings as well as on a 2-D array with
    one row per sequence (for example integer-encoded nucleotides).

    Args:
        sequences: Aligned sequences, one per row. All rows are expected to
            have the same length; iteration stops at the shortest row.
        gap: Symbol marking an undetermined position. Any site (column)
            containing it is skipped. Pass the matching integer code when
            the sequences are integer-encoded.

    Returns:
        ``(base_seq, mutant_seq)`` as two lists with one entry per retained
        site. ``base_seq`` holds the most frequent symbol and ``mutant_seq``
        the least frequent one; they are equal where a site has only one
        distinct symbol. Both lists are empty for empty input.
    """
    base_seq = []
    mutant_seq = []

    # zip(*sequences) walks the alignment column by column.
    for column in zip(*sequences):
        if gap in column:
            continue

        # most_common() sorts by descending count and keeps first-seen order
        # among equal counts, so the result is deterministic and a tied site
        # still reports two different symbols.
        ranked = Counter(column).most_common()
        base_seq.append(ranked[0][0])
        mutant_seq.append(ranked[-1][0])

    return base_seq, mutant_seq
        
# Run the extraction on the loaded `seqs` and keep both results.
base_seq, mutant_seq = extract_base_and_mutant_seq(seqs)

# Print both results, one per line.
print(f"Base sequences: {base_seq}\nMutant sequence: {mutant_seq}")

Base sequences: GTCGACTGCACTCGCCCCCACGAGAGAACAGTATTTAAGGAGCTGCGAAGGTCCAAGTCACCCATTATTGTCTCAGTGCAGTTGTCAGTTGCAGTTCAGCAGACGGGCTAACGAGTACTTGCATCTCTTCAAATTTACTTAATTGATCAAGTAAGTAGCAAAAGGGCACCCAATTAAAGGAAATTCTTGTTTAATTGAATTTATTATGCAAGTGCGGAAATAAAATGACAGTATTAAATAGTAAATATTTTGTAAAATCATATATAATCAAATTTATTCAATCAGAACTAATTCAAGCTGTCACAAGTAGTGCGAACTCAATTAATTGGCATCGAATTAAAATTTGGAGGCCTGTGCCGCATATTCGTCTTGGAAAATCACCTGTTAGTTAACTTCTAAAAATAGGAATTTTAACATAACTCGTCCCTGTTAATCGGCGCCGTGCCTTCGTTAGCTATCTCAAAAGCGAGCGCGTGCAGACGAGCAGTAATTTTCCAAGCATCAGGCATAGAATACTAATATAGAAAAAGCTTTGCCGGTACAAAATCCCAAACAAAAACAAACCGTGTGTGCCGAAAAATAAAAATAAACCATAAACTAGGCAGCGCTGCCGTCGCCGGCTGAGCAGCCTGCGTACATAGCCGAGATCGCGTAACGGTAGATAATGAAAAGCTCTACGTAACCGAAGCTTCTGCTGTACGGATCTTCCTATAAATACGGGGCCGACACGAACTGGAAACCAACAACTAACGGAGCCCTCTTCCAATTGAAACAGATCGAAAGAGCCTGCTAAAGCAAAAAAGAAGTCACCATGTCGTTTACTTTGACCAACAAGAACGTGATTTTCGTGGCCGGTCTGGGAGGCATTGGTCTGGACACCAGCAAGGAGCTGCTCAAGCGCGATCTGAAGGTAACTATGCGATGCCCACAGGCTCCATGCAGCGATGGAGGTTAATCTCGTGTATTCAATCCTAGAACCTGGTG

The dots (.) mark nucleotides that could not be determined. Columns containing them can be disregarded.

## Part 1
(40 points) Given a file `sequences.fa` of genomic sequences, write a Python script to extract the segregating sites
from the sequences into a binary matrix. Give the code of your Python script as your answer to this question,
using the LaTeX package `listings`.

In [78]:
def extract_base_and_mutant_seq(sequences: list[str]) -> tuple[str, str]:
    """Build the base and mutant sequence of a set of aligned sequences.

    Sites (columns) in which any sequence has a "." are skipped. For each
    remaining site, the most frequent nucleotide goes into the base sequence
    and the least frequent one into the mutant sequence.

    Args:
        sequences: Aligned sequences of equal length; iteration stops at the
            shortest one.

    Returns:
        ``(base_seq, mutant_seq)``, each with one character per retained
        site. The two characters are equal where a site has only one
        distinct nucleotide. Both strings are empty for empty input.
    """
    base_chars = []
    mutant_chars = []

    # zip(*sequences) walks the alignment column by column.
    for column in zip(*sequences):
        if "." in column:
            continue

        # most_common() sorts by descending count and keeps first-seen order
        # among equal counts, so ties are resolved deterministically and a
        # tied site still reports two different nucleotides.
        ranked = Counter(column).most_common()
        base_chars.append(ranked[0][0])
        mutant_chars.append(ranked[-1][0])

    return "".join(base_chars), "".join(mutant_chars)
        
# Run the extraction on the loaded `seqs` and keep both results.
base_seq, mutant_seq = extract_base_and_mutant_seq(seqs)

# Print both results, one per line.
print(f"Base sequences: {base_seq}\nMutant sequence: {mutant_seq}")

Base sequences: GTCGACTGCACTCGCCCCCACGAGAGAACAGTATTTAAGGAGCTGCGAAGGTCCAAGTCACCCATTATTGTCTCAGTGCAGTTGTCAGTTGCAGTTCAGCAGACGGGCTAACGAGTACTTGCATCTCTTCAAATTTACTTAATTGATCAAGTAAGTAGCAAAAGGGCACCCAATTAAAGGAAATTCTTGTTTAATTGAATTTATTATGCAAGTGCGGAAATAAAATGACAGTATTAAATAGTAAATATTTTGTAAAATCATATATAATCAAATTTATTCAATCAGAACTAATTCAAGCTGTCACAAGTAGTGCGAACTCAATTAATTGGCATCGAATTAAAATTTGGAGGCCTGTGCCGCATATTCGTCTTGGAAAATCACCTGTTAGTTAACTTCTAAAAATAGGAATTTTAACATAACTCGTCCCTGTTAATCGGCGCCGTGCCTTCGTTAGCTATCTCAAAAGCGAGCGCGTGCAGACGAGCAGTAATTTTCCAAGCATCAGGCATAGAATACTAATATAGAAAAAGCTTTGCCGGTACAAAATCCCAAACAAAAACAAACCGTGTGTGCCGAAAAATAAAAATAAACCATAAACTAGGCAGCGCTGCCGTCGCCGGCTGAGCAGCCTGCGTACATAGCCGAGATCGCGTAACGGTAGATAATGAAAAGCTCTACGTAACCGAAGCTTCTGCTGTACGGATCTTCCTATAAATACGGGGCCGACACGAACTGGAAACCAACAACTAACGGAGCCCTCTTCCAATTGAAACAGATCGAAAGAGCCTGCTAAAGCAAAAAAGAAGTCACCATGTCGTTTACTTTGACCAACAAGAACGTGATTTTCGTGGCCGGTCTGGGAGGCATTGGTCTGGACACCAGCAAGGAGCTGCTCAAGCGCGATCTGAAGGTAACTATGCGATGCCCACAGGCTCCATGCAGCGATGGAGGTTAATCTCGTGTATTCAATCCTAGAACCTGGTG