In [9]:
import pandas as pd
import baltic as bt
import numpy as np
import os
import sys
from collections import Counter

# Directory added to sys.path so the local helper module imported below can be
# found. The default is the original workstation path; set the
# UTR_SCRIPTS_DIR environment variable to point somewhere else on another
# machine without editing the notebook.
module_dir = os.environ.get("UTR_SCRIPTS_DIR", "/Users/monclalab1/Documents/scripts/")
# Guard the append so re-running this cell does not stack duplicate entries.
if module_dir not in sys.path:
    sys.path.append(module_dir)

from fasta_editing import fasta_to_df, fasta_writer

In [10]:
# Read the FASTA file into a table; later cells use its 'header' and
# 'sequence' columns.
alignment_fasta = "utr_h3nx_na.fasta"
df = fasta_to_df(alignment_fasta)

In [11]:
# Strain name: the first "|"-delimited field of the header, minus the leading ">".
header_first_field = df['header'].str.split("|").str[0]
df['strain'] = header_first_field.str.extract(r'>(.*)')
# Work in lower case so the lowercase "atg" checks below apply to every row.
df['sequence'] = df['sequence'].str.lower()

# starting UTR
# Rows whose sequence begins with "atg" or with a gap are treated as having
# no starting UTR and are left out.
lacks_start_utr = df['sequence'].str.startswith(('atg', '-'))
df_with_start_utr = df.loc[~lacks_start_utr].copy()
# The starting UTR is everything in front of the first "atg".
pieces_around_first_atg = df_with_start_utr['sequence'].str.split("atg", n=1)
df_with_start_utr['start_UTR'] = pieces_around_first_atg.str[0]
# Lookup table: strain -> starting UTR.
start_utr_dict = df_with_start_utr.set_index('strain')['start_UTR'].to_dict()

# print(start_utr_dict.values())


#ending UTR

def extract_end_utr_by_position(sequence, utr_start=1448):
    """Return the tail of an aligned sequence that starts at a fixed offset.

    Parameters
    ----------
    sequence : str
        One aligned sequence (gaps written as "-").
    utr_start : int, default 1448
        0-based slice offset at which the ending UTR is taken to begin, i.e.
        the tail starts at 1-based column ``utr_start + 1``.
        NOTE(review): earlier inline comments cited position 1466 while the
        code has always sliced at 1448 -- confirm which column is intended
        for this alignment.

    Returns
    -------
    str or None
        ``sequence[utr_start:]`` (gaps kept), or ``None`` when the sequence
        does not extend past ``utr_start`` or the tail consists only of gaps.
    """
    utr_sequence = sequence[utr_start:]
    # The slice is empty for sequences that are too short, and strip("-")
    # empties a gap-only tail, so both cases fall through to None.
    return utr_sequence if utr_sequence.strip("-") else None

# Ending UTR per row; None where the sequence is too short or the tail is all gaps.
df["end_utr"] = df["sequence"].apply(extract_end_utr_by_position)
# Lookup table: strain -> ending UTR, restricted to rows where one was found.
has_end_utr = df["end_utr"].notna()
end_utr_dict = df.loc[has_end_utr].set_index("strain")["end_utr"].to_dict()

print(end_utr_dict.values())

dict_values(['gctttcgcaattttagaaaaaactccttgt-ttctact-', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gcttcgcaaattttag-----------------------', 'gcttcgcaaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gcttcgcaaattttag-----------------------', 'gcttcgcaatttttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttag-----------------------', 'gctttcgcaattttagaaaaaaaactccttgtttctact'

In [15]:
# Load the tree JSON with baltic. mytree is the tree object walked in later
# cells; mymeta is the second value returned by loadJSON (presumably the
# JSON's metadata -- it is not used in the cells shown here).
auspice_json = "canine_build/auspice/canine_h3nx_na.json"
mytree, mymeta = bt.loadJSON(auspice_json)


Tree height: 17.931000
Tree length: 199.609000
annotations present

Numbers of objects in tree: 524 (230 nodes and 294 leaves)



In [16]:
# NODE_0000102 is the branch of interest with 4 mutations
# T16A, Y67H, H155Y, R283Q

target_node = "NODE_0000102"

# Find the branch first and fail loudly when it is missing. Without this
# check a mistyped node name would leave descending_leaves unassigned (or,
# worse, holding the value from a previous run of this cell).
target_branch = None
for k in mytree.Objects:
    if k.name == target_node:
        target_branch = k
        break  # stop at the first match instead of scanning the rest

if target_branch is None:
    raise ValueError(f"Branch {target_node!r} was not found in the tree")

# all strains descending from branch
subtree = mytree.subtree(target_branch)
descending_leaves = {leaf.name for leaf in subtree.getExternal()}

# all remaining strains: every leaf of the full tree that is not under the
# branch (this includes leaves in unrelated clades, not only earlier lineages)
preceding_leaves = {
    j.name
    for j in mytree.Objects
    if j.is_leaf() and j.name not in descending_leaves
}
        

In [17]:
# consensus starting UTR for all descending and preceding leaves
# The leaf names live in sets, and the iteration order of a set of strings
# changes between kernel sessions. get_consensus breaks ties in favour of the
# character it meets first, so the lists are built in sorted name order to
# keep the consensus reproducible across runs.
descending_start_utr_seq = [start_utr_dict[strain] for strain in sorted(descending_leaves) if strain in start_utr_dict]
preceding_start_utr_seq = [start_utr_dict[strain] for strain in sorted(preceding_leaves) if strain in start_utr_dict]

def get_consensus(utr_list, verbose=True):
    """Compute a per-column majority consensus from aligned UTR sequences.

    Parameters
    ----------
    utr_list : list of str
        Aligned sequences. Columns are compared position by position; if the
        sequences differ in length, only the columns up to the shortest
        sequence are used.
    verbose : bool, default True
        When True, print the character counts of every column (the behaviour
        this notebook has always had). Pass False to compute the consensus
        silently.

    Returns
    -------
    str or None
        The most frequent character of each column joined into one string, or
        ``None`` when ``utr_list`` is empty. Every character is counted as
        is, so gaps ("-") and ambiguity codes can win a column. Ties go to
        the character that appears first in ``utr_list`` order.
    """
    if not utr_list:
        return None

    consensus_sequence = []

    for column in zip(*utr_list):
        nucleotide_counts = Counter(column)
        if verbose:
            print(nucleotide_counts)
        # most_common(1) yields [(character, count)]; keep the character only.
        consensus_sequence.append(nucleotide_counts.most_common(1)[0][0])

    return "".join(consensus_sequence)

def _report_consensus(label, utr_list):
    """Print `label`, compute the consensus of `utr_list`, print it and return it."""
    print(label)
    consensus = get_consensus(utr_list)
    print(consensus)
    return consensus


start_consensus_pre_utr = _report_consensus("Consensus starting UTR before branch:", preceding_start_utr_seq)

print(" ")

start_consensus_des_utr = _report_consensus("Consensus starting UTR after branch:", descending_start_utr_seq)


Consensus starting UTR before branch:
Counter({'a': 53})
Counter({'g': 53})
Counter({'c': 53})
Counter({'a': 50, 'g': 3})
Counter({'a': 53})
Counter({'a': 53})
Counter({'a': 53})
Counter({'g': 53})
Counter({'c': 53})
Counter({'a': 53})
Counter({'g': 53})
Counter({'g': 53})
Counter({'a': 53})
Counter({'g': 53})
Counter({'t': 51, 'g': 2})
Counter({'a': 50, 'g': 3})
Counter({'a': 53})
Counter({'a': 51, 'g': 2})
Counter({'a': 52, 'g': 1})
agcaaaagcaggagtaaaa
 
Consensus starting UTR after branch:
Counter({'a': 65})
Counter({'g': 65})
Counter({'c': 65})
Counter({'a': 56, 'n': 6, 'r': 2, 'g': 1})
Counter({'a': 65})
Counter({'a': 65})
Counter({'a': 65})
Counter({'g': 65})
Counter({'c': 65})
Counter({'a': 65})
Counter({'g': 65})
Counter({'g': 65})
Counter({'a': 65})
Counter({'g': 65})
Counter({'t': 65})
Counter({'a': 65})
Counter({'a': 65})
Counter({'a': 65})
Counter({'a': 65})
agcaaaagcaggagtaaaa


In [18]:
# consensus ending UTR for all descending and preceding leaves
# The leaf names live in sets, and the iteration order of a set of strings
# changes between kernel sessions. get_consensus breaks ties in favour of the
# character it meets first, so the lists are built in sorted name order to
# keep the consensus reproducible across runs.
descending_end_utr_seq = [end_utr_dict[strain] for strain in sorted(descending_leaves) if strain in end_utr_dict]
preceding_end_utr_seq = [end_utr_dict[strain] for strain in sorted(preceding_leaves) if strain in end_utr_dict]

# get_consensus is defined in the starting-UTR cell earlier in the notebook;
# it is reused here instead of being redefined with the same body.

print("Consensus ending UTR before branch:")
end_consensus_pre_utr = get_consensus(preceding_end_utr_seq)
print(end_consensus_pre_utr)

print(" ")

print("Consensus ending UTR after branch:")
end_consensus_des_utr = get_consensus(descending_end_utr_seq)
print(end_consensus_des_utr)


Consensus ending UTR before branch:
Counter({'g': 54})
Counter({'c': 54})
Counter({'t': 54})
Counter({'t': 54})
Counter({'t': 54})
Counter({'c': 54})
Counter({'g': 54})
Counter({'c': 54})
Counter({'a': 54})
Counter({'a': 54})
Counter({'t': 54})
Counter({'t': 54})
Counter({'t': 54})
Counter({'t': 54})
Counter({'a': 54})
Counter({'g': 54})
Counter({'a': 54})
Counter({'a': 54})
Counter({'a': 54})
Counter({'a': 54})
Counter({'a': 54})
Counter({'a': 54})
Counter({'a': 51, 'c': 2, '-': 1})
Counter({'c': 47, 'a': 4, 't': 2, '-': 1})
Counter({'t': 47, 'c': 6, '-': 1})
Counter({'c': 49, 't': 4, '-': 1})
Counter({'c': 51, 't': 2, '-': 1})
Counter({'t': 49, 'c': 4, '-': 1})
Counter({'t': 51, 'g': 2, '-': 1})
Counter({'g': 47, 't': 6, '-': 1})
Counter({'-': 47, 'g': 4, 't': 3})
Counter({'t': 53, '-': 1})
Counter({'t': 51, '-': 3})
Counter({'t': 50, '-': 3, 'c': 1})
Counter({'c': 52, 't': 1, '-': 1})
Counter({'t': 52, 'a': 1, '-': 1})
Counter({'a': 52, 'c': 1, '-': 1})
Counter({'c': 52, 't': 1, '-'

In [19]:
#all starting UTRs

# Every starting UTR in the lookup table, in the table's insertion order.
uni_start_utr_sequences = list(start_utr_dict.values())

# get_consensus is defined in the starting-UTR cell earlier in the notebook;
# it is reused here instead of being redefined with the same body.
uni_start_consensus_utr = get_consensus(uni_start_utr_sequences)

print("Consensus UTR across tree:", uni_start_consensus_utr)


Counter({'a': 134})
Counter({'g': 134})
Counter({'c': 134})
Counter({'a': 121, 'n': 7, 'g': 4, 'r': 2})
Counter({'a': 134})
Counter({'a': 134})
Counter({'a': 134})
Counter({'g': 134})
Counter({'c': 134})
Counter({'a': 134})
Counter({'g': 134})
Counter({'g': 134})
Counter({'a': 134})
Counter({'g': 134})
Counter({'t': 132, 'g': 2})
Counter({'a': 130, 'g': 4})
Counter({'a': 134})
Counter({'a': 132, 'g': 2})
Counter({'a': 132, 'g': 2})
Consensus UTR across tree: agcaaaagcaggagtaaaa


In [20]:
#all ending UTRs

# Every ending UTR in the lookup table, in the table's insertion order.
uni_end_utr_sequences = list(end_utr_dict.values())

# get_consensus is defined in the starting-UTR cell earlier in the notebook;
# it is reused here instead of being redefined with the same body.
uni_end_consensus_utr = get_consensus(uni_end_utr_sequences)

print("Consensus ending UTR across tree:", uni_end_consensus_utr)


Counter({'g': 209})
Counter({'c': 209})
Counter({'t': 209})
Counter({'t': 208, 'g': 1})
Counter({'t': 196, 'c': 12, '-': 1})
Counter({'c': 196, 'g': 12, '-': 1})
Counter({'g': 195, 'c': 12, '-': 1, 't': 1})
Counter({'c': 196, 'a': 12, '-': 1})
Counter({'a': 208, '-': 1})
Counter({'a': 207, 't': 1, '-': 1})
Counter({'t': 208, '-': 1})
Counter({'t': 208, '-': 1})
Counter({'t': 207, 'c': 1, '-': 1})
Counter({'t': 207, '-': 2})
Counter({'a': 207, '-': 2})
Counter({'g': 206, '-': 2, 'a': 1})
Counter({'a': 142, '-': 67})
Counter({'a': 142, '-': 67})
Counter({'a': 142, '-': 67})
Counter({'a': 142, '-': 67})
Counter({'a': 142, '-': 67})
Counter({'a': 142, '-': 67})
Counter({'a': 132, '-': 68, 'c': 9})
Counter({'c': 113, '-': 68, 'a': 19, 't': 9})
Counter({'t': 113, '-': 68, 'c': 28})
Counter({'c': 116, '-': 74, 't': 19})
Counter({'c': 126, '-': 74, 't': 9})
Counter({'t': 116, '-': 74, 'c': 19})
Counter({'t': 126, '-': 74, 'g': 9})
Counter({'g': 107, '-': 74, 't': 28})
Counter({'-': 158, 't': 3