In [2]:
%load_ext autoreload
%autoreload 2

import seaborn as sns
from Bio import AlignIO
import beak.alignments.utils



First, let's load an alignment

In [5]:
filepath = '/Users/micaholivas/Downloads/sp_Q9KJX5_ALPH_ELIME_Alkaline_phosphatase_PafA_OS_Elizabethkingia_meningoseptica_OX_238_GN_pafA_PE_1_SV_1.a3m'
aln = AlignIO.read(filepath, "fasta")

ValueError: Sequences must all be the same length

Now we'll "ungap" the alignment by removing significantly gapped positions

In [None]:
aln = beak.alignments.utils.ungap_aln(aln)
aln

Great! Let's see what the consensus sequence is from this alignment

In [None]:
consensus = beak.alignments.utils.get_consensus(aln)
print(consensus)

Let's make a position-specific scoring matrix

In [None]:
pssm = beak.alignments.utils.alignment_to_pssm(aln, freq=True)
pssm

Now, for a query sequence, get the frequency of each residue at each aligned position

In [None]:
my_seq = "MSTAQSLKSVDYEVFGRVQGVCFRMYTEDEARKIGVVGWVKNTSKGTVTGQVQGPEDKVNSMKSWLSKVGSPSSRIDRTNFSNEKTISKLEYSNFSIRY"
arr = beak.alignments.utils.single_sequence_aln_frequencies(my_seq, pssm, check_positions=True)
print(arr)

Get the conservation of each position

In [None]:
import matplotlib.pyplot as plt

conservation = beak.alignments.utils.conservation_from_aln(aln)
conservation = beak.alignments.utils.single_sequence_aln_frequencies(my_seq, pssm, check_positions=False)

In [None]:
fig, axs = plt.subplots(figsize=(8,0.5), dpi=200)
# plt.imshow(conservation.reshape(86,1).T, aspect='auto')
plt.imshow(conservation.reshape(99,1).T, aspect='auto')
plt.colorbar(shrink=1, aspect=2)
plt.yticks([])
plt.xlabel('Alignment Position')
axs.xaxis.tick_top()
axs.xaxis.set_label_position('top')
plt.tick_params(axis='x', which='major', pad=-0.5)
plt.show()

Use PSSM to get composition of sequence at position i

In [None]:
beak.alignments.utils.aln_to_dict(aln)

In [None]:
pssm.iloc[37]

In [None]:
import pandas as pd

new_aln_file = '/Users/micaholivas/Downloads/quick_acyps_aligned.fasta'
new_aln = AlignIO.read(new_aln_file, "fasta")

tsv_file = '/Users/micaholivas/Downloads/uniprotkb_acylphosphatase_AND_reviewed_2025_05_02.tsv'
df = pd.read_csv(tsv_file, sep='\t')
df.iloc[1]['Taxonomic lineage']

In [None]:
length_cutoff = 100

# Drop sequences longer than 110 AA from df_expanded
df = df[df['Length'] <= length_cutoff]

# Drop sequences longer than 110 AA (not counting gaps) from new_aln
from Bio.SeqRecord import SeqRecord

filtered_records = [
    record for record in new_aln
    if len(str(record.seq).replace("-", "")) <= length_cutoff
]
from Bio.Align import MultipleSeqAlignment
new_aln = MultipleSeqAlignment(filtered_records)

In [None]:

import re

def parse_taxonomic_lineage(lineage_str):
    # Split by comma, then extract name and rank using regex
    items = [item.strip() for item in lineage_str.split(',')]
    parsed = {}
    for item in items:
        match = re.match(r"(.+?) \((.+?)\)$", item)
        if match:
            name, rank = match.groups()
            parsed[rank] = name
        else:
            # If no rank, use as is
            parsed['no rank'] = item
    return parsed

# Apply to the column and create a DataFrame
tax_df = df['Taxonomic lineage'].apply(parse_taxonomic_lineage).apply(pd.Series)

# Concatenate with original DataFrame if needed
df_expanded = pd.concat([df, tax_df], axis=1)

df_expanded = df_expanded.drop(columns=['Organism', 'Taxonomic lineage'])

df_expanded.sample(5)

Ungap the alignment

In [None]:
new_aln = beak.alignments.utils.ungap_aln(new_aln)
for record in new_aln:
    print(record.seq)

Now, merge the aligned sequences into the df

In [None]:
from Bio import SeqIO

def extract_entry_id(header):
    parts = header.split('|')
    if len(parts) >= 3:
        return parts[1]
    else:
        return header  # fallback if not in expected format

# Step 1: Extract IDs and sequences from new_aln
aln_records = [(extract_entry_id(record.id), str(record.seq)) for record in new_aln]
aln_df = pd.DataFrame(aln_records, columns=['Entry', 'Aligned_sequence'])

# Step 2: Merge with expanded TSV DataFrame
merged_df = df_expanded.merge(aln_df, on='Entry', how='left')

merged_df.sample(5)
# ...existing code...

Now that we've merged our taxonomic information into the df, compute a PSSM for each

In [None]:
pssms_by_tax_rank = beak.alignments.utils.pssms_by_taxon(merged_df, 'kingdom')

In [None]:
import holoviews as hv
import panel as pn
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap

hv.extension('bokeh')

def interactive_pssm_heatmap(
    pssms_by_tax_rank,
    consensus,
    rank='kingdom',
    phosphate_loop={9,11,12,13,14},
    catalytic={17,35}
):
    """
    Interactive PSSM heatmap explorer by taxonomic rank.

    Args:
        pssms_by_tax_rank: dict of {taxon: PSSM DataFrame}
        consensus: consensus sequence string
        rank: taxonomic rank label for y-axis
        phosphate_loop: set of positions to highlight as phosphate loop
        catalytic: set of positions to highlight as catalytic residues

    Returns:
        pn.Column Panel layout for interactive exploration
    """
    # Prepare amino acid set
    aas = set()
    for pssm in pssms_by_tax_rank.values():
        aas.update(pssm.columns)
    aas = sorted(aas)

    # Custom colormap: 0 is white, 1 is blue
    cmap = LinearSegmentedColormap.from_list("white_blue", ["white", "blue"])

    def highlight_consensus(consensus, pos):
        html = ""
        for i, aa in enumerate(consensus):
            style = "font-size:18px"
            if i == pos:
                style += ";background-color:yellow; color:black; font-weight:bold"
            if i in phosphate_loop:
                style += ";color:green; font-weight:bold"
            if i in catalytic:
                style += ";color:red; font-weight:bold"
            html += f"<span style='{style}'>{aa}</span>"
        return f"<div style='font-family:monospace; word-break:break-all'>{html}</div>"

    def plot_heatmap(position=0):
        # Build DataFrame for the selected position
        heatmap_data = []
        index = []
        for sk, pssm in pssms_by_tax_rank.items():
            row = []
            for aa in aas:
                row.append(pssm.iloc[position][aa] if aa in pssm.columns else 0)
            heatmap_data.append(row)
            index.append(sk)
        heatmap_df = pd.DataFrame(heatmap_data, columns=aas, index=index)
        tidy = heatmap_df.reset_index().melt(id_vars='index', var_name='AA', value_name='Frequency')
        heatmap = hv.HeatMap(tidy, kdims=['AA', 'index'], vdims='Frequency').opts(
            cmap=cmap,
            colorbar=True,
            clim=(0, 1),
            xrotation=0,
            yrotation=0,
            xlabel='Amino Acid',
            ylabel=rank.capitalize(),
            colorbar_opts={'title': 'Frequency'},
            tools=['hover'],
            width=800,
            height=300,
            line_color='black',
            show_grid=True,
            toolbar='above',
            labelled=['x', 'y', 'colorbar'],
            xaxis='top',
            fontsize={'xticks': 14, 'yticks': 14, 'ylabel': 14, 'xlabel': 14, 'title': 16}
        )
        return heatmap.opts(title=f"Aligned Position {position+1}")

    slider = pn.widgets.IntSlider(name='Aligned Position', start=0, end=len(consensus)-1, value=0)

    @pn.depends(slider)
    def consensus_view(position):
        return pn.pane.HTML(highlight_consensus(consensus, position), width=800)

    dmap = hv.DynamicMap(pn.bind(plot_heatmap, position=slider))

    return pn.Column(
        slider,
        dmap,
        pn.pane.Markdown("## Consensus sequence (highlighted position):"),
        consensus_view
    )

# Example usage:
panel = interactive_pssm_heatmap(pssms_by_tax_rank, consensus, rank='kingdom')
panel.servable()

In [None]:
merged_df['superkingdom'].unique()