# Example of extracting, annotating, and clustering P4-like defense hotspots from _E. coli_ genomes

__Reference__:

Rousset, François, Florence Depardieu, Solange Miele, Julien Dowding, Anne-Laure Laval, Erica Lieberman, Daniel Garry, Eduardo P. C. Rocha, Aude Bernheim, and David Bikard. “Phages and Their Satellites Encode Hotspots of Antiviral Systems.” Cell Host & Microbe 30, no. 5 (May 11, 2022): 740-753.e5. [https://doi.org/10.1016/j.chom.2022.02.018](https://doi.org/10.1016/j.chom.2022.02.018).

## Workflow overview

![Overview of P4 defense hotspot workflow](../media/P4_workflow_web.png)


In [3]:
import pandas as pd
import subprocess
import seaborn as sns
import os
os.environ['QT_QPA_PLATFORM']='offscreen'
import ete3
from ete3 import Tree, TreeStyle, NodeStyle, faces, SeqMotifFace, TextFace, AttrFace
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp

In [2]:
%%bash
# Download the Pfam-A HMM database (release 36.0) into ../resources,
# unless an unpacked copy is already there.

# Stop at the first failing command, so that a failed download is not
# followed by gunzip and the cell reports the error.
set -euo pipefail

mkdir -p ../resources

if [ ! -f ../resources/Pfam-A.hmm ]; then
    # NOTE(review): --no-check-certificate turns off TLS certificate verification
    # for this download; drop the flag if the EBI certificate validates in your environment.
    wget -O ../resources/Pfam-A.hmm.gz https://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam36.0/Pfam-A.hmm.gz --no-check-certificate
    gunzip ../resources/Pfam-A.hmm.gz
fi



In [3]:
# Copy the single Pfam profile whose name is exactly "Psu" out of Pfam-A.hmm into its own file (Psu.hmm).
# Psu.hmm is the reference that the domain_search.py step searches the genomes with.
# Alternatively, the profile could be downloaded directly from the InterPro website.
! hmmer_select.py -i ../resources/Pfam-A.hmm -o Psu.hmm --field name --exact Psu

Found 1 matching profiles.


In [7]:
%%bash 
# Download genomes
#
# Fetches the E. coli genome set (if not already present) and exposes it as the
# symlink target_genomes.gb, which is the input name the search step reads.


# 543 is the taxid for Enterobacteriaceae
# 562 is the taxid for Escherichia coli

# It is unclear which database/taxid combination comes closest to what was used in Rousset et al. 2022. They report 20,125 E. coli genomes as of August 2020.
# As of November 2023, every combination tried gives either far too many genomes or too few.
if [ ! -f ../resources/e_coli_non_redundant_genomes.gb ]; then
    # WARNING: this file will be > 30 GB. To test on a smaller file, use the --num_recs option, for example: --num_recs 500 will download the first 500 genomes.
    # NOTE(review): the download is skipped whenever the output file exists, so a file left by an interrupted download may have to be deleted by hand — confirm how domainator_db_download.py handles partial output.
    domainator_db_download.py --db ncbi_nonredundant_proks --include_taxids 562 -o ../resources/e_coli_non_redundant_genomes.gb # ~2,718 genomes # used for Domainator paper
fi

# Remove any previous link first so that re-running the cell does not fail on an existing target_genomes.gb.
rm -f target_genomes.gb
ln -s ../resources/e_coli_non_redundant_genomes.gb target_genomes.gb

# Alternative genome sets that were tried (kept for reference, with the genome counts observed):
#!domainator_db_download.py --db ncbi_complete_genome_proks --include_taxids 543 -o enterobacteria_complete_genomes.gb
#!domainator_db_download.py --db ncbi_representative_proks --include_taxids 543 -o enterobacteria_representative_genomes.gb
#!domainator_db_download.py --db ncbi_all --include_taxids 562 -o e_coli_all_genomes.gb # this is more than 200,000 genomes.
#!domainator_db_download.py --db ncbi_complete_genome_proks --include_taxids 562 -o e_coli_complete_genomes.gb # 4,101 genomes
#!domainator_db_download.py --db ncbi_representative_proks --include_taxids 562 -o e_coli_representative_genomes.gb # 2 genomes



In [9]:
# Find Psu domains in the genomes and extract 20 CDSs downstream of each Psu hit.
# Input: target_genomes.gb, searched with the Psu.hmm profile. Output: Psu_hits.gb.
# NOTE(review): -e 1e-3 is presumably the E-value cutoff and --cpu 0 presumably means "use all available cores" — confirm against domain_search.py --help.
! domain_search.py -i target_genomes.gb -r Psu.hmm -o Psu_hits.gb --no_annotations --cds_down 20 --cpu 0 --deduplicate --max_region_overlap 0.1 -e 1e-3




In [10]:
# Add Pfam annotations to the Psu regions: Psu_hits.gb is annotated against the full Pfam-A.hmm and written to Psu_hits.pfam.gb.
# The following step selects on the Phage_integrase domain name in this annotated file.
! domainate.py -i Psu_hits.gb -r ../resources/Pfam-A.hmm -o Psu_hits.pfam.gb --cpu 0 --max_overlap 0.6 -e 1e-3



In [11]:
# Extract regions downstream of Phage_integrase (output: P4_like.gb).
# We use --strand r to only extract contigs where the Phage_integrase is on the reverse strand (opposite strand of the Psu hit).
# We use --keep_direction to keep the Phage_integrase on the forward strand.
! select_by_cds.py -i Psu_hits.pfam.gb -o P4_like.gb --domains Phage_integrase --cds_down 20 --max_region_overlap 0.1 --keep_direction --strand r 

In [12]:
# Trim the Psu and Phage_integrase domains from the ends of the regions, so we're only looking at the putative defense genes.
# Output: P4_like.trimmed.gb, the input for peptide extraction and orthogroup annotation.
! trim_contigs.py -i P4_like.gb --domains Psu Phage_integrase -o P4_like.trimmed.gb



In [13]:
# Extract all peptides from the trimmed regions and write them as FASTA (prots.fasta).
# prots.fasta is the input that cd-hit clusters into orthogroups.
! extract_peptides.py -i P4_like.trimmed.gb -o prots.fasta --fasta_out

In [14]:
# Cluster the peptides at 40% identity (-c 0.40), to define orthogroups.
# The representative sequences written to prots.cdhit40.fasta serve as the reference for the orthogroup annotation step.
# -n 2 is the word size the CD-HIT user guide prescribes for identity thresholds between 0.4 and 0.5.
# -T 0 uses every CPU (the run log reports 20 of 20 in use).
# NOTE(review): -M 0 presumably lifts the memory limit and -d 0 presumably controls sequence-name length in the .clstr file — confirm against the CD-HIT user guide.
! cd-hit -n 2 -c 0.40 -M 0 -T 0 -d 0 -i prots.fasta -o prots.cdhit40.fasta # -s 0.7 

Program: CD-HIT, V4.8.1 (+OpenMP), May 15 2023, 22:49:31
Command: cd-hit -n 2 -c 0.40 -M 0 -T 0 -d 0 -i prots.fasta -o
         prots.cdhit40.fasta

Started: Tue Feb 13 10:49:19 2024
                            Output                              
----------------------------------------------------------------
total number of CPUs in the system is 20
Actual number of CPUs to be used: 20

total seq: 2960
longest and shortest : 2643 and 11
Total letters: 715520
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 1M
Buffer          : 20 X 16M = 332M
Table           : 2 X 0M = 0M
Miscellaneous   : 0M
Total           : 333M

Table limit with the given memory limit:
Max number of representatives: 547213
Max number of word counting entries: 29456713

# comparing sequences from          0  to        134
---------- new table with       36 representatives
# comparing sequences from        134  to        262
99.3%---------- new table with        6 representativ

In [15]:
# Reannotate the trimmed regions with the orthogroup annotations, using the cd-hit representatives (prots.cdhit40.fasta) as the reference.
# Output: P4_like.og.gb; later steps select these annotations with --databases prots.cdhit40.
! domainate.py -i P4_like.trimmed.gb -r prots.cdhit40.fasta -o P4_like.og.gb --cpu 0 --max_overlap 0.6 -e 1e-3

In [16]:
# Create tabular reports of the annotations in P4_like.og.gb:
#   annotations.tsv            - per-contig summary restricted to the Pfam-A database, with the column headers given by --column_names
#   orthogroup_annotations.tsv - per-contig orthogroup architecture (prots.cdhit40 database)
# Both tables are read back with pandas further down, so the column names set here are relied upon there.
! enum_report.py -i P4_like.og.gb -o annotations.tsv --cds_count --length --architecture_detailed --domains --domain_descriptions --qualifier CDS protein_id --databases Pfam-A --column_names "Contig" "CDSs" "Length (bp)" "Domain architecture" "Unique Pfam domains" "Pfam domain descriptions" "Protein IDs"
! enum_report.py -i P4_like.og.gb -o orthogroup_annotations.tsv --architecture_detailed --databases "prots.cdhit40" --column_names "Contig" "Orthogroup architecture"

In [17]:
# Find Jaccard similarity between contigs based on orthogroup (prots.cdhit40) contents.
# The similarity matrix is written in sparse form to P4_like_ji.hdf5.
# NOTE(review): --ai 0.0 --ji 1.0 presumably weight the score as 0% adjacency index and 100% Jaccard index — confirm against compare_contigs.py --help.
! compare_contigs.py -i P4_like.og.gb --ai 0.0 --ji 1.0 --sparse P4_like_ji.hdf5 --databases prots.cdhit40


In [18]:
# Display a summary report of the Jaccard similarity matrix.
# The report is written to /dev/stdout so that it appears in the cell output instead of a file.
! matrix_report.py -i P4_like_ji.hdf5 -o /dev/stdout

Matrix Report
Total values: 828100
Non-zero values: 114592
Mean: 0.6458612911375757
Median: 0.6666666666666666
Min: 0.07142857142857142
Max: 1.0
        

      ------------------------------------------------------------------------------------------------------
      |                                           Matrix Values                                            |
      ------------------------------------------------------------------------------------------------------

 37558|                                                  o
 35581|                                                  o
 33604|                                                  o
 31628|                                                  o
 29651|                                                  o
 27674|                                                  o
 25698|                                                  o
 23721|                                                  o
 21744|                                  o   

In [19]:
# Create a sequence similarity network as a Cytoscape xgmml file, including the annotations from annotations.tsv.
# The network is built from the Jaccard similarity matrix (P4_like_ji.hdf5); the cluster assignments are also written to clusters.tsv, which later cells load with pandas.
# NOTE(review): --lb 0.7 is presumably the lower bound on similarity for keeping an edge — confirm against build_ssn.py --help.
# After the SSN is built, it can be visualized with Cytoscape. Use Layout -> Prefuse Force Directed Layout to visualize the network.
! build_ssn.py -i P4_like_ji.hdf5 --xgmml P4_like.xgmml  --lb 0.7 --metadata annotations.tsv --cluster --color_by "Domain architecture" --cluster_tsv clusters.tsv

  @numba.jit(
  @numba.jit(


In [20]:
# The rest of this goes beyond what is depicted in the flow diagram at the top.

# Color the GenBank file by orthogroup.
# Output: P4_like.og.colored.gb, the file that is split into per-cluster files below.
! color_genbank.py -i P4_like.og.gb -o P4_like.og.colored.gb --color_both

In [6]:
# Load the SSN cluster assignments (clusters.tsv) that are used to separate the contigs into separate files by SSN cluster.
# The two columns are named explicitly here, so every line of the file is read as data.
clusters_table = pd.read_csv("clusters.tsv", sep="\t", names=["Name", "cluster"])

In [None]:
# Write one contig-name list and one GenBank file per SSN cluster.
# Only the clusters numbered 1 through 12 are exported.
for idx in range(1, 13):
    # names of the contigs assigned to this cluster: one per line, no header, no index
    cluster_members = clusters_table.loc[clusters_table["cluster"] == idx, "Name"]
    cluster_members.to_csv(f"cluster_{idx}.tsv", sep="\t", index=False, header=False)
    # check=True: raise if the selection step fails, rather than silently leaving a
    # missing or incomplete per-cluster file for the cells that read it later
    subprocess.run(
        ["select_by_contig.py", "-i", "P4_like.og.colored.gb", "-o", f"P4_like.og.cluster_{idx}.gb", "--contigs_file", f"cluster_{idx}.tsv"],
        check=True,
    )

In [13]:
# Report the Psu hit information (score and identity) for the P4-like regions:
#   1. extract the search-hit peptide sequences from P4_like.gb as FASTA,
#   2. compare them against the reference in ../resources/WP_000446153.fasta (at most one domain per sequence, permissive -e 10),
#   3. tabulate score and identity into P4_like.WP_000446153.tsv.
# NOTE(review): ../resources/WP_000446153.fasta is not created anywhere in the visible notebook; it has to be provided beforehand.
! extract_peptides.py -i P4_like.gb --search_hits --fasta_out -o P4_like_Psu_seqs.fasta --keep_name
! domainate.py -i P4_like_Psu_seqs.fasta -r ../resources/WP_000446153.fasta  -o P4_like_Psu_seqs.WP_000446153.gb --cpu 0 --max_domains 1 -e 10

! enum_report.py -i P4_like_Psu_seqs.WP_000446153.gb -o P4_like.WP_000446153.tsv --score --identity

In [14]:
# Supplementary table
# One row per P4-like region: Pfam annotations, SSN cluster, Psu hit score/identity
# and orthogroup architecture. Written to pfam_annotations_table.tsv, with a
# per-cluster summary in cluster_counts.tsv.


def _without_last_field(names):
    """Drop the last "_"-separated field from every name in a string Series.

    A name that contains no "_" becomes the empty string.
    """
    return names.str.split("_").str[:-1].str.join("_")


Psu_scores = pd.read_table("P4_like.WP_000446153.tsv")

# join the Pfam report with the clusters table, keeping the report columns plus the cluster id
#"Contig" "CDSs" "Length (bp)" "Domain architecture" "Unique Pfam domains" "Pfam domain descriptions" "Protein IDs"
report_columns = ["Contig", "CDSs", "Length (bp)", "Domain architecture", "Unique Pfam domains", "Pfam domain descriptions", "Protein IDs"]
pfam_annotations = (
    pd.read_csv("annotations.tsv", sep="\t")
    .merge(clusters_table, left_on="Contig", right_on="Name", how="left")[report_columns + ["cluster"]]
    .rename(columns={"cluster": "Cluster"})
)

# for contig column delete everything after the last "_", then attach the Psu hit scores by that name
pfam_annotations = (
    pfam_annotations
    .assign(Contig=_without_last_field(pfam_annotations["Contig"]))
    .merge(Psu_scores[["contig", "score", "identity"]], left_on="Contig", right_on="contig", how="left")
    .rename(columns={"score": "Psu score", "identity": "Psu identity"})
)

# the orthogroup report gets the same name trimming so that it joins on "Contig"
orthogroup_annotations = pd.read_csv("orthogroup_annotations.tsv", sep="\t")
orthogroup_annotations = orthogroup_annotations.assign(
    Contig=_without_last_field(orthogroup_annotations["Contig"])
)[["Contig", "Orthogroup architecture"]]
pfam_annotations = pfam_annotations.merge(orthogroup_annotations, on="Contig", how="left")

# for contig column delete everything after the first "_"
# NOTE(review): a name whose accession itself contains "_" is cut at that underscore too — confirm the contig naming scheme.
pfam_annotations = pfam_annotations.assign(Contig=pfam_annotations["Contig"].str.split("_").str[0])

pfam_annotations.to_csv("pfam_annotations_table.tsv", sep="\t", index=False)

# count the number of clusters, and average the Psu scores for each cluster
cluster_summary = (
    pfam_annotations[["Cluster", "Psu score", "Psu identity"]]
    .groupby("Cluster")
    .agg(["count", "mean"])
    .reset_index()
)
cluster_summary.to_csv("cluster_counts.tsv", sep="\t", index=False)


In [24]:
# get an example contig from each cluster
! rm example_contigs.gb
with open('example_contigs.gb', 'a') as outfile_file:
    for i in range(0, 12):
        idx = i + 1
        subprocess.run(["select_by_contig.py", "-i", f"P4_like.og.cluster_{idx}.gb", "--first", "1"], stdout=outfile_file)

rm: cannot remove 'example_contigs.gb': No such file or directory


In [25]:
# orthogroup architecture tree

# Reduce the orthogroup report to one row for each orthogroup architecture.
# GroupBy.first() keeps, per group, the first non-null value of every other column.
orthogroup_annotations = (
    pd.read_csv("orthogroup_annotations.tsv", sep="\t")
    .groupby("Orthogroup architecture")
    .first()
    .reset_index()
)

In [26]:
# Make a tree of representatives of each contig architecture:
#   1. write the representative contig names (the "Contig" column) to a file, one per line,
#   2. select those contigs from P4_like.og.gb,
#   3. compute their pairwise Jaccard similarity on orthogroup (prots.cdhit40) content,
#   4. transform the scores into a dense distance matrix (--mode "score_dist"),
#   5. build a Newick tree from the distance matrix.
orthogroup_annotations["Contig"].to_csv("orthogroup_representative_contigs.tsv", sep="\t", index=False, header=False)
subprocess.run(["select_by_contig.py", "-i", "P4_like.og.gb", "-o", "P4_like.og.representatives.gb", "--contigs_file", "orthogroup_representative_contigs.tsv"])
! compare_contigs.py -i P4_like.og.representatives.gb --ai 0.0 --ji 1.0 --sparse P4_like_og_ji.hdf5 --databases prots.cdhit40
! transform_matrix.py -i P4_like_og_ji.hdf5 --mode "score_dist" --dense P4_like_og_ji_dist.hdf5 --dense_text P4_like_og_ji_dist.tsv
! build_tree.py -i P4_like_og_ji_dist.hdf5 --newick P4_like_og_ji_dist.newick --quiet

  @numba.jit(
  @numba.jit(


In [27]:
# Collect metadata for the tree: the representative contigs joined with their SSN cluster.
# ":" is replaced with "_" in "Contig", and "Contig" becomes the index so that rows can be
# looked up by leaf name.
orthogroup_representative_tree_annotations = pd.merge(
    orthogroup_annotations,
    clusters_table,
    left_on="Contig",
    right_on="Name",
    how="left",
)
orthogroup_representative_tree_annotations = orthogroup_representative_tree_annotations.assign(
    Contig=orthogroup_representative_tree_annotations["Contig"].str.replace(":", "_")
).set_index("Contig", drop=True)
orthogroup_representative_tree_annotations.to_csv("orthogroup_representative_tree_annotations.tsv", sep="\t", index=True)

In [28]:
# render the tree

LINE_THICKNESS = 5  # stroke width applied to every branch line of the tree


def mylayout(node, metadata):
    """Layout callback that styles one tree node.

    Every node gets its horizontal and vertical branch lines set to
    LINE_THICKNESS. A leaf is additionally relabelled: surrounding single
    quotes are stripped from its name, the matching row of ``metadata`` is
    looked up by that name, the name is replaced by the row's
    "Orthogroup architecture", and two faces are attached (the new name,
    and the row's "cluster" value).

    Args:
        node: tree node being drawn; must provide ``img_style``, ``is_leaf()`` and ``name``.
        metadata: table indexed by leaf name, with "Orthogroup architecture" and "cluster" columns.
    """
    for stroke_key in ("hz_line_width", "vt_line_width"):
        node.img_style[stroke_key] = LINE_THICKNESS

    if not node.is_leaf():
        return

    # leaf names may arrive wrapped in single quotes; strip them before the lookup
    node.name = node.name.strip("'")
    leaf_info = metadata.loc[node.name]
    node.name = leaf_info["Orthogroup architecture"]

    architecture_face = AttrFace("name", fsize=30, fgcolor="black")
    faces.add_face_to_node(architecture_face, node, 0, position="aligned")

    cluster_face = TextFace(" " + str(leaf_info["cluster"]) + "    ", fsize=30)
    faces.add_face_to_node(cluster_face, node, column=1, aligned=False)


# Load the tree of representative contigs and draw it to clusters_tree.jpg.
t = ete3.Tree("P4_like_og_ji_dist.newick")

ts = TreeStyle()
ts.show_leaf_name = False  # leaf labels are added by mylayout instead
ts.layout_fn = lambda x: mylayout(x, orthogroup_representative_tree_annotations)
ts.mode = "c"  # circular layout
#ts.force_topology = True
#ts.rotation = -95
# ts.branch_vertical_margin = 0

# one shared style with zero-size node markers, applied to every node
ns = NodeStyle()
ns["size"] = 0

for n in t.traverse():
    n.set_style(ns)

#t.render("%%inline", tree_style=ts)
# The trailing semicolon suppresses the cell output. A ";" on a line of its own
# does not: it made the cell display '' and is a syntax error outside IPython.
t.render("clusters_tree.jpg", tree_style=ts);

''

In [2]:
# Interactive contig plot.
# NOTE(review): this plots a file from the repository's test data, not an output of this workflow,
# and the import sits outside the imports cell at the top of the notebook — looks like a leftover;
# confirm whether it should plot a workflow output (e.g. example_contigs.gb) or be removed.
from domainator.plot_contigs import show_contigs

show_contigs("../../test/data/pDONR201_multi_genemark.gb")



    <html>
    <head>
        <title>Domainator Contigs Plot</title>
        <script src="https://d3js.org/d3.v7.min.js"></script>
        <style>
            .tooltip {
                position: absolute;
                text-align: left;
                padding: 8px;
                font: 12px sans-serif;
                background: lightblue;
                border: 0px;
                border-radius: 8px;
                pointer-events: none;
                opacity: 0;
            }
            .contig-label {
                font: 32px sans-serif;
            }
            .feature-label {
                font: 26px sans-serif;
            }
        </style>
    </head>
    <body>
        <div id="details" style="padding: 5px; border: 1px solid #ccc; margin-top: 5; width: 1200; height: 75; overflow: scroll;"></div>
        <div id="tooltip" class="tooltip"></div>
        <div>
        <svg width="1200" height="600" style="padding: 5px; border: 1px solid #ccc; margin-top: 20px; o

In [3]:
# NOTE(review): debugging leftover — prints the working directory, which puts an absolute local path into the saved output; consider removing this cell.
!pwd



/home/sean/scripts/python/domainator/example_notebooks/Rousset_P4_defense_hotspots
