# Annotating and clustering CARF-domain containing proteins

__reference__:

Makarova, Kira S, Albertas Timinskas, Yuri I Wolf, Ayal B Gussow, Virginijus Siksnys, Česlovas Venclovas, and Eugene V Koonin. “Evolutionary and Functional Classification of the CARF Domain Superfamily, Key Sensors in Prokaryotic Antivirus Defense.” Nucleic Acids Research 48, no. 16 (September 18, 2020): 8828–47. [https://doi.org/10.1093/nar/gkaa635](https://doi.org/10.1093/nar/gkaa635).

## Workflow overview

![Overview of CARF clustering](../media/CARF_workflow_web.png)

In [1]:
import subprocess
import pandas as pd
from domainator.Bio.SeqIO.FastaIO import SimpleFastaParser
from glob import glob
from pathlib import Path

In [5]:
%%bash
# Fetch the external resources used by the rest of this notebook.
# Each download is skipped when its final product already exists, so the cell
# can be re-run safely.

# Stop at the first failing command so that a broken download is reported by
# this cell instead of surfacing later as a missing or truncated file.
set -euo pipefail

# NOTE(review): --no-check-certificate disables TLS certificate verification
# for every download below; confirm it is still required.

# Download Pfam-A.hmm (Pfam release 36.0)

mkdir -p ../resources

if [ ! -f ../resources/Pfam-A.hmm ]; then
    wget -O ../resources/Pfam-A.hmm.gz https://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam36.0/Pfam-A.hmm.gz --no-check-certificate
    gunzip ../resources/Pfam-A.hmm.gz
fi

# Download supplementary data for https://doi.org/10.1093/nar/gkaa635.
# The archive is unpacked into ../resources; the CARF_SAVED directory is used
# as the marker that this has already been done.

if [ ! -d ../resources/CARF_SAVED ]; then
    wget https://ftp.ncbi.nlm.nih.gov/pub/makarova/Supplement/CARF/Supplementary_Data_File_1.tgz -O ../resources/Supplementary_Data_File_1.tgz --no-check-certificate
    tar x -C ../resources -f ../resources/Supplementary_Data_File_1.tgz
fi

# Download CARF.fasta and store it as CARF_full.fasta
if [ ! -f ../resources/CARF_full.fasta ]; then
    wget https://ftp.ncbi.nlm.nih.gov/pub/makarova/Supplement/CARF/CARF.fasta.gz -O ../resources/CARF.fasta.gz --no-check-certificate
    gunzip ../resources/CARF.fasta.gz
    mv ../resources/CARF.fasta ../resources/CARF_full.fasta
fi

# Makarova et al. 2020 Table S1 does not need to be downloaded: it is already
# in the GitHub repository as ../resources/CARF_metadata.tsv



--2024-02-13 11:02:21--  https://ftp.ncbi.nlm.nih.gov/pub/makarova/Supplement/CARF/Supplementary_Data_File_1.tgz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 130.14.250.7, 2607:f220:41e:250::10, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1096439 (1.0M) [application/x-gzip]
Saving to: ‘../resources/Supplementary_Data_File_1.tgz’

     0K .......... .......... .......... .......... ..........  4% 1.55M 1s
    50K .......... .......... .......... .......... ..........  9% 3.21M 0s
   100K .......... .......... .......... .......... .......... 14%  200M 0s
   150K .......... .......... .......... .......... .......... 18% 3.32M 0s
   200K .......... .......... .......... .......... .......... 23%  195M 0s
   250K .......... .......... .......... .......... .......... 28%  107M 0s
   300K .......... .......... .......... .......... .......... 32%  133M 0s
   

In [9]:
# remove consensus sequences from CARF
# consensus is the first sequence in each file

!mkdir -p CARF_SAVED_no_consensus
# For every CARF/SAVED alignment: write a copy without its first record (the
# consensus sequence), prefixing each remaining title with the alignment name,
# then build one HMM from that copy.
for fasta_path in map(Path, glob("../resources/CARF_SAVED/*.FASTA")):
    profile_name = fasta_path.stem
    trimmed_fasta = f"CARF_SAVED_no_consensus/{profile_name}.fasta"
    with open(fasta_path, "r") as handle, open(trimmed_fasta, "w") as out:
        records = iter(SimpleFastaParser(handle))
        next(records, None)  # skip the consensus, if the file has any record at all
        for title, seq in records:
            out.write(f">{profile_name}_{title}\n{seq}\n")
    # check=True: a failed build must stop the notebook here; otherwise the
    # profile would silently be missing from the concatenated HMM database.
    subprocess.run(
        [
            "hmmer_build.py",
            "-i", trimmed_fasta,
            "-o", f"CARF_SAVED_no_consensus/{profile_name}.hmm",
            "--name", profile_name,
            "--acc", profile_name,
            "--desc", profile_name,
        ],
        check=True,
    )

In [10]:
# remove consensus sequences from the effector-domain alignments
# consensus is the first sequence in each file
!mkdir -p effector_no_consensus
# For every effector-domain alignment: write a copy without its first record
# (the consensus sequence), keeping the remaining titles unchanged, then build
# one HMM from that copy.
for fasta_path in map(Path, glob("../resources/effector_domain/*.FASTA")):
    profile_name = fasta_path.stem
    trimmed_fasta = f"effector_no_consensus/{profile_name}.fasta"
    with open(fasta_path, "r") as handle, open(trimmed_fasta, "w") as out:
        records = iter(SimpleFastaParser(handle))
        next(records, None)  # skip the consensus, if the file has any record at all
        for title, seq in records:
            out.write(f">{title}\n{seq}\n")
    # check=True: a failed build must stop the notebook here; otherwise the
    # profile would silently be missing from the concatenated HMM database.
    subprocess.run(
        [
            "hmmer_build.py",
            "-i", trimmed_fasta,
            "-o", f"effector_no_consensus/{profile_name}.hmm",
            "--name", profile_name,
            "--acc", profile_name,
            "--desc", profile_name,
        ],
        check=True,
    )

In [11]:
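# concatenate the per-profile hmm files into a single database; any previous combined file is removed first so that it is not matched by the *.hmm glob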
!rm -f CARF_SAVED_no_consensus/CARF.hmm
!cat CARF_SAVED_no_consensus/*.hmm > CARF_SAVED_no_consensus/CARF.hmm
#!ln -s CARF_SAVED_no_consensus/CARF_SAVED_domains.hmm CARF_SAVED_no_consensus/CARF_SAVED_copy.hmm # make a symlink so that we can search twice with the same database in different ways with the same 
#!cp CARF_SAVED_no_consensus/CARF_SAVED_domains.hmm CARF_SAVED_no_consensus/CARF_SAVED_copy.hmm # make a symlink so that we can search twice with the same database in different ways with the same 

!rm -f effector_no_consensus/effectors.hmm
!cat effector_no_consensus/*.hmm > effector_no_consensus/effectors.hmm

In [17]:
# Load the Makarova et al. 2020 metadata table.
metadata = pd.read_csv("../resources/CARF_metadata.tsv", sep="\t")
# Replace missing group assignments with "". The result is assigned back to
# the column: calling fillna(inplace=True) on `metadata[col]` is chained
# assignment, which warns on recent pandas and no longer updates `metadata`
# once Copy-on-Write is enabled.
metadata["Final group assignments"] = metadata["Final group assignments"].fillna("")

In [18]:
metadata

Unnamed: 0,Local ID,Other Protein ID,Profile superfamily,Profile ID,E-value,Identity cluster,Order of leaves in full sequence tree,Superclusters based on CARF/SAVED domain tree,Final group assignments,Genome Name,Protein Length,"CDD asignments, automatic with E-value 0.0001 (CARF only)",Domain organization,TMHMM result,Current genbank annotation,Potential Ring Nucleases,RN group
0,MK_CARF_00011497,WP_014788072.1,CARF,CARF.89.FASTA,3.180000e-57,3606,2524,CL1_CARF,CARF_m1,Thermococcus cleftensis,160,,CARF,,hypothetical protein,RN(1),RN_2_5
1,MK_CARF_00101059,YP_006424225.1,CARF,CARF.89.FASTA,3.180000e-57,3606,2525,CL1_CARF,CARF_m1,Thermococcus cleftensis,160,,CARF,,hypothetical protein CL1_0219,RN(1),RN_2_5
2,MK_CARF_00011498,WP_088864412.1,CARF,CARF.89.FASTA,3.180000e-57,1115,2526,CL1_CARF,CARF_m1,Thermococcus barossii,160,,CARF,,hypothetical protein,RN(1),RN_2_5
3,MK_CARF_00011442,WP_088180185.1,CARF,CARF.89.FASTA,3.890000e-58,2701,2527,CL1_CARF,CARF_m1,Thermococcus sp. 5-4,160,,CARF,,hypothetical protein,RN(1),RN_2_5
4,MK_CARF_00011410,WP_088866426.1,CARF,CARF.89.FASTA,6.760000e-59,2988,2528,CL1_CARF,CARF_m1,Thermococcus radiotolerans,160,,CARF,,hypothetical protein,RN(1),RN_2_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7138,MK_CARF_00012566,WP_051667291.1,SAVED,SAVED.104.FASTA,5.840000e-39,3587,5367,cl1_SAVED,,Bacillus lehensis,492,,SAVED_ABhydrolase,,alpha/beta hydrolase,Non-RN,
7139,MK_CARF_ADD00000372,PYQ55858.1,SAVED,SAVED.98.FASTA,,135a,6210,cl1_SAVED,,Acidobacteria bacterium,555,,SAVED_HATPase_Alba,,hypothetical protein DMF53_27210,Non-RN,
7140,MK_CARF_ADD00000373,WP_052549029.1,SAVED,SAVED.98.FASTA,,139a,6211,cl1_SAVED,,Enhygromyxa salina,551,,SAVED_HATPase_Alba,,SAVED domain-containing protein,Non-RN,
7141,MK_CARF_00013068,WP_021722632.1,SAVED,SAVED.104.FASTA,4.040000e-20,3740,6212,cl1_SAVED,,Lactococcus lactis,413,,SAVED,,hypothetical protein,Non-RN,


In [19]:
# Keep rows whose profile superfamily is CARF and whose domain organization is
# exactly "CARF" (i.e. no previously annotated fusion), then show how many
# proteins remain.
is_carf_superfamily = metadata["Profile superfamily"] == "CARF"
is_carf_only = metadata["Domain organization"] == "CARF"
CARF_metadata = metadata[is_carf_superfamily & is_carf_only]
len(CARF_metadata["Local ID"])

1844

In [None]:
# Get just the CARF sequences, not the SAVED sequences, and not the RtcR CARFs:
# extract the records whose IDs are listed in CARF_metadata["Local ID"] from
# the full FASTA into no_effector.gb.
# check=True makes a failed selection raise here instead of letting the next
# steps run on a missing or stale no_effector.gb.
subprocess.run(
    ["select_by_contig.py", "-i", "../resources/CARF_full.fasta", "-o", "no_effector.gb", "--contigs"]
    + CARF_metadata["Local ID"].tolist(),
    check=True,
)

In [23]:
# annotate sequences with the best CARF domain (E-value cutoff 1e-3)
# check=True makes a failed annotation raise here instead of letting the next
# steps run on a missing or stale CARF_loc.gb.
subprocess.run(
    ["domainate.py", "-i", "no_effector.gb", "-r", "CARF_SAVED_no_consensus/CARF.hmm", "-o", "CARF_loc.gb", "-e", "1e-3"],
    check=True,
)


CompletedProcess(args=['domainate.py', '-i', 'no_effector.gb', '-r', 'CARF_SAVED_no_consensus/CARF.hmm', '-o', 'CARF_loc.gb', '-e', '1e-3'], returncode=0)

In [24]:
# extract the longest regions ( >= 50 aa), not containing the best CARF domain.
! extract_unannotated.py -i CARF_loc.gb -o unannotated.gb --largest --lb 50 --keep_name

In [25]:
# add pfam and effector domain annotations
! domainate.py -i unannotated.gb -r effector_no_consensus/effectors.hmm -e 0.001 --max_overlap 0.6 -o tmp.gb

! domainate.py -i tmp.gb -r ../resources/Pfam-A.hmm -e 0.001 --max_overlap 0.6 -o new_annots.gb

! rm -f tmp.gb

In [26]:
# Calculate pairwise sequence similarity scores between all proteins
! seq_dist.py -i new_annots.gb  -r new_annots.gb  --sparse scores.hdf5 --mode efi_score 

In [27]:
# Tabular report where each row is a single contig
! enum_report.py -i new_annots.gb --domains --domain_descriptions -o annotations.tsv --length --sequence 

In [28]:
# Summary of contig lengths and domain content
! summary_report.py -i new_annots.gb --html summary.html  -o /dev/stdout

Contig Stats

contigs: 1524
CDSs: 0
CDSs per 10 kb:  0.00

    --------------------------------------------------------------------------------------------------------
    |                                            Contig Lengths                                            |
    --------------------------------------------------------------------------------------------------------

 248|    o                                               
 235|    o                                               
 222|    o                                               
 209|    o                                               
 196|    o                                               
 183|    o                                               
 170|   oo                                               
 157|   oo                                               
 144|   ooo                                              
 131|   ooo                                              
 118|   ooo                      

In [29]:
# Summary of similarity matrix
! matrix_report.py -i scores.hdf5 -o /dev/stdout

Matrix Report
Total values: 2322576
Non-zero values: 78500
Mean: 25.97977482447599
Median: 16.35729514783766
Min: 4.878998615944094
Max: 270.06738811011354
        

      ------------------------------------------------------------------------------------------------------
      |                                           Matrix Values                                            |
      ------------------------------------------------------------------------------------------------------

 24957|  o                                                
 23644|  o                                                
 22330|  o                                                
 21017|  o                                                
 19703|  o                                                
 18390|  o                                                
 17076|  o                                                
 15763|  o                                                
 14449|  o                        

In [30]:
# Generate an SSN, colored by CARF group.
# After the SSN is built, it can be visualized with Cytoscape. Use the Layout->Prefuse Force Directed Layout to visualize the network.
! build_ssn.py -i scores.hdf5 --lb 5 --xgmml network.xgmml --metadata annotations.tsv ../resources/CARF_metadata.tsv --color_by "Final group assignments" --cluster_tsv clusters.tsv --cluster

In [None]:
### The rest of these steps are not depicted in the flow chart figure


# make MSAs and compare clusters to Pfam
# For clusters 1..12: write the member names, extract the member records,
# convert them to FASTA, align them with MAFFT, and finally build one HMM per
# alignment.
# NOTE(review): only clusters 1-12 are processed; presumably these are the
# clusters of interest in clusters.tsv - confirm against the build_ssn.py output.
clusters_table = pd.read_table("clusters.tsv", names=["Name", "cluster"])
cluster_ids = range(1, 13)

for idx in cluster_ids:
    members = clusters_table.loc[clusters_table["cluster"] == idx, "Name"]
    members.to_csv(f"cluster_{idx}.tsv", sep="\t", index=False, header=False)
    # check=True on every step: each command consumes the previous one's
    # output, so a failure should stop the cell rather than propagate.
    subprocess.run(
        ["select_by_contig.py", "-i", "new_annots.gb", "-o", f"effector_cluster_{idx}.gb", "--contigs_file", f"cluster_{idx}.tsv"],
        check=True,
    )
    subprocess.run(
        ["genbank_to_fasta.py", "-i", f"effector_cluster_{idx}.gb", "-o", f"effector_cluster_{idx}.fasta"],
        check=True,
    )
    # MAFFT writes the alignment to stdout; the context manager closes (and
    # flushes) the output file instead of leaking one open handle per cluster.
    with open(f"effector_cluster_{idx}.mafft.fasta", "w") as msa_out:
        subprocess.run(
            ["mafft", "--maxiterate", "1000", "--globalpair", f"effector_cluster_{idx}.fasta"],
            stdout=msa_out,
            check=True,
        )

for idx in cluster_ids:
    subprocess.run(
        [
            "hmmer_build.py",
            "-i", f"effector_cluster_{idx}.mafft.fasta",
            "-o", f"effector_cluster_{idx}.mafft.hmm",
            "--name", f"effector_cluster_{idx}",
            "--acc", f"effector_cluster_{idx}",
            "--desc", f"effector_cluster_{idx}",
        ],
        check=True,
    )
!cat effector_cluster_*.mafft.hmm > effector_clusters.hmm

!hmmer_compare.py -i effector_clusters.hmm -r ../resources/Pfam-A.hmm -o effector_clusters_vs_pfam.tsv --cpu 8 -k 10
    
;

In [32]:
# create table of similarity between clusters and Makarova et al. 2020 effectors
!hmmer_compare.py -i effector_clusters.hmm -r effector_no_consensus/effectors.hmm -o effector_clusters_vs_makarova_effectors.tsv --cpu 8 -k 10

In [33]:
# pull random representatives from each cluster, to run AlphaFold2 on and do further annotation.
# Up to two sequences are written per cluster (clusters 1..12). A fixed seed
# keeps the selection reproducible between runs.
import random

seed = 15
seeded_random = random.Random(seed)

with open("cluster_representatives.fasta", "w") as out_handle:
    for cluster in range(1, 13):
        with open(f"effector_cluster_{cluster}.fasta", "r") as in_handle:
            seqs = list(SimpleFastaParser(in_handle))
        # Shuffle, then take the first two: this keeps the same picks as
        # before for a given seed, while the slice tolerates clusters that
        # contain fewer than two sequences (indexing seqs[1] would raise).
        seeded_random.shuffle(seqs)
        for title, seq in seqs[:2]:
            out_handle.write(f">Cluster{cluster}_{title}\n{seq}\n")


In [34]:
# supplementary table generation

! enum_report.py -i new_annots.gb  --length --domains --domain_descriptions -o annotations.Pfam.tsv --sequence --databases Pfam-A  --column_names "Name" "Length" "Pfam 36 Domains" "Pfam 36 Domain Descriptions" "Sequence"
! enum_report.py -i new_annots.gb  --domains -o annotations.effector.tsv --databases effectors --column_names "Name" "CARF effector Domains (Makarova et al. 2020)"
#! enum_report.py -i new_annots.gb --domains --domain_descriptions -o annotations.Pfam.tsv --length --sequence --databases Pfam-A

In [35]:

# Build the per-protein supplementary table: Pfam annotations, effector
# annotations and cluster membership are outer-joined on "Name"; the protein
# IDs and group assignments from the metadata table are then left-joined on.
pfam_annots = pd.read_table("annotations.Pfam.tsv", sep="\t")
effector_annots = pd.read_table("annotations.effector.tsv", sep="\t")
clusters_table = pd.read_table("clusters.tsv", names=["Name", "cluster"])

Pfam_with_effector = pfam_annots.merge(effector_annots, on="Name", how="outer")
Pfam_with_effector_and_clusters = Pfam_with_effector.merge(clusters_table, on="Name", how="outer")
CARF_supplementary_table_2 = Pfam_with_effector_and_clusters.merge(
    metadata[["Local ID", "Other Protein ID", "Final group assignments"]],
    left_on="Name",
    right_on="Local ID",
    how="left",
)

def count_join(list_of_strings: list, input_sep: str = "; ", output_sep: str = ", ") -> str:
    """Count the items found in a collection of delimited strings.

    Each non-empty string is split on ``input_sep`` and every resulting item
    is tallied. The tallies are returned as ``"item (count)"`` entries joined
    with ``output_sep``, most frequent first; items with equal counts keep the
    order in which they were first seen.

    Args:
        list_of_strings: iterable of strings; empty strings are skipped.
        input_sep: separator between items inside each string.
        output_sep: separator between the entries of the returned string.

    Returns:
        The formatted tallies, or ``""`` when there is nothing to count.
    """
    from collections import Counter

    counts = Counter()
    for s in list_of_strings:
        if s == "":
            # "".split(sep) would yield [""] and add a bogus empty item.
            continue
        counts.update(s.split(input_sep))
    # most_common() sorts by count, descending, and is stable for ties.
    return output_sep.join(f"{item} ({n})" for item, n in counts.most_common())


# Per-cluster summary table: number of proteins in each cluster, plus
# "value (count)" tallies of the CARF groups, Pfam 36 domains and CARF effector
# domains (Makarova et al. 2020) seen in it. Missing values are replaced with
# "" first so that blank entries are skipped by count_join instead of showing
# up as trailing separators.
CARF_supplementary_table_2.fillna("", inplace=True)
CARF_supplementary_table_1 = (
    CARF_supplementary_table_2[
        [
            "cluster",
            "Pfam 36 Domains",
            "CARF effector Domains (Makarova et al. 2020)",
            "Final group assignments",
            "Name",
        ]
    ]
    .groupby("cluster")
    .agg(
        {
            "Name": "count",
            "Final group assignments": count_join,
            "Pfam 36 Domains": count_join,
            "CARF effector Domains (Makarova et al. 2020)": count_join,
        }
    )
    .reset_index()
    # rename "Name" to "count" and "Final group assignments" to "CARF group"
    .rename(columns={"Name": "count", "Final group assignments": "CARF group"})
)

In [36]:
CARF_supplementary_table_2.columns

Index(['Name', 'Length', 'Pfam 36 Domains', 'Pfam 36 Domain Descriptions',
       'Sequence', 'CARF effector Domains (Makarova et al. 2020)', 'cluster',
       'Local ID', 'Other Protein ID', 'Final group assignments'],
      dtype='object')

In [37]:
# Write both supplementary tables as tab-separated files, without the index column.
for out_name, table in (
    ("CARF_supplementary_table_1.tsv", CARF_supplementary_table_1),
    ("CARF_supplementary_table_2.tsv", CARF_supplementary_table_2),
):
    table.to_csv(out_name, sep="\t", index=False)