# Encapsulin Hits against Phage Capsids

We've searched ≈5500 existing encapsulin sequences against ≈13,000 phage capsid sequences. Let's analyze the identity distributuon of these best hits, similar to what we've done previously in `encapsulin_phage_hits.ipynb`.

In [6]:
import pandas as pd

hits_df = pd.read_csv("../natural_encapsulin_phage_hits.tsv", sep="\t", names=["Query", "Target", "Identity", "Alignment Length", "Mismatches", "Gap Openings",
                                                              "Query Start", "Query End", "Target Start", "Target End", "E-Value", "Bitscore"])

hits_df.head()

Unnamed: 0,Query,Target,Identity,Alignment Length,Mismatches,Gap Openings,Query Start,Query End,Target Start,Target End,E-Value,Bitscore
0,family_1_tr|A0A3C0YTJ9|A0A3C0YTJ9_9BACT,tr|D6PSX7|D6PSX7_9CAUD Putative capsid protein...,0.262,179,117,6,73,241,79,252,5.416e-08,53
1,family_2_tr|L8EQJ9|L8EQJ9_STRR1,tr|A0A6N1NNH2|A0A6N1NNH2_9VIRU Type 2A encapsu...,0.33,215,142,2,247,460,74,287,2.664e-32,132
2,family_2_tr|N9D6U3|N9D6U3_ACICA,tr|A0A6N1NRT6|A0A6N1NRT6_9VIRU Membrane protei...,0.357,277,175,2,17,292,13,287,7.593999999999999e-50,179
3,family_2_tr|Q2J566|Q2J566_FRACC,tr|A0A6N1NRT6|A0A6N1NRT6_9VIRU Membrane protei...,0.334,215,138,4,246,457,74,286,5.673e-27,115
4,family_2_tr|S3JGR5|S3JGR5_MICAE,tr|A0A6N1NRT6|A0A6N1NRT6_9VIRU Membrane protei...,0.329,279,181,3,18,294,13,287,4.1780000000000004e-43,159


In [7]:
binned_df = hits_df.sort_values(by="Identity", ascending=False).drop_duplicates(subset="Query").loc[:, ["Query", "Identity"]]
binned_df["Identity"] = pd.cut(binned_df["Identity"], [0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95, 1], labels=[str(i / 10) for i in range(2, 11)])
binned_df["Identity"].value_counts()

0.3    3027
0.4     527
0.2     230
0.5       0
0.6       0
0.7       0
0.8       0
0.9       0
1.0       0
Name: Identity, dtype: int64

In [8]:
hits_df.describe()

Unnamed: 0,Identity,Alignment Length,Mismatches,Gap Openings,Query Start,Query End,Target Start,Target End,E-Value,Bitscore
count,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0,3791.0
mean,0.317789,229.603535,145.885518,3.496439,154.066737,377.027961,66.284094,290.476655,5.540184e-05,116.761013
std,0.034247,49.464022,32.960392,2.553166,120.858737,93.524059,52.115811,28.378918,0.0001794862,44.866808
min,0.184,45.0,28.0,0.0,1.0,45.0,1.0,118.0,4.182e-53,37.0
25%,0.303,214.0,135.0,2.0,16.0,292.0,13.0,286.0,7.763000000000001e-43,98.0
50%,0.327,223.0,147.0,3.0,219.0,445.0,65.0,287.0,2.0510000000000002e-29,123.0
75%,0.34,277.0,175.0,5.0,246.0,459.0,74.0,287.0,8.0825e-22,158.0
max,0.419,335.0,201.0,15.0,377.0,526.0,272.0,437.0,0.0009803,188.0


In [9]:
from Bio import SeqIO

phage_capsid_hits  = set(binned_df["Query"].unique())
all_encapsulins = set([str(record.id) for record in SeqIO.parse("../family_1_2_3_natural_encapsulins.fasta", "fasta")])
missing_encapsulins = all_encapsulins.difference(phage_capsid_hits)

missing_encapsulins_df = pd.DataFrame([{"Query": mgyp, "Identity": "No Hit"} for mgyp in missing_encapsulins])
binned_df = pd.concat([binned_df, missing_encapsulins_df])
binned_df

Unnamed: 0,Query,Identity
471,family_1_tr|A0A358AU76|A0A358AU76_9FIRM,0.4
1349,family_1_tr|A0A2H1JUR0|A0A2H1JUR0_9MICO,0.4
1002,family_1_tr|A0A1X6WW20|A0A1X6WW20_9MICO,0.4
3566,family_1_tr|B2V6Y3|B2V6Y3_SULSY,0.4
2287,family_2_tr|A0A349JP39|A0A349JP39_9CYAN,0.4
...,...,...
1970,family_1_tr|A0A1V6A1S9|A0A1V6A1S9_9BACT,No Hit
1971,family_1_tr|F4G378|F4G378_METCR,No Hit
1972,family_1_tr|A0A3M2KZW2|A0A3M2KZW2_9NOCA,No Hit
1973,family_1_tr|A0A2N8LLA7|A0A2N8LLA7_9MYCO,No Hit


In [10]:
import plotly.express as px

fig = px.bar(binned_df.groupby("Identity").count().reset_index(), 
             x="Identity", y="Query", 
             color_discrete_sequence=["rgb(95, 70, 144)"],
             category_orders={"Identity": ["No Hit", "0.2", "0.3", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", "1.0"]},
             labels={"Query": "Count"})

fig.update_layout(
    template="plotly_white",
    width=1400,
    height=700,
    font=dict(size=18),
    title="Sequence Identity of Existing Encapsulins with Phage Capsids",
)

fig.update_traces(marker_line_width=1,marker_line_color="white")

fig.write_image("../plots/phage_capsid_identities.svg")
fig.write_image("../plots/phage_capsid_identities.png")
fig.show()