In [12]:
# Data manipulation 
import pandas as pd 
import numpy as np
import os

# Data viz
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

# Bifunctional class C–class D β-lactamases

The set of IDs comes from: [Genomic analysis of bifunctional Class C-Class D β-lactamases in environmental bacteria](https://pubmed.ncbi.nlm.nih.gov/29846396/)
1. WP_082565248.1|CD2-1| class C beta-lactamase
1. WP_082507116.1|CD4-2| class C beta-lactamase
1. WP_082591432.1|CD4-1| class C beta-lactamase	
1. SFG43659.1|CD3-1| Beta-lactamase class D [Duganella sp. CF458]
1. OEZ55387.1|CD1-1| beta-lactamase [Duganella sp. HH105]	
1. SHH20105.1|CD5-1| Beta-lactamase class D [Massilia sp. CF038]	
1. WP_082552146.1|CD6-1| class C beta-lactamase		
1. KQW93884.1|CD7-1| hypothetical protein ASC94_15090 [Massilia sp. Root418]
1. ACH58991.1|LRA13-1| bifunctional class C beta-lactamase-class D beta-lactamase fusion protein LRA-13

In [13]:
# Sequence annotations (the file is tab-separated).
df_annot = pd.read_csv("../results/tables/df_annot_all.csv", sep="\t")

# CARP likelihoods: keep only the id and log-probability columns, then attach
# them to the annotations.
# NOTE(review): read_pickle executes whatever the pickle contains; only load
# files produced by this project.
df_plm = pd.read_pickle("../results/embeddings/all_plm.pkl")[["seq_id", "carp640M_logp"]]
df_annot_plm = df_annot.merge(df_plm, on="seq_id")

# t-SNE coordinates for the SBL set, annotated, plus one view per class.
df_sbl = pd.read_csv(
    "../results/dim_redo/splitted_classes/tsne/tsne_2d_plm_sbl_esm1b.csv"
).merge(df_annot_plm, on="seq_id")
df_sbl_a = df_sbl.loc[df_sbl["bla_class"] == "Class A"]
df_sbl_c = df_sbl.loc[df_sbl["bla_class"] == "Class C"]
df_sbl_d = df_sbl.loc[df_sbl["bla_class"] == "Class D"]

# t-SNE coordinates for the MBL set, annotated.
df_mbl = pd.read_csv(
    "../results/dim_redo/splitted_classes/tsne/tsne_2d_plm_mbl_esm1b.csv"
).merge(df_annot_plm, on="seq_id")

# Per-class t-SNE datasets, annotated.
# Note: df_b and df_mbl are exactly the same dataset.
# The protein_family_header column is dropped before merging so it is not
# duplicated in the merged frame.
df_a = (
    pd.read_csv("../results/dim_redo/splitted_classes/tsne/tsne_2d_perclass_Class_A_esm1b.csv")
    .drop(columns="protein_family_header")
    .merge(df_annot_plm, on="seq_id")
)
df_c = (
    pd.read_csv("../results/dim_redo/splitted_classes/tsne/tsne_2d_perclass_Class_C_esm1b.csv")
    .drop(columns="protein_family_header")
    .merge(df_annot_plm, on="seq_id")
)
df_d = (
    pd.read_csv("../results/dim_redo/splitted_classes/tsne/tsne_2d_perclass_Class_D_esm1b.csv")
    .drop(columns="protein_family_header")
    .merge(df_annot_plm, on="seq_id")
)
df_b = (
    pd.read_csv("../results/dim_redo/splitted_classes/tsne/tsne_2d_perclass_Class_B_esm1b.csv")
    .drop(columns="protein_family_header")
    .merge(df_annot_plm, on="seq_id")
)

# Representative datasets; these are merged with the plain annotations
# (df_annot), not the likelihood-augmented table.
df_sbl90 = pd.read_csv(
    "../results/dim_redo/splitted_classes/tsne/tsne_2d_clust90_sbl_esm1b.csv"
).merge(df_annot, on="seq_id")
df_mbl90 = pd.read_csv(
    "../results/dim_redo/splitted_classes/tsne/tsne_2d_clust90_mbl_esm1b.csv"
).merge(df_annot, on="seq_id")

In [14]:
def do_iscatter(df, annot):
    """
    Display an interactive plotly scatter plot of t-SNE coordinates.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'tSNE1_esm1b' and 'tSNE2_esm1b' (x / y),
        'protein_name' (hover title), and 'protein_family_header' and
        'bla_class' (extra hover fields).
    annot : str
        Passed to ``px.scatter`` as ``color``; the points are coloured by it.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()`` and not returned.
    """
    # Styling shared by the calls below.
    marker_style = dict(size=24, line=dict(width=1, color='black'))
    axis_line = dict(showline=True, linewidth=1, linecolor='LightGrey')

    fig = px.scatter(
        df,
        x='tSNE1_esm1b',
        y='tSNE2_esm1b',
        color=annot,
        hover_name="protein_name",
        hover_data=["protein_family_header", 'bla_class'],
        color_discrete_sequence=px.colors.qualitative.Plotly,
        height=800,
        width=900,
    )

    # Large, black-outlined markers on a white template with light axis lines.
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))
    fig.update_layout(template="plotly_white")
    fig.update_yaxes(**axis_line)
    fig.update_xaxes(**axis_line)
    fig.show()

In [15]:
# Accessions of the bifunctional class C-class D beta-lactamases to highlight.
ids = ["WP_082565248.1", "WP_082507116.1", "WP_082591432.1", "SFG43659.1",
       "OEZ55387.1", "SHH20105.1", "WP_082552146.1", "KQW93884.1", "ACH58991.1"]

# Flag rows whose "#name" contains any of the accessions.
# Each accession is matched as a literal substring (regex=False) so the "."
# before the version number is not treated as a regex wildcard, and missing
# names count as non-matches (na=False) so the mask stays strictly boolean
# and can be negated safely.
is_listed = pd.Series(False, index=df_sbl.index)
for accession in ids:
    is_listed |= df_sbl["#name"].str.contains(accession, regex=False, na=False)

filters = [is_listed, ~is_listed]

values = ["Yes", "No"]

# Give np.select an explicit string default: its implicit default is the
# integer 0, which does not share a dtype with the string choices.
df_sbl["Match"] = np.select(filters, values, default="No")

In [16]:
# Plot only the rows whose bla_class contains "Class C", coloured by the
# "Match" flag.
do_iscatter(df_sbl.loc[df_sbl["bla_class"].str.contains("Class C")], "Match")