# Summary
Read the class info file for MiBIG entries, and report probability tables.

In [1]:
import os
import urllib
import glob
import json
import time
import numpy as np
import pandas as pd
from collections import OrderedDict

In [2]:
mibig_folder = "/mnt/scratch/louwe015/mibig_json_2.0/"

base_path = "/mnt/scratch/louwe015/NPLinker/classifying/mibig_classifications/"
out_base = os.path.split(base_path[:-1])[0]
mibig_classes = os.path.join(out_base, "MIBiG2.0_compounds_with_AS_BGC_CF_NPC_classes.txt")
print("mibig_classes exists is", os.path.isfile(mibig_classes))

mibig_classes exists is True


## Reading file
Making dict of {chem_id: [[bgc_classes], [chem_classes]]}, where bgc_classes is [class, subclass, AS_class], and chem_classes is the CF/NPC classes in order of input file

In [3]:
classes_dict = {}
with open(mibig_classes) as inf:
    header = inf.readline()
    print(header)
    for line in inf:
        elems = line.strip().split("\t")
        chem_id = elems.pop(0)
        class_base = elems.pop(0).split(',')
        classes = [cls.partition(':')[0] for cls in class_base]
        sub_classes = [cls for cls in class_base if cls.split(":")[1]]
        as_classes = elems.pop(0).split(',')

        bgc_classes = [classes, sub_classes, as_classes]        
        chem_classes = [chem_cls.split('; ') for chem_cls in elems[2:]]
        classes_dict[chem_id] = [bgc_classes, chem_classes]

print(classes_dict[chem_id])  # example
print(classes_dict['BGC0000020_maytansine'])
classes_dict['BGC0000001_abyssomicin C']

compound_name	class:subclass	as_classes	smiles	inchi_key	cf_kingdom	cf_superclass	cf_class	cf_subclass	cf_direct_parent	npc_class	npc_superclass	npc_pathway	npc_isglycoside

[[['Other'], [], ['phosphonate']], [[''], [''], [''], [''], [''], [''], [''], [''], ['0']]]
[[['Polyketide'], [], ['T1PKS', 'NRPS-like']], [['Organic compounds'], ['Phenylpropanoids and polyketides'], ['Macrolactams'], [''], ['Macrolactams'], ['Ansa macrolides'], ['Macrolides'], ['Polyketides'], ['0']]]


[[['Polyketide'], ['Polyketide:Other'], ['T1PKS']],
 [['Organic compounds'],
  ['Organoheterocyclic compounds'],
  ['Oxanes'],
  [''],
  ['Oxanes'],
  ['Spirotetronate macrolides'],
  ['Macrolides'],
  ['Polyketides'],
  ['0']]]

In [37]:
classes_dict['BGC0000199_arimetamycin B']

[[['Polyketide', 'Saccharide'],
  ['Polyketide:Tetracycline', 'Saccharide:hybrid/tailoring'],
  ['T2PKS', 'oligosaccharide']],
 [['Organic compounds'],
  ['Phenylpropanoids and polyketides'],
  ['Anthracyclines'],
  [''],
  ['Anthracyclines'],
  ['Angucyclines', 'Anthraquinones and anthrones'],
  ['Polycyclic aromatic polyketides'],
  ['Polyketides'],
  ['1']]]

In [4]:
# creating legend from the header
s_h = header.strip().split('\t')
legend_bgc = ['mibig_classes']+s_h[1:3]
legend_chem = s_h[5:]
print(legend_bgc, legend_chem)

['mibig_classes', 'class:subclass', 'as_classes'] ['cf_kingdom', 'cf_superclass', 'cf_class', 'cf_subclass', 'cf_direct_parent', 'npc_class', 'npc_superclass', 'npc_pathway', 'npc_isglycoside']


## Do comparisons between bgc and chem classes

In [5]:
from collections import defaultdict
def rec_dd():
    """Initialises a recurring defaultdict"""
    return defaultdict(rec_dd)

In [6]:
# aggregate pairwise class matrices for all compounds
result = rec_dd()
# loop through each mibig compound
for mibig_chem_id, (bgc_classes, chem_classes) in classes_dict.items():
# get all combinations of classes for this compound
    for i, bgc_cat in enumerate(legend_bgc):
        init_bgc_class = bgc_classes[i]
        if not init_bgc_class or init_bgc_class == ['']:
            continue

        bgc_class = init_bgc_class[:]  # if no exceptions, just assign classes

        # do some cleanup for mibig classes
        if bgc_cat == "mibig_classes":
            # group pks-nrp hybrids for MIBiG classes
            hyb_count = len([1 for init_bgc_c in init_bgc_class \
                             if any([test in init_bgc_c.lower() for test in ['nrp', 'pks', 'polyketide']])])
            if hyb_count >= 2:
                # if hybrid, reconstruct the bgc_class
                bgc_class = []
                bgc_class.append("PKS-NRP_Hybrids")
                # append other classes if there are more
                for init_bgc_c in init_bgc_class:
                    if not any([test in init_bgc_c.lower() for test in ['nrp', 'pks', 'polyketide']]):
                        bgc_class.append(init_bgc_c)

            # replace Alkaloid with Other in bgc_class
            bgc_class = ["Other" if bgc_c == "Alkaloid" else bgc_c for bgc_c in bgc_class]

        for j, chem_cat in enumerate(legend_chem):
            chem_class = chem_classes[j]
            if not chem_class or chem_class == ['']:
                continue

            for bgc_c in bgc_class:
                for chem_c in chem_class:
                    try:
                        result[bgc_cat][chem_cat][bgc_c][chem_c] += 1
                    except TypeError:
                        result[bgc_cat][chem_cat][bgc_c][chem_c] = 1

In [7]:
mibig_cls_npc_pathway = result['mibig_classes']['npc_pathway']  # MiBIG BGC class and NPC pathway
mibig_cls_npc_pathway

defaultdict(<function __main__.rec_dd()>,
            {'Polyketide': defaultdict(<function __main__.rec_dd()>,
                         {'Polyketides': 548,
                          'Shikimates and Phenylpropanoids': 20,
                          'Alkaloids': 72,
                          'Amino acids and Peptides': 24,
                          'Terpenoids': 35,
                          'Fatty acids': 14,
                          'Carbohydrates': 9}),
             'Other': defaultdict(<function __main__.rec_dd()>,
                         {'Polyketides': 36,
                          'Alkaloids': 193,
                          'Carbohydrates': 46,
                          'Amino acids and Peptides': 90,
                          'Shikimates and Phenylpropanoids': 21,
                          'Fatty acids': 19,
                          'Terpenoids': 5}),
             'Saccharide': defaultdict(<function __main__.rec_dd()>,
                         {'Carbohydrates': 60,
           

In [8]:
result.keys(),result['mibig_classes']['cf_subclass']

(dict_keys(['mibig_classes', 'class:subclass', 'as_classes']),
 defaultdict(<function __main__.rec_dd()>,
             {'Polyketide': defaultdict(<function __main__.rec_dd()>,
                          {'Terpene glycosides': 13,
                           'Depsipeptides': 1,
                           'Furanocoumarins': 7,
                           'Terpene lactones': 10,
                           'Carbohydrates and carbohydrate conjugates': 52,
                           'Fatty alcohols': 8,
                           'Nitrobenzenes': 4,
                           'Carbonyl compounds': 26,
                           '1-benzopyrans': 14,
                           'Isoindolines': 2,
                           'Amino acids, peptides, and analogues': 8,
                           'Medium-chain hydroxy acids and derivatives': 1,
                           'Imidothiolactones': 1,
                           'Diterpenoids': 4,
                           'Alcohols and polyols': 3,
         

In [9]:
df_mibig_cls_npc_pathway = pd.DataFrame.from_dict(mibig_cls_npc_pathway, dtype=int).fillna(0)
df_mibig_cls_npc_pathway

Unnamed: 0,Polyketide,Other,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Polyketides,548,36,64.0,19.0,177,115.0,4.0
Shikimates and Phenylpropanoids,20,21,3.0,1.0,6,9.0,0.0
Alkaloids,72,193,0.0,49.0,58,99.0,2.0
Amino acids and Peptides,24,90,9.0,2.0,187,320.0,113.0
Terpenoids,35,5,4.0,136.0,2,0.0,0.0
Fatty acids,14,19,0.0,0.0,12,2.0,0.0
Carbohydrates,9,46,60.0,1.0,1,8.0,0.0


In [10]:
df_mibig_cls_npc_pathway.style.background_gradient('Greens')

Unnamed: 0,Polyketide,Other,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Polyketides,548,36,64.0,19.0,177,115.0,4.0
Shikimates and Phenylpropanoids,20,21,3.0,1.0,6,9.0,0.0
Alkaloids,72,193,0.0,49.0,58,99.0,2.0
Amino acids and Peptides,24,90,9.0,2.0,187,320.0,113.0
Terpenoids,35,5,4.0,136.0,2,0.0,0.0
Fatty acids,14,19,0.0,0.0,12,2.0,0.0
Carbohydrates,9,46,60.0,1.0,1,8.0,0.0


In [11]:
s = df_mibig_cls_npc_pathway.style.bar(color=['#5fba7d'], axis=0, align='zero')
# s.style.bar(color=['#5fba7d'], axis=1, align='zero')
s

Unnamed: 0,Polyketide,Other,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Polyketides,548,36,64.0,19.0,177,115.0,4.0
Shikimates and Phenylpropanoids,20,21,3.0,1.0,6,9.0,0.0
Alkaloids,72,193,0.0,49.0,58,99.0,2.0
Amino acids and Peptides,24,90,9.0,2.0,187,320.0,113.0
Terpenoids,35,5,4.0,136.0,2,0.0,0.0
Fatty acids,14,19,0.0,0.0,12,2.0,0.0
Carbohydrates,9,46,60.0,1.0,1,8.0,0.0


In [12]:
# rules for BGC -> pathway (read column to row)
df_mibig_cls_npc_pathway/df_mibig_cls_npc_pathway.sum(axis=0)

Unnamed: 0,Polyketide,Other,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Polyketides,0.759003,0.087805,0.457143,0.091346,0.399549,0.207957,0.033613
Shikimates and Phenylpropanoids,0.027701,0.05122,0.021429,0.004808,0.013544,0.016275,0.0
Alkaloids,0.099723,0.470732,0.0,0.235577,0.130926,0.179024,0.016807
Amino acids and Peptides,0.033241,0.219512,0.064286,0.009615,0.422122,0.578662,0.94958
Terpenoids,0.048476,0.012195,0.028571,0.653846,0.004515,0.0,0.0
Fatty acids,0.019391,0.046341,0.0,0.0,0.027088,0.003617,0.0
Carbohydrates,0.012465,0.112195,0.428571,0.004808,0.002257,0.014467,0.0


In [13]:
# rules for pathway -> bgc (read column to row)
df_mibig_cls_npc_pathway.T/df_mibig_cls_npc_pathway.sum(axis=1)

Unnamed: 0,Polyketides,Shikimates and Phenylpropanoids,Alkaloids,Amino acids and Peptides,Terpenoids,Fatty acids,Carbohydrates
Polyketide,0.569055,0.333333,0.15222,0.032215,0.192308,0.297872,0.072
Other,0.037383,0.35,0.408034,0.120805,0.027473,0.404255,0.368
Saccharide,0.066459,0.05,0.0,0.012081,0.021978,0.0,0.48
Terpene,0.01973,0.016667,0.103594,0.002685,0.747253,0.0,0.008
PKS-NRP_Hybrids,0.183801,0.1,0.122622,0.251007,0.010989,0.255319,0.008
NRP,0.119418,0.15,0.209302,0.42953,0.0,0.042553,0.064
RiPP,0.004154,0.0,0.004228,0.151678,0.0,0.0,0.0


In [14]:
df_mibig_cls_cf_superclass = pd.DataFrame.from_dict(result['mibig_classes']['cf_superclass'], dtype=int).fillna(0)
df_mibig_cls_cf_superclass

Unnamed: 0,Polyketide,Other,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Organoheterocyclic compounds,110.0,149.0,6.0,45.0,67.0,55.0,0.0
Lipids and lipid-like molecules,83.0,20.0,3.0,93.0,23.0,3.0,0.0
Organic acids and derivatives,31.0,83.0,8.0,5.0,119.0,281.0,58.0
Phenylpropanoids and polyketides,159.0,26.0,17.0,1.0,45.0,3.0,6.0
Organic oxygen compounds,85.0,20.0,78.0,10.0,18.0,7.0,0.0
Alkaloids and derivatives,2.0,11.0,0.0,3.0,3.0,1.0,0.0
Benzenoids,109.0,17.0,10.0,10.0,8.0,14.0,0.0
Organosulfur compounds,1.0,0.0,0.0,0.0,2.0,0.0,1.0
"Organic 1,3-dipolar compounds",3.0,2.0,0.0,0.0,0.0,0.0,0.0
Hydrocarbon derivatives,1.0,4.0,0.0,1.0,0.0,0.0,0.0


In [15]:
# rules for BGC -> superclass (read column to row)
df_mibig_cls_cf_superclass/df_mibig_cls_cf_superclass.sum(axis=0)

Unnamed: 0,Polyketide,Other,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Organoheterocyclic compounds,0.186125,0.429395,0.04918,0.263158,0.233449,0.147849,0.0
Lipids and lipid-like molecules,0.14044,0.057637,0.02459,0.54386,0.080139,0.008065,0.0
Organic acids and derivatives,0.052453,0.239193,0.065574,0.02924,0.414634,0.755376,0.637363
Phenylpropanoids and polyketides,0.269036,0.074928,0.139344,0.005848,0.156794,0.008065,0.065934
Organic oxygen compounds,0.143824,0.057637,0.639344,0.05848,0.062718,0.018817,0.0
Alkaloids and derivatives,0.003384,0.0317,0.0,0.017544,0.010453,0.002688,0.0
Benzenoids,0.184433,0.048991,0.081967,0.05848,0.027875,0.037634,0.0
Organosulfur compounds,0.001692,0.0,0.0,0.0,0.006969,0.0,0.010989
"Organic 1,3-dipolar compounds",0.005076,0.005764,0.0,0.0,0.0,0.0,0.0
Hydrocarbon derivatives,0.001692,0.011527,0.0,0.005848,0.0,0.0,0.0


In [16]:
# rules for superclass -> bgc (read column to row)
df_mibig_cls_cf_superclass.T/df_mibig_cls_cf_superclass.sum(axis=1)

Unnamed: 0,Organoheterocyclic compounds,Lipids and lipid-like molecules,Organic acids and derivatives,Phenylpropanoids and polyketides,Organic oxygen compounds,Alkaloids and derivatives,Benzenoids,Organosulfur compounds,"Organic 1,3-dipolar compounds",Hydrocarbon derivatives,Organic nitrogen compounds,Hydrocarbons,"Lignans, neolignans and related compounds","Nucleosides, nucleotides, and analogues",Organohalogen compounds,Organic Polymers
Polyketide,0.25463,0.368889,0.052991,0.618677,0.389908,0.1,0.64881,0.25,0.6,0.166667,0.166667,0.625,0.5,0.0,0.0,0.0
Other,0.344907,0.088889,0.14188,0.101167,0.091743,0.55,0.10119,0.0,0.4,0.666667,0.166667,0.0,0.5,1.0,1.0,0.0
Saccharide,0.013889,0.013333,0.013675,0.066148,0.357798,0.0,0.059524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Terpene,0.104167,0.413333,0.008547,0.003891,0.045872,0.15,0.059524,0.0,0.0,0.166667,0.0,0.375,0.0,0.0,0.0,0.0
PKS-NRP_Hybrids,0.155093,0.102222,0.203419,0.175097,0.082569,0.15,0.047619,0.5,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
NRP,0.127315,0.013333,0.480342,0.011673,0.03211,0.05,0.083333,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.1875
RiPP,0.0,0.0,0.099145,0.023346,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8125


In [17]:
# rules for mibig class -> subclass (read column to row)
result['mibig_classes']['cf_subclass']
df_mibig_cls_cf_subclass = pd.DataFrame.from_dict(result['mibig_classes']['cf_subclass'], dtype=int)
df_mibig_cls_cf_subclass

Unnamed: 0,Polyketide,Other,Saccharide,Terpene,PKS-NRP_Hybrids,NRP,RiPP
Terpene glycosides,13.0,,2.0,2.0,,,
Depsipeptides,1.0,1.0,2.0,,43.0,101.0,2.0
Furanocoumarins,7.0,,,,,,
Terpene lactones,10.0,,,3.0,1.0,,
Carbohydrates and carbohydrate conjugates,52.0,17.0,76.0,,6.0,6.0,
...,...,...,...,...,...,...,...
"1,4-benzodiazepines",,,,,,13.0,
Peptoid-peptide hybrids,,,,,,2.0,
Terphenyls,,,,,,5.0,
Benzazocines,,,,,,2.0,


In [18]:
scores_mibig_cls_cf_subclass = df_mibig_cls_cf_subclass/df_mibig_cls_cf_subclass.sum(axis=0)
scores_mibig_cls_cf_subclass["Other"].sort_values(ascending=False)

Amino acids, peptides, and analogues         0.234483
Indoles                                      0.089655
Carbazoles                                   0.068966
Carbohydrates and carbohydrate conjugates    0.058621
Pyridoindoles                                0.048276
                                               ...   
1,4-benzodiazepines                               NaN
Peptoid-peptide hybrids                           NaN
Terphenyls                                        NaN
Benzazocines                                      NaN
Isoquinoline quinones                             NaN
Name: Other, Length: 151, dtype: float64

In [19]:
df_mibig_cls_cf_subclass["Other"].sum(), 1/228

(290.0, 0.0043859649122807015)

In [20]:
bgc_class = 'as_classes'
spec_class = 'cf_subclass'
bgc_choose = "indole"
df_res = pd.DataFrame.from_dict(result[bgc_class][spec_class], dtype=int)
df_scores = df_res/df_res.sum(axis=0)
df_scores[bgc_choose].sort_values(ascending=False)


Pyridoindoles                           0.275862
Carbazoles                              0.206897
Indoles                                 0.149425
Amino acids, peptides, and analogues    0.103448
Pyrroloindoles                          0.057471
                                          ...   
Tetraterpenoids                              NaN
Steroidal glycosides                         NaN
Hopanoids                                    NaN
Branched unsaturated hydrocarbons            NaN
Alpha hydroxy acids and derivatives          NaN
Name: indole, Length: 133, dtype: float64

In [21]:
bgc_class = 'as_classes'
spec_class = 'npc_superclass'
bgc_choose = "indole"
df_res = pd.DataFrame.from_dict(result[bgc_class][spec_class], dtype=int)
df_scores = df_res/df_res.sum(axis=0)
df_res[bgc_choose].sort_values(ascending=False)[:20]


Tryptophan alkaloids               74.0
Peptide alkaloids                  13.0
Histidine alkaloids                 4.0
Polycyclic aromatic polyketides     3.0
Small peptides                      3.0
Oligopeptides                       2.0
Polyols                             1.0
Meroterpenoids                      1.0
Anthranilic acid alkaloids          1.0
Macrolides                          NaN
Chromanes                           NaN
Coumarins                           NaN
Xanthones                           NaN
Linear polyketides                  NaN
Ornithine alkaloids                 NaN
Cyclic polyketides                  NaN
Saccharides                         NaN
Naphthalenes                        NaN
Polyethers                          NaN
Pseudoalkaloids                     NaN
Name: indole, dtype: float64

## Visualisation

In [22]:
label_dict = OrderedDict()
label_dict.update({key:i for i, key in enumerate(mibig_cls_npc_pathway.keys())})
cur = len(label_dict)
for vals in mibig_cls_npc_pathway.values():
    for val in vals.keys():
        if val not in label_dict:
            label_dict[val] = cur
            cur += 1
label_vals = list(label_dict.keys())
label_dict, label_vals

(OrderedDict([('Polyketide', 0),
              ('Other', 1),
              ('Saccharide', 2),
              ('Terpene', 3),
              ('PKS-NRP_Hybrids', 4),
              ('NRP', 5),
              ('RiPP', 6),
              ('Polyketides', 7),
              ('Shikimates and Phenylpropanoids', 8),
              ('Alkaloids', 9),
              ('Amino acids and Peptides', 10),
              ('Terpenoids', 11),
              ('Fatty acids', 12),
              ('Carbohydrates', 13)]),
 ['Polyketide',
  'Other',
  'Saccharide',
  'Terpene',
  'PKS-NRP_Hybrids',
  'NRP',
  'RiPP',
  'Polyketides',
  'Shikimates and Phenylpropanoids',
  'Alkaloids',
  'Amino acids and Peptides',
  'Terpenoids',
  'Fatty acids',
  'Carbohydrates'])

In [23]:
label_colours = [
    "rgba(0,63,92,1)",
    "rgba(62,70,101,1)",
    "rgba(98,77,108,1)",
    "rgba(128,85,111,1)",
    "rgba(156,94,110,1)",
    "rgba(181,104,105,1)",
    "rgba(204,117,96,1)",
    "rgba(224,131,81,1)",
    "rgba(241,148,59,1)",
    "rgba(255,166,0,1)"]

In [24]:
colours_labels = [
    'Polyketide',
    'PKS-NRP_Hybrids',
    'NRP',
    'RiPP',
    'Saccharide',
    'Other',
    'Terpene',
    'Polyketides',
    'Shikimates and Phenylpropanoids',
    'Alkaloids',
    'Amino acids and Peptides',
    'Terpenoids',
    'Fatty acids',
    'Carbohydrates']

In [25]:
colour_dict = {
    'Polyketide': label_colours[0],
    'PKS-NRP_Hybrids': label_colours[1],
    'NRP': label_colours[2],
    'RiPP': label_colours[3],
    'Saccharide': label_colours[6],
    'Other': label_colours[7],
    'Terpene': label_colours[-1],
    'Polyketides': label_colours[0],
    'Amino acids and Peptides': label_colours[2],
    'Fatty acids': label_colours[4],
    'Shikimates and Phenylpropanoids': label_colours[5],
    'Alkaloids': label_colours[7],
    'Carbohydrates': label_colours[8],
    'Terpenoids': label_colours[-1]}

In [26]:
# colour_dict = {}
# colour_list = []
# for i, lab in enumerate(colours_labels):
#     try:
#         cur_col = label_colours[i]
#     except IndexError:
#         cur_col = 'blue'
#     colour_dict[lab] = cur_col
#     colour_list.append(cur_col)

In [27]:
source_vals = []
target_vals = []
value_vals = []
colour_vals = []
opac = '0.4'

for bgc_c, vals in mibig_cls_npc_pathway.items():
    for chem_c, count in vals.items():
        source_vals.append(label_dict[bgc_c])
        target_vals.append(label_dict[chem_c])
        value_vals.append(count)
        col = colour_dict[bgc_c]
        if col.startswith("rgba"):
            col = col.rpartition(',')[0] + f',{opac})'
        colour_vals.append(col)

In [33]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = label_vals,
      color = [colour_dict[lab] for lab in label_vals]
    ),
    link = dict(
      source = source_vals, # indices correspond to labels, eg A1, A2, A1, B1, ...
      target = target_vals,
      value = value_vals,
      color = colour_vals,
  ))])

fig_title = "MIBiG BGC classes vs NPClassifier pathways"
fig.update_layout(title_text=fig_title, font_size=10)
fig.show()
fig.write_html(os.path.join(out_base, '_'.join(fig_title.split(' '))) + ".html")

In [36]:
fig.update_layout(
#     autosize=False,
    width=700,
    height=500,
    font_size=12.2
)
fig.show()
fig.write_image(os.path.join(out_base, '_'.join(fig_title.split(' '))) + "_small.svg")

## Plotting function

In [38]:
from typing import Dict, Union
from copy import deepcopy

def make_class_sankey_plot(class_dict_ori: Dict[str, Dict[str, int]],
                           plot_title: str = "MIBiG BGC classes vs NPClassifier pathways",
                           cutoff: Union[None, int] = None) -> go.Figure:
    class_dict = deepcopy(class_dict_ori)
    if cutoff:
        for key, inner_dict in class_dict.items():
            inner_copy = deepcopy(list(inner_dict.items()))
            for inner_key, value in inner_copy:
                if value < cutoff:
                    del class_dict[key][inner_key]

    label_dict = OrderedDict()
    label_dict.update({key:i for i, key in enumerate(class_dict.keys())})
    cur = len(label_dict)
    for vals in class_dict.values():
        for val in vals.keys():
            if val not in label_dict:
                label_dict[val] = cur
                cur += 1
    label_vals = list(label_dict.keys())

    source_vals = []
    target_vals = []
    value_vals = []

    for bgc_c, vals in class_dict.items():
        for chem_c, count in vals.items():
            source_vals.append(label_dict[bgc_c])
            target_vals.append(label_dict[chem_c])
            value_vals.append(count)

    fig = go.Figure(data=[go.Sankey(
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(color = "black", width = 0.5),
          label = label_vals
        ),
        link = dict(
          source = source_vals, # indices correspond to labels, eg A1, A2, A1, B1, ...
          target = target_vals,
          value = value_vals
      ))])

    fig.update_layout(title_text=plot_title, font_size=10)
    return fig

In [39]:
print(result.keys())
result[list(result.keys())[0]].keys()

dict_keys(['mibig_classes', 'class:subclass', 'as_classes'])


dict_keys(['cf_kingdom', 'cf_superclass', 'cf_class', 'cf_direct_parent', 'npc_class', 'npc_superclass', 'npc_pathway', 'npc_isglycoside', 'cf_subclass'])

In [40]:
classes_result_dict = result['mibig_classes']['npc_isglycoside']
classes_result_dict

defaultdict(<function __main__.rec_dd()>,
            {'Polyketide': defaultdict(<function __main__.rec_dd()>,
                         {'0': 502, '1': 178}),
             'Other': defaultdict(<function __main__.rec_dd()>,
                         {'1': 50, '0': 345}),
             'Saccharide': defaultdict(<function __main__.rec_dd()>,
                         {'1': 122, '0': 19}),
             'Terpene': defaultdict(<function __main__.rec_dd()>,
                         {'0': 157, '1': 21}),
             'PKS-NRP_Hybrids': defaultdict(<function __main__.rec_dd()>,
                         {'0': 298, '1': 31}),
             'NRP': defaultdict(<function __main__.rec_dd()>,
                         {'0': 387, '1': 34}),
             'RiPP': defaultdict(<function __main__.rec_dd()>,
                         {'0': 113, '1': 2})})

In [57]:
classes_result_dict = result['mibig_classes']['cf_superclass']
result_fig_title = "MIBiG classes vs ClassyFire superclass"
result_fig = make_class_sankey_plot(classes_result_dict, result_fig_title)
# result_fig.write_html(os.path.join(out_base, '_'.join(result_fig_title.split(' '))) + ".html")
result_fig.update_layout(
#     autosize=False,
    width=700,
    height=700,
    font_size=12.2
)
result_fig.write_image(os.path.join(out_base, '_'.join(result_fig_title.split(' '))) + ".svg")
result_fig

In [59]:
# need to do some naming cleanup to make this plot visible
from copy import deepcopy
# only select results with counts >5
cutoff = 5
as_classes_cutoff = deepcopy(result['as_classes']['cf_class'])
for key, inner_dict in as_classes_cutoff.items():
    inner_copy = deepcopy(list(inner_dict.items()))
    for inner_key, value in inner_copy:
        if value < cutoff:
            del as_classes_cutoff[key][inner_key]
cur_fig_title = "antiSMASH BGC classes vs CF class cutoff 5"
fig_result = make_class_sankey_plot(as_classes_cutoff, cur_fig_title)
fig_result.update_layout(
#     autosize=False,
    width=800,
    height=800,
    font_size=12.2
)
fig_result.write_image(os.path.join(out_base, '_'.join(cur_fig_title.split(' '))) + ".svg")
fig_result

In [51]:
# need to do some naming cleanup to make this plot visible
from copy import deepcopy
# only select results with counts >5
cutoff = 5
as_classes_cutoff = deepcopy(result['as_classes']['npc_superclass'])
for key, inner_dict in as_classes_cutoff.items():
    inner_copy = deepcopy(list(inner_dict.items()))
    for inner_key, value in inner_copy:
        if value < cutoff:
            del as_classes_cutoff[key][inner_key]
fig_result = make_class_sankey_plot(as_classes_cutoff, "antiSMASH BGC classes vs NPC superclass cutoff 5")
fig_result.update_layout(
#     autosize=False,
    width=800,
    height=800,
    font_size=12.2
)
fig_result

In [43]:
result['as_classes']['npc_pathway']

defaultdict(<function __main__.rec_dd()>,
            {'T1PKS': defaultdict(<function __main__.rec_dd()>,
                         {'Polyketides': 474,
                          'Shikimates and Phenylpropanoids': 18,
                          'Alkaloids': 104,
                          'Carbohydrates': 11,
                          'Amino acids and Peptides': 204,
                          'Terpenoids': 23,
                          'Fatty acids': 17}),
             'NRPS-like': defaultdict(<function __main__.rec_dd()>,
                         {'Polyketides': 68,
                          'Alkaloids': 32,
                          'Shikimates and Phenylpropanoids': 11,
                          'Amino acids and Peptides': 8,
                          'Carbohydrates': 7,
                          'Fatty acids': 9,
                          'Terpenoids': 4}),
             'oligosaccharide': defaultdict(<function __main__.rec_dd()>,
                         {'Carbohydrates': 2,
         

In [None]:
from copy import deepcopy
as_npc_pathway = deepcopy(result['as_classes']['npc_pathway'])
as_npc_pathway_sub = {key:val for key, val in as_npc_pathway.items()\
                      if any([test in key.lower() for test in ['nrp', 'pks', 'polyketide']])}
as_npc_pathway_sub

In [None]:
as_npc_pathway_sub_title = "antiSMASH PKS-NRPS classes vs NPClassifier pathways"
as_npc_pathway_sub_fig_result = make_class_sankey_plot(
    as_npc_pathway_sub, as_npc_pathway_sub_title)
as_npc_pathway_sub_fig_result.write_html(os.path.join(out_base, '_'.join(as_npc_pathway_sub_title.split(' '))) + ".html")
as_npc_pathway_sub_fig_result

In [None]:
from copy import deepcopy
as_cf_superclass = deepcopy(result['as_classes']['cf_superclass'])
as_cf_superclass_sub = {key:val for key, val in as_cf_superclass.items()\
                      if any([test in key.lower() for test in ['nrp', 'pks', 'polyketide']])}
as_cf_superclass_sub

In [None]:
as_cf_superclass_sub_title = "antiSMASH PKS-NRPS classes vs ClassyFire superclasses"
as_cf_superclass_sub_fig_result = make_class_sankey_plot(
    as_cf_superclass_sub, as_cf_superclass_sub_title)
as_cf_superclass_sub_fig_result.write_html(os.path.join(out_base, '_'.join(as_cf_superclass_sub_title.split(' '))) + ".html")
as_cf_superclass_sub_fig_result

## Get fractions per pairing

In [None]:
cur_class_dict = deepcopy(result['mibig_classes']['cf_superclass'])
frac_class_dict = {}
for key, inner_dict in cur_class_dict.items():
    inner_tups = list(inner_dict.items())
    inner_total = 0
    frac_class_dict[key] = {}
    for inner_key, val in inner_tups:
        inner_total += val
    for inner_key, val in inner_tups:
        frac_class_dict[key][inner_key] = round(val/inner_total, 3)
frac_class_dict