# Summary
Read the class info file for MiBIG entries, and report probability tables.

In [33]:
import os
import urllib
import glob
import json
import time
import numpy as np
import pandas as pd

In [2]:
mibig_folder = "/mnt/scratch/louwe015/mibig_json_2.0/"

base_path = "/mnt/scratch/louwe015/NPLinker/classifying/mibig_classifications/"
out_base = os.path.split(base_path[:-1])[0]
mibig_classes = os.path.join(out_base, "All_MIBiG_compounds_with_AS_BGC_CF_NPC_classes.txt")
print("mibig_classes exists is", os.path.isfile(mibig_classes))

mibig_classes exists is True


## Reading file
Making dict of {chem_id: [[bgc_classes], [chem_classes]]}, where bgc_classes is [class, subclass, AS_class], and chem_classes is the CF/NPC classes in order of input file

In [5]:
classes_dict = {}
with open(mibig_classes) as inf:
    header = inf.readline()
    print(header)
    for line in inf:
        elems = line.strip().split("\t")
        chem_id = elems.pop(0)
        class_base = elems.pop(0).split(',')
        classes = [cls.partition(':')[0] for cls in class_base]
        sub_classes = [cls for cls in class_base if cls.split(":")[1]]
        as_classes = elems.pop(0).split(',')

        bgc_classes = [classes, sub_classes, as_classes]        
        chem_classes = [chem_cls.split('; ') for chem_cls in elems[2:]]
        classes_dict[chem_id] = [bgc_classes, chem_classes]

print(classes_dict[chem_id])  # example
print(classes_dict['BGC0000020_maytansine'])
classes_dict['BGC0000001_abyssomicin C']

compound_name	class:subclass	AS_classes	smiles	inchi_key	kingdom	superclass	class	subclass	direct_parent	class_results	superclass_results	pathway_results	isglycoside

[[['Other'], [], ['phosphonate']], [[''], [''], [''], [''], [''], [''], [''], [''], ['0']]]
[[['Polyketide'], [], ['T1PKS', 'NRPS-like']], [['Organic compounds'], ['Phenylpropanoids and polyketides'], ['Macrolactams'], [''], ['Macrolactams'], ['Ansa macrolides'], ['Macrolides'], ['Polyketides'], ['0']]]


[[['Polyketide'], ['Polyketide:Other'], ['T1PKS']],
 [['Organic compounds'],
  ['Organoheterocyclic compounds'],
  ['Oxanes'],
  [''],
  ['Oxanes'],
  ['Spirotetronate macrolides'],
  ['Macrolides'],
  ['Polyketides'],
  ['0']]]

In [24]:
# creating legend from the header
s_h = header.strip().split('\t')
legend_bgc = ['mibig_class']+s_h[1:3]
legend_chem = s_h[5:]
print(legend_bgc, legend_chem)

['mibig_class', 'class:subclass', 'AS_classes'] ['kingdom', 'superclass', 'class', 'subclass', 'direct_parent', 'class_results', 'superclass_results', 'pathway_results', 'isglycoside']


## Do comparisons between bgc and chem classes

In [18]:
from collections import defaultdict
def rec_dd():
    """Initialises a recurring defaultdict"""
    return defaultdict(rec_dd)

In [25]:
# aggregate pairwise class matrices for all compounds
result = rec_dd()
for chem_id, classes in classes_dict.items():
    bgc_classes, chem_classes = classes
    
    for i, bgc_cat in enumerate(legend_bgc):
        bgc_class = bgc_classes[i]
        
        for j, chem_cat in enumerate(legend_chem):
            chem_class = chem_classes[j]
            
            for bgc_c in bgc_class:
                for chem_c in chem_class:
                    try:
                        result[bgc_cat][chem_cat][bgc_c][chem_c] += 1
                    except TypeError:
                        result[bgc_cat][chem_cat][bgc_c][chem_c] = 1

In [31]:
mibig_cls_npc_pathway = result['mibig_class']['pathway_results']  # MiBIG BGC class and NPC pathway
mibig_cls_npc_pathway

defaultdict(<function __main__.rec_dd()>,
            {'Polyketide': defaultdict(<function __main__.rec_dd()>,
                         {'Polyketides': 725,
                          '': 17,
                          'Shikimates and Phenylpropanoids': 26,
                          'Alkaloids': 130,
                          'Amino acids and Peptides': 211,
                          'Terpenoids': 37,
                          'Fatty acids': 26,
                          'Carbohydrates': 10}),
             'Other': defaultdict(<function __main__.rec_dd()>,
                         {'Polyketides': 35,
                          'Carbohydrates': 46,
                          'Amino acids and Peptides': 78,
                          'Alkaloids': 96,
                          'Shikimates and Phenylpropanoids': 21,
                          '': 9,
                          'Fatty acids': 19,
                          'Terpenoids': 5}),
             'Alkaloid': defaultdict(<function __main__.re

In [46]:
df_mibig_cls_npc_pathway = pd.DataFrame.from_dict(mibig_cls_npc_pathway, dtype=int).fillna(0)
df_mibig_cls_npc_pathway

Unnamed: 0,Polyketide,Other,Alkaloid,Saccharide,Terpene,NRP,RiPP
Polyketides,725,35,1.0,64.0,19.0,292,4.0
,17,9,0.0,7.0,0.0,15,2.0
Shikimates and Phenylpropanoids,26,21,0.0,3.0,1.0,15,0.0
Alkaloids,130,96,97.0,0.0,49.0,157,2.0
Amino acids and Peptides,211,78,12.0,9.0,2.0,507,113.0
Terpenoids,37,5,0.0,4.0,136.0,2,0.0
Fatty acids,26,19,0.0,0.0,0.0,14,0.0
Carbohydrates,10,46,0.0,60.0,1.0,9,0.0


In [48]:
df_mibig_cls_npc_pathway.style.background_gradient('Greens')

Unnamed: 0,Polyketide,Other,Alkaloid,Saccharide,Terpene,NRP,RiPP
Polyketides,725,35,1.0,64.0,19.0,292,4.0
,17,9,0.0,7.0,0.0,15,2.0
Shikimates and Phenylpropanoids,26,21,0.0,3.0,1.0,15,0.0
Alkaloids,130,96,97.0,0.0,49.0,157,2.0
Amino acids and Peptides,211,78,12.0,9.0,2.0,507,113.0
Terpenoids,37,5,0.0,4.0,136.0,2,0.0
Fatty acids,26,19,0.0,0.0,0.0,14,0.0
Carbohydrates,10,46,0.0,60.0,1.0,9,0.0


In [64]:
s = df_mibig_cls_npc_pathway.style.bar(color=['#5fba7d'], axis=0, align='zero')
# s.style.bar(color=['#5fba7d'], axis=1, align='zero')
s

Unnamed: 0,Polyketide,Other,Alkaloid,Saccharide,Terpene,NRP,RiPP
Polyketides,725,35,1.0,64.0,19.0,292,4.0
,17,9,0.0,7.0,0.0,15,2.0
Shikimates and Phenylpropanoids,26,21,0.0,3.0,1.0,15,0.0
Alkaloids,130,96,97.0,0.0,49.0,157,2.0
Amino acids and Peptides,211,78,12.0,9.0,2.0,507,113.0
Terpenoids,37,5,0.0,4.0,136.0,2,0.0
Fatty acids,26,19,0.0,0.0,0.0,14,0.0
Carbohydrates,10,46,0.0,60.0,1.0,9,0.0
