In [45]:
from pgtools import panaroo_parser
from pgtools.gff_parser import parse_gff, parse_GFFs_dir, Pangenome_Gffs
from pgtools.gfa_parser import parse_gfa1
from pgtools.maf_parser import parse_maf, MAF
from pgtools.utils import intersection_len, contains
from pgtools.pangenome import Pangenome
import os
import argparse
from Bio.Seq import Seq

In [46]:
# Root of the test dataset. Defaults to the original local location; set the
# PGTOOLS_DATA_DIR environment variable to run the notebook on another machine.
data_root = os.environ.get(
    "PGTOOLS_DATA_DIR",
    "/home/pampuch/studia/magisterka/test_data/klebsiella_subset_old",
)
# NOTE: despite its name, `maf_dir` is the path of a single .maf file.
maf_dir = os.path.join(data_root, "panaroo_new.maf")
# Directory holding the GFF files; the trailing separator is kept as before.
gff_dir = os.path.join(data_root, "gff", "")

In [47]:
# Parse the MAF file and keep only its first 30 sequence collections, so the
# exploratory cells below operate on a small subset.
pg_maf = MAF(list(parse_maf(maf_dir).seq_collections)[:30])

In [8]:
# Run soft-core detection on the truncated MAF model. The following cell reads
# `soft_core` from every sequence collection (presumably set by this call —
# confirm in pgtools).
pg_maf.detect_soft_core()

In [9]:
# Print the soft-core flag of each retained sequence collection, one per line.
for collection in pg_maf.seq_collections:
    print(collection.soft_core)

False
False
True
True
False
False
True
False
True
True
False
True
False
True
False
False
True
True
False
True
False
False
False
True
True
False
False
True
False
True


In [10]:
# Map annotations from the GFF directory onto the MAF model. Later cells read
# `mapped_annotations` from each sequence (presumably populated here — confirm
# in pgtools).
pg_maf.map_to_gff(gff_dir)

In [33]:
def annotations_to_csv(pg, gff_dir, csv_name="annotations_summary.csv"):
    """Write one CSV row per sequence listing the annotations mapped to it.

    Calls ``pg.detect_soft_core()`` and ``pg.map_to_gff(gff_dir)``, then writes
    a header line followed by one row for every sequence of every collection
    in ``pg.seq_collections``. Annotation ids of a sequence are joined with
    ``;`` in the last column.

    Args:
        pg: Object exposing ``detect_soft_core()``, ``map_to_gff(gff_dir)`` and
            an iterable ``seq_collections``. Each collection must support
            ``len()`` and provide ``id``, ``soft_core`` and ``sequences``; each
            sequence must provide ``seq_name`` and ``mapped_annotations``
            (items with a string ``annotation_id``).
        gff_dir: Passed unchanged to ``pg.map_to_gff``.
        csv_name: Path of the CSV file to create; an existing file is
            overwritten.

    Returns:
        None. The result is the file written to ``csv_name``.
    """
    import csv  # local import so the cell does not depend on another cell

    # Run the steps that may fail before opening the output, so a failure does
    # not leave an existing summary truncated to nothing.
    pg.detect_soft_core()
    pg.map_to_gff(gff_dir)

    # The context manager closes the file even if a row raises midway.
    with open(csv_name, "w") as res_csv:
        # csv.writer quotes fields containing commas or quotes; "\n" as the
        # terminator keeps the line endings of the previous hand-written output.
        writer = csv.writer(res_csv, lineterminator="\n")
        writer.writerow(
            ["cluster id", "cluster size", "core status", "seq name", "mapped annotations"]
        )
        for seq_coll in pg.seq_collections:
            cluster_id = seq_coll.id
            clust_size = len(seq_coll)
            core_status = seq_coll.soft_core
            for seq in seq_coll.sequences:
                # Annotation length is also an important aspect, but it can be
                # easily retrieved from the GFF file.
                annots = ";".join(ann.annotation_id for ann in seq.mapped_annotations)
                # str() keeps the textual form the f-string used to produce.
                writer.writerow(
                    map(str, (cluster_id, clust_size, core_status, seq.seq_name, annots))
                )

In [34]:
# Write the per-sequence annotation summary to the default file
# "annotations_summary.csv" (relative path, so it lands in the working directory).
# NOTE(review): annotations_to_csv itself calls detect_soft_core() and
# map_to_gff() again on pg_maf — confirm repeating them is harmless.
annotations_to_csv(pg_maf, gff_dir)

In [17]:
# For every collection print its id, then one line per sequence with the
# sequence name, the ids of its mapped annotations and the soft-core flag.
for collection in pg_maf.seq_collections:
    print(collection.id)
    for member in collection.sequences:
        annotation_ids = [annotation.annotation_id for annotation in member.mapped_annotations]
        print(member.seq_name, annotation_ids, collection.soft_core)

300
5150_1_3.contig00070 ['AOMCCKAJ_05284'] False
5151_2_6.contig00040 ['NALFBGJE_03847'] False
5151_6_6.contig00050 ['NJAAJKJA_05060'] False
5193_2_6.contig00088 ['FGIHKDBE_05317'] False
5197_7_4.contig00091 ['IMGDPOBN_05180'] False
5235_1_4.contig00216 [] False
5235_6_12.contig00298 ['LIDJBJFH_04408'] False
5299_1_3.contig00161 ['OBOFFKJP_04413'] False
5299_7_4.contig00195 ['MOOPCBGB_05234'] False
3592
5193_8_2.contig00030 ['MLKBEPLK_03626'] False
5299_7_4.contig00102 ['MOOPCBGB_04130'] False
5197_2_1.contig00031 ['PDEDMGPD_04033'] False
5197_7_4.contig00022 ['IMGDPOBN_03170', 'IMGDPOBN_03171', 'IMGDPOBN_03172'] False
5150_1_3.contig00006 ['AOMCCKAJ_01660', 'AOMCCKAJ_01661'] False
5197_7_5.contig00009 ['EHMCBADO_02394'] False
5235_2_1.contig00142 [] False
5150_2_2.contig00001 ['ILAAFBJJ_00065'] False
5235_5_12.contig00265 ['KKAGLBML_04142'] False
5150_3_5.contig00003 ['JGFEPBDE_00570'] False
5235_6_11.contig00051 ['NNHDGMCP_01684'] False
5151_2_6.contig00005 ['NALFBGJE_01010'] False


## Extracting core from Panaroo

In [36]:
# Panaroo output directory below the dataset root. The root defaults to the
# original local location and can be overridden with PGTOOLS_DATA_DIR; the
# trailing separator is kept as before.
panaroo_dir = os.path.join(
    os.environ.get(
        "PGTOOLS_DATA_DIR",
        "/home/pampuch/studia/magisterka/test_data/klebsiella_subset_old",
    ),
    "panaroo_out",
    "",
)
# Build the Panaroo-based model from the Panaroo output and the GFF directory.
panaroo_model = panaroo_parser.parse_panaroo_output(panaroo_dir, gff_dir)

In [43]:
def panaroo_annotations_to_csv(pg, gff_dir, csv_name="panaroo_annotations_summary.csv"):
    """Write one CSV row per sequence listing the CDS ids assigned to it.

    Calls ``pg.detect_soft_core()``, then writes a header line followed by one
    row for every sequence of every collection in ``pg.seq_collections``. The
    ids in ``seq.annotation_ids`` are joined with ``;`` in the last column.

    Args:
        pg: Object exposing ``detect_soft_core()`` and an iterable
            ``seq_collections``. Each collection must support ``len()`` and
            provide ``cluster_name``, ``soft_core`` and ``sequences``; each
            sequence must provide ``seq_name`` and ``annotation_ids`` (an
            iterable of strings).
        gff_dir: Not used by this function; kept so the signature matches
            existing callers.
        csv_name: Path of the CSV file to create; an existing file is
            overwritten.

    Returns:
        None. The result is the file written to ``csv_name``.
    """
    import csv  # local import so the cell does not depend on another cell

    # Run the step that may fail before opening the output, so a failure does
    # not leave an existing summary truncated to nothing.
    pg.detect_soft_core()

    # The context manager closes the file even if a row raises midway.
    with open(csv_name, "w") as res_csv:
        # csv.writer quotes fields containing commas or quotes; "\n" as the
        # terminator keeps the line endings of the previous hand-written output.
        writer = csv.writer(res_csv, lineterminator="\n")
        writer.writerow(["cluster name", "cluster size", "core status", "seq name", "CDS"])
        for seq_coll in pg.seq_collections:
            cluster_name = seq_coll.cluster_name
            clust_size = len(seq_coll)
            core_status = seq_coll.soft_core
            for seq in seq_coll.sequences:
                # Annotation length is also an important aspect, but it can be
                # easily retrieved from the GFF file.
                annots = ";".join(seq.annotation_ids)
                # str() keeps the textual form the f-string used to produce.
                writer.writerow(
                    map(str, (cluster_name, clust_size, core_status, seq.seq_name, annots))
                )

In [44]:
# Write the Panaroo-based summary to the default file
# "panaroo_annotations_summary.csv" (relative path, so it lands in the working
# directory). gff_dir is passed for signature compatibility only; the function
# does not use it.
panaroo_annotations_to_csv(panaroo_model, gff_dir)