In [None]:
%%bash
pip install xlsxwriter

In [None]:
from pathlib import Path
import yaml
import pandas as pd
import networkx as nx

In [None]:
# Parse the notebook settings from config.yaml in the working directory.
# safe_load is used, so only plain YAML data types are constructed.
with Path("config.yaml").open("r") as config_file:
    notebook_configuration = yaml.safe_load(config_file)

# Echo the parsed configuration as the cell output.
notebook_configuration

In [None]:
# Root of the BGCFlow checkout, as given by the "bgcflow_dir" configuration key.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])

# Initial project identifiers; PROJECT_NAME is reassigned by the per-dataset
# cells further down in this notebook.
PROJECT_NAME, PROJECT_NAME2 = "TIGR03997_with_reference", "mq_saccharopolyspora"

# Folder receiving the supplementary workbooks; created (with parents) if it
# does not exist yet, left untouched if it does.
output_dir = Path("assets/supplementary_materials")
output_dir.mkdir(exist_ok=True, parents=True)

## Data S1
Data S1. Input and result tables related to the PEP on qc_saccharopolyspora 
- Tab 1: samples.csv listing 42 genomes as input for config file
- Tab 2: NCBI metadata  
- Tab 3: CheckM results on quality assessment 
- Tab 4: SeqFu results on quality assessment 
- Tab 5: GTDB-tk and GTDB results on taxonomic definition

In [None]:
# Workbook that will hold the Data S1 tables.
outfile = output_dir.joinpath("DATA_S1.xlsx")

In [None]:
# Data S1 is assembled from the qc_saccharopolyspora project.
PROJECT_NAME = "qc_saccharopolyspora"
PROJECT_CONFIG_DIR = bgcflow_dir / f"config/{PROJECT_NAME}"
REPORT_DIR = bgcflow_dir / f"data/processed/{PROJECT_NAME}"

# One DataFrame per workbook tab.
df_tab1 = pd.read_csv(PROJECT_CONFIG_DIR / "samples.csv")
df_tab2 = pd.read_csv(REPORT_DIR / "tables/df_ncbi_meta.csv")
df_tab3 = pd.read_csv(REPORT_DIR / "tables/df_checkm_stats.csv")
df_tab4 = pd.read_csv(REPORT_DIR / "tables/df_seqfu_stats.csv")
# Both taxonomy tables are indexed by genome id (the id column is kept as a
# regular column too) so rows can be looked up across the two frames.
df_tab5 = pd.read_csv(REPORT_DIR / "tables/df_gtdb_meta.csv").set_index("genome_id", drop=False)
df_tab5_gtdbtk = pd.read_csv(REPORT_DIR / "tables/gtdbtk.bac120.summary.tsv", sep="\t").set_index("user_genome", drop=False)

# Columns of df_tab5 filled, in order, from the ";"-separated GTDB-Tk
# classification string.
TAXONOMY_COLUMNS = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Organism']
# Rank prefix carried by genus values ("g__<name>").
GENUS_PREFIX = "g__"

for genome_id in df_tab5.index:
    # A bare "d__" domain means the row has no taxonomy yet: back-fill every
    # rank from the GTDB-Tk classification of the same genome.
    if df_tab5.loc[genome_id, 'Domain'] == "d__":
        # Split once per genome instead of once per rank.
        ranks = df_tab5_gtdbtk.loc[genome_id, "classification"].split(";")
        for position, column in enumerate(TAXONOMY_COLUMNS):
            df_tab5.loc[genome_id, column] = ranks[position]

    # No species-level assignment: label the organism "<Genus> sp.".
    if df_tab5.loc[genome_id, 'Organism'] == "s__":
        genus = df_tab5.loc[genome_id, 'Genus']
        # Remove the literal prefix only. str.strip('g__') would treat the
        # argument as a character set and also eat trailing "g"/"_" characters
        # of the genus name itself.
        if genus.startswith(GENUS_PREFIX):
            genus = genus[len(GENUS_PREFIX):]
        df_tab5.loc[genome_id, "Organism"] = f"{genus} sp."

    # Empty species field: reuse the last word of the organism name.
    if df_tab5.loc[genome_id, 'Species'] == "s__":
        df_tab5.loc[genome_id, "Species"] = f"{df_tab5.loc[genome_id, 'Organism'].split()[-1]}"

In [None]:
# Tables in tab order: samples, NCBI metadata, CheckM, SeqFu, GTDB taxonomy.
data_s1_tables = [df_tab1, df_tab2, df_tab3, df_tab4, df_tab5]

# The context manager saves and closes the workbook even if one of the
# exports raises, so no half-open file handle is left behind.
with pd.ExcelWriter(outfile, engine='xlsxwriter') as writer:
    for sheet_number, table in enumerate(data_s1_tables, start=1):
        # Row indexes are not exported; sheets are named Sheet1..Sheet5.
        table.to_excel(writer, index=False, sheet_name=f'Sheet{sheet_number}')

## Data S2
Data S2. Input and result tables related to the PEP on mq_saccharopolyspora
- Tab 1: samples.csv listing 26 genomes as input for config file
- Tab 2: MASH distance
- Tab 3: Prokka result
- Tab 4: Results of BGCs from antiSMASH
- Tab 5: Results of GCFs based on BiG-SCAPE using different cutoffs


In [None]:
# Workbook that will hold the Data S2 tables.
outfile = output_dir.joinpath("DATA_S2.xlsx")

# Data S2 is assembled from the mq_saccharopolyspora project.
PROJECT_NAME = "mq_saccharopolyspora"
PROJECT_CONFIG_DIR = bgcflow_dir.joinpath(f"config/{PROJECT_NAME}")
REPORT_DIR = bgcflow_dir.joinpath(f"data/processed/{PROJECT_NAME}")
# Folder with the BiG-SCAPE tables exported for Cytoscape.
bigscape_dir = REPORT_DIR.joinpath("bigscape/for_cytoscape_antismash_6.1.1")

In [None]:
def _first_match(directory, pattern):
    """Return the first file in ``directory`` whose name matches ``pattern``.

    Matches are sorted so the choice is reproducible when several files fit
    the glob pattern.

    Raises:
        FileNotFoundError: if no file matches, naming the pattern and folder
            instead of surfacing a bare IndexError.
    """
    matches = sorted(directory.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No file matching {pattern!r} found in {directory}")
    return matches[0]


df_tab1 = pd.read_csv(PROJECT_CONFIG_DIR / "samples.csv")
df_tab2 = pd.read_csv(REPORT_DIR / "mash/df_mash.csv")
# First CSV column becomes the index, which is then labelled "genome_id".
df_tab3 = pd.read_csv("assets/tables/Figure_S8_prokka.csv", index_col=0)
df_tab3.index.name = "genome_id"
df_tab4 = pd.read_csv(REPORT_DIR / "tables/df_regions_antismash_6.1.1.csv")
# BiG-SCAPE cluster tables at cutoffs 0.30 / 0.40 / 0.50 ...
df_tab5 = pd.read_csv(_first_match(bigscape_dir, "*df_clusters_0.30*"))
df_tab6 = pd.read_csv(_first_match(bigscape_dir, "*df_clusters_0.40*"))
df_tab7 = pd.read_csv(_first_match(bigscape_dir, "*df_clusters_0.50*"))
# ... and the matching network tables at the same cutoffs.
df_tab8 = pd.read_csv(_first_match(bigscape_dir, "*df_network_0.30*"))
df_tab9 = pd.read_csv(_first_match(bigscape_dir, "*df_network_0.40*"))
df_tab10 = pd.read_csv(_first_match(bigscape_dir, "*df_network_0.50*"))

In [None]:
# (table, export_index) pairs in sheet order. Only df_tab3 exports its index:
# that index was named "genome_id" when the table was loaded, and dropping it
# would leave the sheet without that column.
data_s2_tables = [
    (df_tab1, False),
    (df_tab2, False),
    (df_tab3, True),
    (df_tab4, False),
    (df_tab5, False),
    (df_tab6, False),
    (df_tab7, False),
    (df_tab8, False),
    (df_tab9, False),
    # The 0.50 network table was loaded but previously never written out.
    (df_tab10, False),
]

# The context manager saves and closes the workbook even if one of the
# exports raises.
with pd.ExcelWriter(outfile, engine='xlsxwriter') as writer:
    for sheet_number, (table, export_index) in enumerate(data_s2_tables, start=1):
        table.to_excel(writer, index=export_index, sheet_name=f'Sheet{sheet_number}')

## Data S3
Data S3. Results of BiG-FAM and ARTS database related to the PEP on mq_saccharopolyspora
- Tab 1: Hits against the BiG-FAM GCFs calculated using query BiG-SLICE rule
- Tab 2: List of BiG-FAM GCFs hits
- Tab 3: Hits against ARTS profiles
- Tab 4: Table with all nodes in the enriched network (Figure 5)
- Tab 5: Table with all edges represented in the enriched network (Figure 5)


In [None]:
# Workbook that will hold the Data S3 tables.
outfile = output_dir.joinpath("DATA_S3.xlsx")

# Data S3 is assembled from the mq_saccharopolyspora project.
PROJECT_NAME = "mq_saccharopolyspora"
PROJECT_CONFIG_DIR = bgcflow_dir.joinpath(f"config/{PROJECT_NAME}")
REPORT_DIR = bgcflow_dir.joinpath(f"data/processed/{PROJECT_NAME}")
# Folder with the BiG-SLICE query results.
bigfam_dir = REPORT_DIR.joinpath("bigslice/query_as_6.1.1")

In [None]:
# Tab 1: query network table from the BiG-SLICE query folder.
df_tab1 = pd.read_csv(bigfam_dir.joinpath("query_network.csv"))

# Tab 2: BiG-FAM model table; the first CSV column becomes the index and is
# labelled "gcf_id".
df_tab2 = pd.read_csv(
    "assets/tables/Figure_4_bigfam_models.csv", index_col=0
).rename_axis("gcf_id")

# Tab 3: ARTS hits table, indexed by its first CSV column.
df_tab3 = pd.read_csv("assets/tables/Figure_4_df_arts_hits.csv", index_col=0)

In [None]:
# Display the ARTS hits table (read from Figure_4_df_arts_hits.csv) for inspection.
df_tab3

In [None]:
# Parse the Figure 5 integrated network from its GraphML file. The returned
# graph is not bound to a name here; it is only shown as the cell output.
nx.read_graphml("assets/data/Figure_5_integrated_network_0.30.graphml")