In [1]:
from utils import * 
import os 
from src.ggkbase import * 
import glob
import matplotlib as mpl 
from src.files import BLASTFileJSON, FASTAFile, InterProScanFileTSV, GenBankFile, fasta_get_genome_size
import subprocess
import networkx as nx
import json

%load_ext autoreload
%autoreload 2

# Notebook for all the setup stuff. 

In [2]:
# Want to evaluate the likelihood of other potential hosts, so look for all other Methanoperedens in the bioreactors. 
# For some reason, it seems like things aren't appropriately labeled in the coassembly, so match on both the 
# taxonomy column and the index (presumably the ggKbase bin name -- TODO confirm against load_organism_info).
organism_info_paths = sorted(glob.glob('../data/ggkbase/*organism_info.tsv')) # Sorted so the row order is reproducible across machines.
if len(organism_info_paths) == 0:
    # pd.concat fails with an opaque "No objects to concatenate" error on an empty list, so fail early with a clear message.
    raise FileNotFoundError('No *organism_info.tsv files found in ../data/ggkbase')
organism_info_df = pd.concat([load_organism_info(path) for path in organism_info_paths])

# Use na=False so entries with a missing taxonomy count as non-matches, instead of propagating NaN into the boolean mask.
is_methanoperedens = organism_info_df.taxonomy.str.contains('Methanoperedens', na=False) | organism_info_df.index.str.contains('Methanoperedens', na=False)
mp_organism_info_df = organism_info_df[is_methanoperedens].copy()


In [3]:
def run_fastani(input_dir:str='../data/ggkbase/contigs', output_dir:str='../data/ggkbase'):
    """Run an all-vs-all fastANI comparison of every file in input_dir.

    The same list of paths is passed as both the query list (--ql) and the reference list (--rl),
    and the results are written to <output_dir>/fastani.tsv. A temporary file listing the input
    paths is written to the current working directory and removed afterwards.

    :param input_dir: Directory containing the genome files to compare (every entry in it is used).
    :param output_dir: Directory in which fastani.tsv is written.
    :raises subprocess.CalledProcessError: If fastANI exits with a non-zero status.
    :raises FileNotFoundError: If the fastANI executable cannot be found.
    """
    input_list_path = os.path.join('.', 'fastani_input.txt')
    output_path = os.path.join(output_dir, 'fastani.tsv')
    input_paths = sorted(glob.glob(os.path.join(input_dir, '*'))) # Sorted so the comparison order is reproducible.
    try:
        with open(input_list_path, 'w') as f:
            f.write('\n'.join(input_paths))
        # Pass the arguments as a list (no shell), so paths containing spaces or shell metacharacters are handled safely.
        cmd = ['fastANI', '--ql', input_list_path, '--rl', input_list_path, '-o', output_path]
        subprocess.run(cmd, check=True)
    finally:
        # Always clean up the temporary path list, even if fastANI fails.
        if os.path.exists(input_list_path):
            os.remove(input_list_path)
    
# run_fastani()

def load_fastani(path:str='../data/ggkbase/fastani.tsv'):
    """Read a headerless, tab-separated fastANI output table into a DataFrame.

    Besides the five columns read from the file, a genome ID column is added for both the query
    and the reference; each ID is the file name of the genome path with '.contigs.fa' removed.

    :param path: Path to the fastANI output file.
    :return: DataFrame with one row per line of the file.
    """
    def _path_to_genome_id(genome_path):
        # Drop the directory, then the contigs suffix, leaving only the genome name.
        return os.path.basename(genome_path).replace('.contigs.fa', '')

    column_names = ['query_genome_path', 'ref_genome_path', 'ani', 'n_aligned', 'n_fragments']
    table = pd.read_csv(path, sep='\t', names=column_names)
    for role in ('query', 'ref'):
        table[f'{role}_genome_id'] = table[f'{role}_genome_path'].map(_path_to_genome_id)
    return table

In [4]:
threshold = 99 # ANI cutoff above which two assemblies are grouped into the same strain.
mp_genome_ids = mp_organism_info_df.index.tolist()

# Keep only the high-ANI comparisons that involve at least one Methanoperedens genome.
fastani_df = load_fastani()
above_threshold = fastani_df.ani > threshold
involves_mp_genome = fastani_df.query_genome_id.isin(mp_genome_ids) | fastani_df.ref_genome_id.isin(mp_genome_ids)
fastani_df = fastani_df[above_threshold & involves_mp_genome].copy()

# Each remaining comparison is an edge between two genomes; every connected component is treated as one strain.
edges = list(zip(fastani_df.query_genome_id, fastani_df.ref_genome_id))
graph = nx.Graph()
graph.add_edges_from(edges)
strains = list(nx.connected_components(graph))
print('Num. separate Methanoperedens strains:', len(strains))

# Number the strains from 1 in the order the components were found, and map every member genome to its strain.
strain_ids = {genome_id:f'mp_{strain_number}' for strain_number, genome_ids in enumerate(strains, start=1) for genome_id in genome_ids}

mp_organism_info_df['strain_id'] = mp_organism_info_df.index.map(strain_ids)
# Use the biggest assembly as the representative. These are the ones used to create the genome_id_to_ggkbase_id_map.
mp_organism_info_df = (
    mp_organism_info_df
    .sort_values('bin_length', ascending=False)
    .drop_duplicates('strain_id')
    .reset_index()
)

Num. separate Methanoperedens strains: 5


In [28]:
# Build a lookup from each genome_id to its ggkbase_id, using the genome_metadata.csv file in the working directory.
genome_id_to_ggkbase_id_map = (
    pd.read_csv('genome_metadata.csv')
    .set_index('genome_id')['ggkbase_id']
    .to_dict()
)

In [7]:
def get_contig_index(gene_id):
    """Return the text between the last '.' and the following '_' in a gene ID (e.g. 'mp_1.3_17' -> '3')."""
    after_last_dot = gene_id.rpartition('.')[2]
    return after_last_dot.partition('_')[0]

# Write a renamed copy of each genome's files to ../data/data, using the short genome IDs instead of the ggKbase IDs.
renamed_data_dir = '../data/data'
os.makedirs(renamed_data_dir, exist_ok=True) # Make sure the output directory exists before writing into it.

for genome_id, ggkbase_id in genome_id_to_ggkbase_id_map.items():
    fasta_file = FASTAFile.from_file(f'../data/ggkbase/contigs/{ggkbase_id}.contigs.fa')
    fasta_file.ids = [f'{genome_id}.{i + 1}' for i in range(len(fasta_file.ids))] # Rename the contigs by their 1-based position in the file.
    fasta_file.write(f'{renamed_data_dir}/{genome_id}.fn')

    genbank_path = f'../data/ggkbase/genbank/{ggkbase_id}.gbk'
    genbank_file = GenBankFile.from_file(genbank_path)
    genbank_file.df['gene_id'] = [f'{genome_id}.{gene_id}' for gene_id in genbank_file.df.gene_id] # Rename the gene IDs.
    # The contig IDs are derived from the already-renamed gene IDs, so this must run after the line above.
    # NOTE(review): assumes the contig index embedded in each gene ID matches the contig's position in the FASTA file -- TODO confirm.
    genbank_file.df['contig_id'] = [f'{genome_id}.{get_contig_index(gene_id)}' for gene_id in genbank_file.df.gene_id] # Rename the contig IDs.
    # Copy the original GenBank file unchanged. Arguments are passed as a list (no shell), so IDs with spaces or shell metacharacters are safe.
    subprocess.run(['cp', genbank_path, f'{renamed_data_dir}/{genome_id}.gbk'], check=True)
    genbank_file.to_gff(f'{renamed_data_dir}/{genome_id}.gff')
    genbank_file.to_fasta(f'{renamed_data_dir}/{genome_id}.fa')

In [14]:
def concatenate_files(pattern:str, output_path:str, header:str=''):
    """Concatenate every file matching a glob pattern into a single output file.

    :param pattern: Glob pattern selecting the input files.
    :param output_path: Path of the combined file. Any match with the same file name is skipped, so
        re-running does not append a previous combined file to itself.
    :param header: Optional text written once at the top of the output and removed from each input.
    """
    output_name = os.path.basename(output_path)
    # Sort so the concatenation order is reproducible (glob returns paths in arbitrary order).
    input_paths = sorted(path for path in glob.glob(pattern) if (os.path.basename(path) != output_name))
    with open(output_path, 'w') as f:
        f.write(header)
        for path in input_paths:
            with open(path, 'r') as _f:
                content = _f.read()
            if header:
                content = content.replace(header, '')
            f.write(content)
            # If an input lacks a trailing newline, add one so its last line is not fused with the next file's first line.
            if content and (not content.endswith('\n')):
                f.write('\n')

# Match on the full extension (with the dot) so unrelated files that merely end in "fn" or "fa" are not picked up.
concatenate_files('../data/data/*.fn', '../data/data/all.fn')
concatenate_files('../data/data/*.fa', '../data/data/all.fa')

gff_header = '##gff-version  3\n'
concatenate_files('../data/data/*.gff', '../data/data/all.gff', header=gff_header)

In [25]:
# NOTE(review): genome_id_df is not assigned in any cell visible here, so this cell fails on a fresh kernel
# unless it is defined elsewhere -- TODO confirm where it comes from (its displayed columns match genome_metadata.csv).
genome_id_df

Unnamed: 0,genome_id,ggkbase_id
0,mp_4,SR-VP_11_27_2022_S1_80cm_Methanoperedens_44_5
1,mp_1,SR-VP_05_06_2024_ck_bottom_Methanoperedens_44_47
2,mp_3,SR-VP_05_06_2024_N_top_Methanoperedens_44_14
3,mp_5,SR-VP_05_06_2024_N_top_Candidatus_Methanopered...
4,mp_2,SR-VP_05_06_2024_ck_bottom_Methanoperedens_41_16
5,jupiter_mini_borg_1,SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_...
6,jupiter_mini_borg_2,SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_...
7,jupiter_mini_borg_3,SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_...
8,jupiter_mini_borg_4,SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_...
9,jupiter_mini_borg_6,SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_...
