In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import re 
from utils import * 
import os 
from src.ggkbase import * 
import glob
import matplotlib as mpl 
from src.files import BLASTFileJSON, FASTAFile, InterProScanFileTSV, GenBankFile, fasta_get_genome_size
import subprocess
import shutil
import networkx as nx

%load_ext autoreload
%autoreload 2

In [2]:

# Want to evaluate the likelihood of other potential hosts, so look for all other Methanoperedens in the bioreactors.
# Bins do not seem to be labeled consistently in the coassembly, so a bin is selected if either its
# taxonomy string or its index value (the genome ID, as used against the FastANI genome IDs later) matches.

# Sort the paths so the concatenated row order does not depend on the file system's listing order.
organism_info_dfs = [load_organism_info(path) for path in sorted(glob.glob('../data/ggkbase/*organism_info.tsv'))]
organism_info_df = pd.concat(organism_info_dfs)

# na=False makes a missing taxonomy (or ID) count as "no match" instead of putting NaN into the boolean mask.
is_mp = organism_info_df.taxonomy.str.contains('Methanoperedens', na=False) | organism_info_df.index.str.contains('Methanoperedens', na=False)
mp_organism_info_df = organism_info_df[is_mp].copy()
bb_organism_info_df = organism_info_df[organism_info_df.index.str.contains('Black_Borg', na=False)].copy()


In [3]:
def run_fastani(input_dir:str='../data/ggkbase/contigs', output_dir:str='../data/ggkbase'):
    """Run an all-vs-all fastANI comparison over every file in input_dir.

    A temporary file listing the input paths is written to ./fastani_input.txt and passed to fastANI
    as both the query list (--ql) and the reference list (--rl). Results are written to
    <output_dir>/fastani.tsv. The temporary list file is removed whether or not fastANI succeeds.

    :param input_dir: Directory whose entries (all of them, matched with '*') are compared.
    :param output_dir: Directory in which fastani.tsv is written.
    :raises subprocess.CalledProcessError: If fastANI exits with a non-zero status.
    :raises FileNotFoundError: If the fastANI executable is not on the PATH.
    """
    input_list_path = os.path.join('.', 'fastani_input.txt')
    output_path = os.path.join(output_dir, 'fastani.tsv')
    # Sorted so the list file (and therefore the order of comparisons) is reproducible between runs.
    input_paths = sorted(glob.glob(os.path.join(input_dir, '*')))
    try:
        with open(input_list_path, 'w') as f:
            f.write('\n'.join(input_paths))
        # Pass an argument list instead of a shell string, so paths containing spaces or shell
        # metacharacters reach fastANI unchanged.
        cmd = ['fastANI', '--ql', input_list_path, '--rl', input_list_path, '-o', output_path]
        subprocess.run(cmd, check=True)
    finally:
        # Clean up even when fastANI fails; otherwise the list file is left behind in the working directory.
        if os.path.exists(input_list_path):
            os.remove(input_list_path)
# run_fastani()

def load_fastani(path:str='../data/ggkbase/fastani.tsv'):
    """Read a header-less, tab-separated fastANI output file into a DataFrame.

    The five columns are named query_genome_path, ref_genome_path, ani, n_aligned and n_fragments
    (presumably fastANI's query, reference, ANI, mapped-fragment count and total-fragment count;
    confirm against the fastANI documentation). Two columns are then added, query_genome_id and
    ref_genome_id, each being the base name of the corresponding path with '.contigs.fa' removed.

    :param path: Location of the fastANI output table.
    :return: The table with the two genome ID columns appended.
    """
    def genome_id_from(genome_path):
        # Strip the directory and the contigs suffix, leaving just the genome ID.
        return os.path.basename(genome_path).replace('.contigs.fa', '')

    column_names = ['query_genome_path', 'ref_genome_path', 'ani', 'n_aligned', 'n_fragments']
    table = pd.read_csv(path, sep='\t', names=column_names)
    for side in ('query', 'ref'):
        table[f'{side}_genome_id'] = table[f'{side}_genome_path'].apply(genome_id_from)
    return table

In [4]:
# NOTE: this cell overwrites mp_organism_info_df (and resets its index), so it is not idempotent.
# Re-run the cell that builds mp_organism_info_df before re-running this one.

# Genomes whose pairwise ANI exceeds this value are grouped into the same strain.
threshold = 99
fastani_df = load_fastani()
# Keep only hits above the threshold whose query genome is one of the Methanoperedens bins.
# The reference genome is deliberately not filtered.
is_above_threshold = fastani_df.ani > threshold
is_mp_query = fastani_df.query_genome_id.isin(mp_organism_info_df.index.values)
fastani_df = fastani_df[is_above_threshold & is_mp_query].copy()

# Each remaining hit is an edge; every connected component of the graph is one strain.
edges = list(zip(fastani_df.query_genome_id, fastani_df.ref_genome_id))
graph = nx.Graph(edges)
# Add every Methanoperedens bin as a node, so a bin with no hit in the filtered table forms its own
# single-member strain. Without this it would get a NaN strain_id, and drop_duplicates (which treats
# NaN values as equal) would collapse all such bins into a single row.
graph.add_nodes_from(mp_organism_info_df.index)
strains = list(nx.connected_components(graph))
print('Num. separate Methanoperedens strains:', len(strains))

strain_ids = {genome_id:f'mp_{i + 1}' for i, genome_ids in enumerate(strains) for genome_id in genome_ids}

mp_organism_info_df['strain_id'] = mp_organism_info_df.index.map(strain_ids)
# Use the biggest assembly as the representative. These are the ones used to create the id_to_ggkbase_name_map.
mp_organism_info_df = mp_organism_info_df.sort_values('bin_length', ascending=False).drop_duplicates('strain_id')
mp_organism_info_df = mp_organism_info_df.reset_index()

Num. separate Methanoperedens strains: 5


In [5]:

# Copy the contigs and GenBank file of every genome in id_to_ggkbase_name_map into ../data/data,
# renaming each from its ggkbase name to its ID.
# NOTE(review): id_to_ggkbase_name_map is not defined in the visible cells; presumably it comes from
# one of the star imports at the top of the notebook. Confirm.
copy_dst_dir = '../data/data'
os.makedirs(copy_dst_dir, exist_ok=True)  # A fresh checkout may not have the destination directory yet.

for id_, ggkbase_name in id_to_ggkbase_name_map.items():
    # shutil.copy instead of shelling out to cp: no shell quoting issues with unusual names, and a
    # missing source file raises immediately.
    shutil.copy(os.path.join('../data/ggkbase/contigs', f'{ggkbase_name}.contigs.fa'), os.path.join(copy_dst_dir, f'{id_}.fn'))
    shutil.copy(os.path.join('../data/ggkbase/genbank', f'{ggkbase_name}.gbk'), os.path.join(copy_dst_dir, f'{id_}.gbk'))
