In [2]:
from utils import * 
import os 
from src.ggkbase import * 
import glob
import matplotlib as mpl 
from src.files import BLASTFileJSON, FASTAFile, InterProScanFileTSV, GenBankFile, fasta_get_genome_size
import subprocess
import networkx as nx
import json

%load_ext autoreload
%autoreload 2

# Notebook for all the setup stuff. 

# SR-VP_05_06_2024_ck_bottom_Methanoperedens_41_16 is incomplete but important; it needs to be binned in other samples. 
# SR-VP_05_06_2024_ck_bottom_scaffold_71 is a possible misbinned ECE, so don't use as a seed sequence. 
# Strain seems like it might be most abundant in n_middle or ck_bottom 2025.

# Mysterious McrA is on this scaffold https://ggkbase.berkeley.edu/organisms/669853/contigs/1100394534. A bit worried that it didn't end up in any of the bins.  

In [3]:
# mp_x_contig_ids = subprocess.run('cat ../data/mp_x.fn | grep ">"', capture_output=True, shell=True, check=True).stdout.decode().split('\n')
# mp_x_contig_ids = [contig_id.replace('>', '').split()[0] for contig_id in mp_x_contig_ids if (len(contig_id) > 0)]
# mp_x_contig_ids = [contig_id for contig_id in mp_x_contig_ids if (contig_id not in blast_df.qseqid.unique())]

In [4]:

# Sum of the sequence lengths in the mp_x_n_top_2025 FASTA file, divided by 1e6 and shown as the cell output.
fasta_df = FASTAFile.from_file('../data/checkm/mp_x_n_top_2025.fn').to_df()
total_sequence_length = fasta_df.seq.map(len).sum()
total_sequence_length / 1e6

np.float64(1.436289)

In [5]:
# Downloaded pretty much every Methanoperedens genome under the SR-VP project.
# Genomes whose size (as returned by fasta_get_genome_size) is not strictly greater than this are discarded.
MIN_GENOME_SIZE = 1e6

all_mp_genome_paths = glob.glob('../data/ggkbase/contigs/methanoperedens/*')
all_mp_genome_sizes = [fasta_get_genome_size(path) for path in all_mp_genome_paths]

# Filter paths, sizes and IDs together so the three lists stay index-aligned after the size cut.
kept_genomes = [(path, size) for path, size in zip(all_mp_genome_paths, all_mp_genome_sizes) if (size > MIN_GENOME_SIZE)]
mp_genome_paths = [path for path, size in kept_genomes]
mp_genome_sizes = [size for path, size in kept_genomes]
mp_genome_ids = [os.path.basename(path).replace('.contigs.fa', '') for path in mp_genome_paths]
print('Num. Methanoperedens genomes meeting the minimum size requirement:', len(mp_genome_paths))

# WARNING: destructive. Every file in the directory that failed the size cut is deleted from disk.
kept_paths = set(mp_genome_paths)
for path in all_mp_genome_paths:
    if path not in kept_paths:
        os.remove(path)

Num. Methanoperedens genomes meeting the minimum size requirement: 94


In [6]:
# Genomes selected after running dRep with default parameters.
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them before numbering. Without the sort,
# the mp_N labels assigned below can differ between runs or machines.
# NOTE(review): sorting may renumber the mp_N labels relative to files generated by an earlier, unsorted run;
# regenerate anything downstream that is keyed on these labels.
drep_genome_paths = sorted(glob.glob('../data/ggkbase/drep/dereplicated_genomes/*'))
mp_genome_ids = [os.path.basename(path).replace('.contigs.fa', '') for path in drep_genome_paths]
print('Num. Methanoperedens strains:', len(mp_genome_ids))

# Short genome label -> ggKbase ID. Methanoperedens genomes are labelled mp_1, mp_2, ... in sorted path order.
genome_id_to_ggkbase_id_map = {f'mp_{i + 1}':genome_id for i, genome_id in enumerate(mp_genome_ids)}

# Manually curated labels for the remaining (non-dRep) genomes.
# NOTE(review): there is no 'jupiter_mini_borg_5' entry; confirm that the gap is intentional.
genome_id_to_ggkbase_id_map.update({
    'jupiter_mini_borg_1': 'SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_33_21',
    'jupiter_mini_borg_2': 'SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_32_5',
    'jupiter_mini_borg_3': 'SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_34_1246',
    'jupiter_mini_borg_4': 'SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_36_6',
    'jupiter_mini_borg_6': 'SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_35_3',
    'jupiter_mini_borg_7': 'SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_33_6',
    'jupiter_mini_borg_8': 'SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_31_4',
    'jupiter_mini_borg_9': 'SR-VP_05_06_2024_coassembly_Jupiter_mini-Borg_35_6',
    'saturn_mini_borg_1': 'SR-VP_05_06_2024_coassembly_Saturn_mini-Borg_35_7',
    'saturn_mini_borg_2': 'SR-VP_05_06_2024_coassembly_Saturn_mini-Borg_32_200',
    'saturn_mini_borg_3': 'SR-VP_05_06_2024_coassembly_Saturn_mini-Borg_33_3',
    'saturn_mini_borg_4': 'SR-VP_05_06_2024_coassembly_Saturn_mini-Borg_33_4',
    'unclassified_mini_borg': 'SR-VP_05_06_2024_coassembly_mini-Borg_reminscent_42_6-8',
    'unclassified_borg': 'SR-VP_05_06_2024_coassembly_new_Borg_34_11',
    'amethyst_borg': 'SR-VP_05_06_2024_coassembly_Amethyst_Borg_34_3',
    'oxblood_borg': 'SR-VP_05_06_2024_coassembly_Oxblood_Borg_33_20',
    'pink_borg': 'SR-VP_05_06_2024_coassembly_Pink_Borg_32_55',
    'purple_borg': 'SR-VP_05_06_2024_coassembly_Purple_Borg_33_3',
    'rose_borg': 'SR-VP_05_06_2024_coassembly_Rose_Borg_31_2',
    'vermilion_borg': 'SR-VP_05_06_2024_coassembly_Vermilion_Borg_34_8',
    'mercury_mini_borg': 'SR-VP_05_06_2024_coassembly_Mercury_mini-Borg_37_9',
    'saturn_mini_borg_like': 'SR-VP_05_06_2024_coassembly_Saturn_mini-Borg-like_32_7',
    'ruby_borg_related': 'SR-VP_05_06_2024_coassembly_Ruby-Borg-related_37_10',
    'black_borg': 'SR-VP_05_06_2024_coassembly_Black_Borg_32_272',
    'linear_ece_19kb': 'Final_SR-VP_05_06_2024_coassembly_19kb_linear_ECE_26_1334_complete',
})

# One row per genome: index 'genome_id', single column 'ggkbase_id'.
genome_metadata_df = pd.Series(genome_id_to_ggkbase_id_map, name='ggkbase_id').rename_axis('genome_id').to_frame()
# NOTE(review): the table is written to two locations ('../src/data' and './data'); confirm both are still needed.
genome_metadata_df.to_csv('../src/data/genome_metadata.csv')
genome_metadata_df.to_csv('./data/genome_metadata.csv')

Num. Methanoperedens strains: 20


In [7]:
# SR-VP_05_06_2024_ck_bottom_Black_host_Methanoperedens-related_44_24 
# SR-VP_05_06_2024_N_top_Candidatus_Methanoperedens_Black-host_type_44_27 

# FINAL_2-50MB_Methanoperedens_SR-VP_07_25_2022_A1_90cm_PACBIO-HIFI_44_8_complete

# Reverse lookup: ggKbase ID -> short genome label.
ggkbase_id_to_genome_id_map = dict(zip(genome_id_to_ggkbase_id_map.values(), genome_id_to_ggkbase_id_map.keys()))
ece_host_ggkbase_id = 'FINAL_2-50MB_Methanoperedens_SR-VP_07_25_2022_A1_90cm_PACBIO-HIFI_44_8_complete'
print('ECE host genome:', ggkbase_id_to_genome_id_map[ece_host_ggkbase_id])

ECE host genome: mp_18


In [8]:
def get_contig_index(gene_id:str) -> str:
    """Return the text before the first underscore in the last '.'-separated segment of gene_id."""
    return gene_id.split('.')[-1].split('_')[0]


# For every genome: rewrite its contigs, GenBank copy, GFF and gene FASTA under ../data/data/ using the short genome label.
for genome_id, ggkbase_id in genome_id_to_ggkbase_id_map.items():
    # Contig files live either directly under contigs/ or in the methanoperedens/ subdirectory.
    try:
        fasta_file = FASTAFile.from_file(f'../data/ggkbase/contigs/{ggkbase_id}.contigs.fa')
    except FileNotFoundError:
        fasta_file = FASTAFile.from_file(f'../data/ggkbase/contigs/methanoperedens/{ggkbase_id}.contigs.fa')

    fasta_file.ids = [f'{genome_id}.{i + 1}' for i in range(len(fasta_file.ids))] # Rename the contigs
    fasta_file.write(f'../data/data/{genome_id}.fn')

    genbank_path = f'../data/ggkbase/genbank/{ggkbase_id}.gbk'
    genbank_file = GenBankFile.from_file(genbank_path)
    genbank_file.df['gene_id'] = [f'{genome_id}.{gene_id}' for gene_id in genbank_file.df.gene_id] # Rename the gene IDs.
    # Derived from the already-prefixed gene IDs; the prefix does not affect the last '.'-separated segment.
    genbank_file.df['contig_id'] = [f'{genome_id}.{get_contig_index(gene_id)}' for gene_id in genbank_file.df.gene_id] # Rename the contig IDs.
    # Pass an argument list rather than a shell string so IDs with spaces or shell metacharacters are copied verbatim.
    subprocess.run(['cp', genbank_path, f'../data/data/{genome_id}.gbk'], check=True)
    genbank_file.to_gff(f'../data/data/{genome_id}.gff')
    genbank_file.to_fasta(f'../data/data/{genome_id}.fa')

In [9]:
def concatenate_files(pattern:str, output_path:str, header:str='') -> None:
    """Concatenate every file matching pattern into output_path.

    :param pattern: Glob pattern selecting the input files. A match whose basename equals that of output_path
        is skipped, so the combined file is never fed back into itself.
    :param output_path: Path of the combined file; it is overwritten.
    :param header: If non-empty, written once at the top of the output and removed from every input file.
    """
    output_name = os.path.basename(output_path)
    # Sorted so the order of records in the combined file does not depend on the filesystem's listing order.
    input_paths = sorted(path for path in glob.glob(pattern) if (os.path.basename(path) != output_name))

    with open(output_path, 'w') as f_out:
        f_out.write(header)
        for path in input_paths:
            with open(path, 'r') as f_in:
                content = f_in.read()
            if header:
                content = content.replace(header, '')
            f_out.write(content)
            # Guard against an input without a trailing newline gluing its last line to the next file's first line.
            if content and (not content.endswith('\n')):
                f_out.write('\n')


gff_header = '##gff-version  3\n'

concatenate_files('../data/data/*fn', '../data/data/all.fn')
concatenate_files('../data/data/*fa', '../data/data/all.fa')
concatenate_files('../data/data/*gff', '../data/data/all.gff', header=gff_header)

In [10]:
# def run_fastani(input_paths:list, output_dir:str='../data/ggkbase', fragment_length:int=1000, min_fraction:float=0.2):
#     input_list_path = os.path.join('.', 'fastani_input.txt')
#     output_path = os.path.join(output_dir, 'fastani.tsv')
#     with open(input_list_path, 'w') as f:
#         f.write('\n'.join(input_paths))
#     cmd = f'fastANI --ql {input_list_path} --rl {input_list_path} -o {output_path} --fragLen {fragment_length} --minFraction {min_fraction}'
#     subprocess.run(cmd, shell=True, check=True)
#     os.remove(input_list_path)
    
# # run_fastani(input_paths=mp_genome_paths)

# def load_fastani(path:str='../data/ggkbase/fastani.tsv'):
#     cols = ['query_genome_path', 'ref_genome_path', 'ani', 'n_aligned_fragments', 'n_fragments']
#     fastani_df = pd.read_csv(path, sep='\t', names=cols)
#     fastani_df['query_genome_id'] = fastani_df.query_genome_path.apply(lambda path : os.path.basename(path).replace('.contigs.fa', ''))
#     fastani_df['ref_genome_id'] = fastani_df.ref_genome_path.apply(lambda path : os.path.basename(path).replace('.contigs.fa', ''))
#     return fastani_df

In [11]:
# NOTE(review): `coverm_df` is not assigned anywhere in the visible part of this notebook, and the saved output of
# this cell is a NameError. Define/load it in an earlier cell or remove this cell so Restart & Run All succeeds.
coverm_df

NameError: name 'coverm_df' is not defined