In [18]:
import pandas as pd 
import os 
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib.ticker as tick 
import seaborn as sns 
from utils import load_ncbi_genome_metadata
import src.download as download
import src.tools as tools
from src import get_genome_id
import glob
from tqdm import tqdm
from src.files import FASTAFile

%load_ext autoreload 
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Main dataset table; the first CSV column is used as the index.
dataset_df = pd.read_csv(
    '../data/dataset/dataset.csv',
    index_col=0,
)

# Genome metadata for all NCBI reference genomes, loaded from the genome and
# taxonomy metadata TSV files given below.
ncbi_genome_metadata_df = load_ncbi_genome_metadata(
    genome_metadata_path='../data/dataset/ncbi_genome_metadata.tsv',
    taxonomy_metadata_path='../data/dataset/ncbi_taxonomy_metadata.tsv',
)

In [3]:
# Collect the phyla represented in the dataset. Each lineage is a ';'-separated string and its
# second field is taken as the phylum (assumes the first field is a higher rank — TODO confirm).
# Only lineages containing ';' have a second field. na=False makes missing lineages count as
# "no ';'" so the mask never contains NaN (which would make the boolean indexing raise);
# regex=False matches ';' as a literal character.
has_multiple_ranks = dataset_df.lineage.str.contains(';', na=False, regex=False)
phyla_in_dataset = [lineage.split(';')[1] for lineage in dataset_df.lineage[has_multiple_ranks]]
# phyla_in_dataset = np.unique([phylum.replace('Candidatus ', '') for phylum in phyla_in_dataset]) 
# Drop the 'environmental samples' entry (presumably a placeholder rather than a real phylum);
# np.unique de-duplicates and sorts the remaining names.
phyla_in_dataset = np.unique([phylum for phylum in phyla_in_dataset if (phylum != 'environmental samples')])

print(len(phyla_in_dataset), 'phyla in the dataset.')
print(ncbi_genome_metadata_df.phylum.nunique(), 'phyla with NCBI reference genomes.')

83 phyla in the dataset.
56 phyla with NCBI reference genomes.


In [4]:
# For every NCBI phylum that does not appear in the dataset, keep a single reference genome:
# rows are ordered by descending checkm_completeness, then only the first row per phylum is kept.
phylum_missing_from_dataset = ~ncbi_genome_metadata_df.phylum.isin(phyla_in_dataset)
rare_taxa_df = (
    ncbi_genome_metadata_df[phylum_missing_from_dataset]
    .copy()
    .sort_values(by='checkm_completeness', ascending=False)
    .drop_duplicates(subset='phylum', keep='first')
)

In [None]:
# Download the 'genome' and 'gbff' files for each selected rare-taxon genome (identified by the
# index of rare_taxa_df) into their respective output directories.
ncbi = download.ncbi.NCBI()
try:
    ncbi.get_genomes(
        rare_taxa_df.index,
        include=['gbff', 'genome'],
        dirs={'genome': '../data/ncbi/genomes', 'gbff': '../data/ncbi/gbffs'},
    )
finally:
    # Always run cleanup, even if a download fails part-way through, so whatever the client
    # set up is not left behind (presumably temporary download artifacts — TODO confirm).
    ncbi.cleanup()

NCBI.get_genomes: Downloading data for GCF_021057185.1.: 100%|██████████| 20/20 [03:05<00:00,  9.29s/it]


In [17]:
# Run Prodigal on every file in the downloaded-genomes directory, writing one .faa file per
# genome and skipping genomes whose output file already exists.
prodigal = tools.Prodigal()

# Make sure the output directory exists before anything is written to it.
os.makedirs('../data/prodigal', exist_ok=True)

# glob returns paths in arbitrary order; sort them so the processing order is the same on every run.
genome_paths = sorted(glob.glob('../data/ncbi/genomes/*'))

for input_path in tqdm(genome_paths, desc='Running Prodigal on genomes for rare taxa.'):
    output_path = f'../data/prodigal/{get_genome_id(input_path)}.faa'
    # NOTE(review): only existence is checked, so a partial file left by an interrupted run
    # would be skipped here — verify outputs if a previous run was aborted.
    if not os.path.exists(output_path):
        prodigal.run(input_path, output_path=output_path)

Running Prodigal on genomes for rare taxa.:   0%|          | 0/20 [00:00<?, ?it/s]

Running Prodigal on genomes for rare taxa.: 100%|██████████| 20/20 [01:58<00:00,  5.94s/it]


In [6]:
# NOTE(review): this entire cell is disabled (commented-out) code and has no effect when run.
# As written, `figure` would draw one bar chart per `label` group of dataset_df, showing the
# percentage share of the `top_n` most frequent taxa at the given `level` (entries named 'none'
# are dropped after the percentages are computed).
# The panel titles are assigned by position, so they presumably rely on groupby('label')
# yielding the AntiFam group first and the SwissProt group second — TODO confirm before re-enabling.
# def figure(dataset_df:pd.DataFrame, level:str='genus', top_n:int=5):

#     fig, axes = plt.subplots(ncols=2, figsize=(10, 5))

#     for ax, (_, df) in zip(axes, dataset_df.groupby('label')):
#         ax_df = df.value_counts(level)
#         ax_df = ax_df / ax_df.sum() * 100
#         ax_df = ax_df[ax_df.index != 'none'].copy()
#         ax_df = ax_df.iloc[:top_n]

#         sns.barplot(data=ax_df, ax=ax, color='lightgray', edgecolor='black')

#         if (level == 'species'):
#             x_tick_labels = [f'{taxon.split()[0][0]}. {taxon.split()[-1]}' for taxon in ax_df.index]
#             ax.set_xticks(np.arange(len(ax_df)), labels=x_tick_labels, rotation=90, fontstyle='italic')
#         else:
#             ax.set_xticks(np.arange(len(ax_df)), labels=ax_df.index, rotation=90)
#         ax.yaxis.set_major_formatter(tick.PercentFormatter())
#         ax.text(0.5, 0.7, f'{ax_df.sum():.2f}% in top {top_n} {level}', transform=ax.transAxes)

#     axes[0].set_title('AntiFam')
#     axes[1].set_title('SwissProt')

#     fig.tight_layout()
#     plt.show()

# figure(dataset_df, level='phylum')