In [2]:
import os
import re

import h5py
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics
from matplotlib.patches import Patch
from transformers import AutoTokenizer

from selenobot.files import FASTAFile
from selenobot.utils import download_ncbi_data

%load_ext autoreload
%autoreload 2

In [3]:
# Want to dig into how Prodigal might be truncating selenoproteins at the N-terminus (i.e. picking up the right half of the sequence)

# First figure out what taxa have known selenoproteins. Only the bacterial entries are kept.
metadata_sec_df = pd.read_csv('../data/uniprot_sec.csv', index_col=0)
metadata_sec_df = metadata_sec_df[metadata_sec_df.domain == 'Bacteria']

# Fraction of rows with a non-null value (mean of a boolean mask).
print('Fraction with assigned species:', metadata_sec_df.species.notnull().mean())  # Everything is assigned a species!
print('Fraction with assigned NCBI taxonomy IDs:', metadata_sec_df.ncbi_taxonomy_id.notnull().mean())  # Everything is assigned a taxonomy ID!
print('Number of unique species:', metadata_sec_df.species.nunique())  # Why is there not one taxonomy ID per species?
print('Number of unique families:', metadata_sec_df.family.nunique())
print('Number of unique taxonomy IDs:', metadata_sec_df.ncbi_taxonomy_id.nunique())
print('Number of selenoproteins:', len(metadata_sec_df))

# Perhaps the species label is a particular strain?

# Number of selenoproteins per NCBI taxonomy ID, sorted so the taxonomy IDs with the most selenoproteins come first.
# The sorted Series is assigned so that later cells can inspect it.
n_sec_per_tax_id = metadata_sec_df.groupby('ncbi_taxonomy_id').size().sort_values(ascending=False)

# NCBI says that the taxonomy IDs are for species... how could there be 900 different selenoproteins associated with a single species?
# Maybe they are different strains? The sequences seem different. I wonder how many were left after dereplication.

# How do we figure out if there is an NCBI reference genome?

genome_metadata_df = pd.read_csv('../data/genome_metadata_gtdb.csv', index_col=0)
print('Number of GTDB genomes:', len(genome_metadata_df))

# NOTE(review): the filter is on ncbi_refseq_category != 'na', which the printed messages describe as
# "with a RefSeq ID" -- confirm that the category column is the intended proxy.
has_refseq_category = genome_metadata_df.ncbi_refseq_category != 'na'
print('Number of GTDB genomes with a RefSeq ID:', has_refseq_category.sum())
genome_metadata_df = genome_metadata_df[has_refseq_category]

# Keep only the genomes whose taxonomy ID also appears among the bacterial selenoproteins.
sec_tax_ids = metadata_sec_df.ncbi_taxonomy_id.unique()
genome_metadata_df = genome_metadata_df[genome_metadata_df.ncbi_taxonomy_id.isin(sec_tax_ids)]
print('Number of GTDB genomes with a RefSeq ID and a selenoprotein:', len(genome_metadata_df))

Fraction with assigned species: 1.0
Fraction with assigned NCBI taxonomy IDs: 1.0
Number of unique species: 16412
Number of unique families: 702
Number of unique taxonomy IDs: 2894
Number of selenoproteins: 16450
Number of GTDB genomes: 62291
Number of GTDB genomes with a RefSeq ID: 10444
Number of GTDB genomes with a RefSeq ID and a selenoprotein: 732


In [6]:
# Disabled cleanup step, kept for reference. Do not re-enable without checking: it deletes files.
# If uncommented, it walks ../data/refseq/ref, derives a genome ID from each file name by stripping
# '_genomic.gbff', and for every genome ID that is not in genome_metadata_df.index removes both that
# file and ../data/refseq/genomes/<genome_id>_genomic.fna.
# for file_name in os.listdir('../data/refseq/ref'):
#     genome_id = file_name.replace('_genomic.gbff', '')
#     if genome_id not in genome_metadata_df.index:
#         print('Removing files for', genome_id)
#         os.remove(os.path.join('../data/refseq/ref', file_name))
#         os.remove(os.path.join('../data/refseq/genomes', f'{genome_id}_genomic.fna'))

In [8]:
# Download NCBI data for every genome left in genome_metadata_df.
# NOTE(review): dir_ is presumably the directory the downloads are written to -- confirm in selenobot.utils.
download_ncbi_data(
    genome_metadata_df,
    dir_='../data/refseq/',
)


download_ncbi_data: Downloading data for Alsobacter soli...:   2%|▏         | 15/732 [00:00<00:02, 288.69it/s] 

[Acp: cannot stat 'ncbi_dataset/data/GCF_013409135.1/genomic.gbff': No such file or directory


download_ncbi_data: Failed to download data for Pseudomonas_A kunmingensis_A.


download_ncbi_data: Downloading data for Paracoccus thiocyanatus...: 100%|██████████| 732/732 [00:14<00:00, 49.14it/s] 


In [6]:
# Generate FASTA files for KEGG annotation.
# NOTE(review): the full set is read from uniprot_metadata_sec.csv, while metadata_sec_df is loaded from
# uniprot_sec.csv in the exploration cell -- confirm that both file names are intended.

# Bacterial selenoproteins. Defined here so the cell does not depend on a variable left over in the kernel.
metadata_sec_bac_df = metadata_sec_df[metadata_sec_df.domain == 'Bacteria']

# Truncated variants: everything before the first 'U' in each sequence ('U' is selenocysteine in the
# one-letter amino-acid code). str.index raises ValueError if a sequence contains no 'U'.
# A '-' suffix on the ID distinguishes a truncated entry from its full-length original.
truncated_seqs = [seq[:seq.index('U')] for seq in metadata_sec_bac_df.seq]
truncated_ids = pd.Index([id_ + '-' for id_ in metadata_sec_bac_df.index], name='id')

# Output file name -> DataFrame to write to it.
fasta_files = {
    'uniprot_sec.faa': pd.read_csv('../data/uniprot_metadata_sec.csv', index_col=0),
    'uniprot_sec_bacteria.faa': metadata_sec_bac_df,
    # assign() returns a new DataFrame, so metadata_sec_bac_df itself is left untouched.
    'uniprot_sec_bacteria_truncated.faa': metadata_sec_bac_df.assign(seq=truncated_seqs).set_index(truncated_ids),
}

for file_name, df in fasta_files.items():
    path = os.path.join('../data', file_name)
    fasta_file = FASTAFile.from_df(df, add_description=False)
    fasta_file.write(path)


In [6]:
# Confusion matrix of the test-set predictions: rows are true labels, columns are predicted labels.
# NOTE(review): test_metadata_df is not defined in any visible cell -- it must be created before this cell runs.
confusion_matrix = sklearn.metrics.confusion_matrix(test_metadata_df.label.values, test_metadata_df.prediction.values)

# Create the figure in this cell so the plot does not rely on an `ax` left over from earlier kernel state.
fig, ax = plt.subplots()
image = ax.imshow(confusion_matrix, cmap='RdBu')

ax.set_xticks([0, 1])
ax.set_yticks([0, 1])
ax.set_xlabel('prediction')
ax.set_ylabel('label')

# imshow draws element [row, col] at x=col, y=row, so each count is written at (col, row).
for row in range(confusion_matrix.shape[0]):
    for col in range(confusion_matrix.shape[-1]):
        ax.text(col, row, confusion_matrix[row, col], ha='center', va='center', color='white')

fig.colorbar(image, ax=ax)
# Most of the erroneous predictions are false positives. 
plt.show()