In [None]:
import pandas as pd 
import numpy as np 
from Bio import SeqIO
import src.tools.download
from src.tools import MMSeqs
from src.dataset import Dataset
from src.files import FASTAFile, GBFFFile
from src.clusterer import Clusterer
from tqdm import tqdm
import os
import json 
from src import fillna 

%load_ext autoreload
%autoreload 2

# Because genomes are constantly being re-annotated, the GBFF files I downloaded a month or so ago are now not aligned with the 
# protein files I used to generate the most recent round of embeddings (05/23/2025). GBFF files were re-downloaded for 
# the Campylobacterota phylum on 05/25/2025. 

# ncbi = src.tools.download.NCBI()
# ncbi.get_genomes(genome_ids=genome_ids, include=['gbff'], dirs={'gbff':'../data/ncbi/gbffs/'})
# ncbi.cleanup()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# New training-dataset strategy: use the sequences directly from NCBI.
# Read the genome metadata table (first CSV column becomes the index) and collect the index
# labels of every row whose phylum is Campylobacterota.
genome_metadata_df = pd.read_csv('../data/genome_metadata.csv', index_col=0)
genome_ids = genome_metadata_df.loc[genome_metadata_df['phylum'].eq('Campylobacterota')].index.values


In [None]:
# Want to build the dataset for Campylobacterota, but filter out the "suspect" sequences, i.e. those which are both hypothetical
# and have only ab initio evidence (i.e. no evidence of conservation). 

def is_hypothetical(df):
    """Return a boolean mask of rows whose `product` is exactly 'hypothetical protein'."""
    # Bracket access is required here: `df.product` would resolve to the DataFrame.product method.
    return df['product'].eq('hypothetical protein')


def is_ab_initio(df):
    """Return a boolean mask of rows whose `evidence_type` is exactly 'ab initio prediction'."""
    return df['evidence_type'].eq('ab initio prediction')


def is_suspect(df):
    """Return a boolean mask of rows that are both hypothetical and ab initio predictions.

    Rows with a missing product or evidence type compare unequal to the target strings, so
    they come out False (original note: this will be False for intergenic sequences).
    """
    return is_hypothetical(df) & is_ab_initio(df)

# Build one annotated protein table per genome, then concatenate them. The per-genome tables are
# collected in their own list so that `campylobacterota_df` only ever refers to a DataFrame.
genome_dfs = list()
for genome_id in tqdm(genome_ids, desc='Building Campylobacterota dataset.'):
    protein_path = f'../data/ncbi/proteins/{genome_id}_protein.faa'
    gbff_path = f'../data/ncbi/gbffs/{genome_id}_genomic.gbff'

    # Keep only the CDS features that are not flagged as pseudo.
    gbff_df = GBFFFile(gbff_path).to_df()
    gbff_df = gbff_df[(gbff_df.feature == 'CDS') & (~gbff_df.pseudo)].copy()
    # The same protein ID can appear at several coordinates within one GBFF file. Record how many
    # times each ID occurs, then keep only the first occurrence of each ID.
    copy_numbers = gbff_df.protein_id.value_counts()
    gbff_df['copy_number'] = gbff_df.protein_id.map(copy_numbers)
    gbff_df = gbff_df.drop_duplicates('protein_id').copy()
    gbff_df = gbff_df.drop(columns=['seq']) # Use the sequences from the protein DataFrame, just to make sure everything is equal. 
    gbff_df = gbff_df.set_index('protein_id')
    gbff_df.index.name = 'id'

    protein_df = FASTAFile(path=protein_path).to_df(prodigal_output=False)
    protein_df = protein_df.drop(columns=['description'])

    # Consistency checks between the two files. Each message names the genome so that a failure
    # can be traced back to the specific pair of files.
    assert len(protein_df) == len(gbff_df), f'Expected the number of unique non-pseudo CDS protein IDs in the GBFF file ({len(gbff_df)}) to match the number of entries in the FASTA file ({len(protein_df)}) for genome {genome_id}.'
    assert np.all(np.sort(protein_df.index) == np.sort(gbff_df.index)), f'Expected the protein IDs in the GBFF file to match the IDs in the FASTA file for genome {genome_id}.'
    assert protein_df.index.is_unique and gbff_df.index.is_unique, f'Expected the indices of both DataFrames to be unique for genome {genome_id}.'

    # Join sequences and annotations on the shared ID index and tag every row with its genome.
    genome_dfs.append(protein_df.merge(gbff_df, left_index=True, right_index=True).assign(genome_id=genome_id))

campylobacterota_df = pd.concat(genome_dfs)

# Flag the rows that is_suspect marks, report how many there are, keep the rest,
# and write the filtered table to CSV.
mask = is_suspect(campylobacterota_df)
print(f'Removing {mask.sum()} suspect sequences from the Campylobacterota dataset.')
campylobacterota_df = campylobacterota_df.loc[~mask].copy()

campylobacterota_df.to_csv('../data/campylobacterota.csv')

Building Campylobacterota dataset.: 100%|██████████| 197/197 [06:16<00:00,  1.91s/it]


Removing 44475 suspect sequences from the Campylobacterota dataset.
Removing 341 sequences exceeding the maximum length from the Campylobacterota dataset.


In [78]:
# Load the Campylobacterota and AntiFam tables from CSV; the first column of each file
# is used as the DataFrame index.
campylobacterota_df = pd.read_csv('../data/campylobacterota.csv', index_col=0)
antifam_df = pd.read_csv('../data/antifam.csv', index_col=0)

def is_bacterial(df):
    """Return a boolean NumPy array marking rows whose lineage contains the token '2'.

    Each value of `df.lineage` is split on single spaces and the row is flagged when one of
    the resulting tokens is exactly '2' (presumably the NCBI taxonomy ID of Bacteria -- TODO confirm).
    Every lineage value must be a string; a missing value would fail on `.split`.

    The dtype is forced to bool so that an empty frame still yields an array that can be
    inverted with `~`. Without it NumPy builds an empty float64 array, and `~` raises TypeError.
    """
    return np.array(['2' in lineage.split(' ') for lineage in df.lineage], dtype=bool)

# Flag the AntiFam rows that is_bacterial does not mark, report the count, and keep the rest.
mask = ~is_bacterial(antifam_df)
print(f'Removing {mask.sum()} non-bacterial sequences from the AntiFam dataset.')
antifam_df = antifam_df.loc[~mask].copy()


Removing 9789 non-bacterial sequences from the AntiFam dataset.


In [205]:
# Tag each source before combining: Campylobacterota rows get label 1 and use their genome ID as
# `library_entry_name`; AntiFam rows get label 0 and the fixed name 'antifam'.
campylobacterota_df['library_entry_name'] = campylobacterota_df.genome_id 
campylobacterota_df['label'] = 1

antifam_df['library_entry_name'] = 'antifam'
antifam_df['label'] = 0

# AntiFam rows are placed first in the combined frame.
dataset_df = pd.concat([antifam_df, campylobacterota_df])
dataset_df = fillna(dataset_df, rules={str:'none', bool:False, int:0, float:0})

max_length = 2000 # Upper length bound is non-inclusive. 
mask = (dataset_df.seq.apply(len) >= max_length)
# The filter runs on the combined frame, so the reported count covers both AntiFam and Campylobacterota rows.
print(f'Removing {mask.sum()} sequences exceeding the maximum length from the combined dataset.')
dataset_df = dataset_df[~mask].copy()

def check_duplicate_ids_have_identical_sequences(dataset_df):
    """Assert that rows sharing an index ID also share the same sequence.

    Every index label that occurs more than once in `dataset_df` is checked: all of its rows
    must carry the same value in the `seq` column.

    The check is a single grouped pass over the repeated rows rather than one full scan of
    the index per repeated ID, so it stays fast on large frames.

    Raises:
        AssertionError: if any repeated ID maps to more than one distinct sequence. The
            message names the first such ID in order of appearance.
    """
    is_repeated = dataset_df.index.duplicated(keep=False)
    if not is_repeated.any():
        return # No ID occurs more than once, so there is nothing to compare.

    # Count the distinct sequences per repeated ID; sort=False keeps IDs in order of appearance.
    n_distinct = dataset_df.seq[is_repeated].groupby(level=0, sort=False).nunique(dropna=False)
    conflicting_ids = n_distinct.index[n_distinct > 1]
    assert len(conflicting_ids) == 0, f'check_duplicate_ids_have_identical_sequences: Sequences with ID {conflicting_ids[0]} are not equal.'

# check_duplicate_ids_have_identical_sequences(dataset_df)

# Drop repeated sequences, keeping the first occurrence of each. The frame was concatenated with the
# AntiFam rows first, so a sequence present in both sources keeps its AntiFam row.
mask = dataset_df.seq.duplicated(keep='first')
# The deduplication runs on the combined frame, so the count is not specific to Campylobacterota.
print(f'Removing {mask.sum()} duplicate sequences from the combined dataset')
dataset_df = dataset_df[~mask].copy()

dataset_df.to_csv('../data/dataset.csv')


Removing 16 sequences exceeding the maximum length from the Campylobacterota dataset.
Removing 9522 duplicate sequences from the Campylobacterota dataset


In [207]:
# Draw a 5000-row random sample of the dataset (seeded, so the draw is repeatable) and save it to CSV.
dataset_subset_df = dataset_df.sample(n=5000, random_state=42)
dataset_subset_df.to_csv('../data/dataset_subset.csv')

In [169]:
# What is a good estimate for the number of clusters? Maybe use 50 percent sequence similarity?
mmseqs = MMSeqs()
cluster_df = mmseqs.cluster(dataset_df, job_name='cluster', output_dir='../data', sequence_identity=0.5, overwrite=False)
mmseqs.cleanup()

# Number of rows carrying each MMseqs cluster label. The built-in size() replaces apply(len, ...),
# which ran a Python-level call per group and needed the `include_groups` keyword.
cluster_sizes = cluster_df.groupby('mmseqs_cluster_label').size()

In [100]:
# Look up each sequence's MMseqs cluster label by its ID, then record how many distinct values
# of `label` occur within that sequence's cluster.
# `cluster_sizes` is not defined in this cell; it must already exist when this cell runs.
dataset_df['cluster_label'] = dataset_df.index.map(cluster_df['mmseqs_cluster_label'])
dataset_df['cluster_n_labels'] = dataset_df['cluster_label'].map(dataset_df.groupby('cluster_label')['label'].nunique())
print('Number of clusters:', dataset_df['cluster_label'].nunique())
print('Number of singleton clusters:', cluster_sizes.eq(1).sum())

Number of clusters: 68865
Number of singleton clusters: 39301


In [201]:
# Load the subset from HDF, fit a Clusterer configured with n_clusters=1000 on it, and attach the
# resulting cluster table to the subset DataFrame.
# NOTE(review): this rebinds `cluster_df`, which another cell assigns from mmseqs.cluster(...) and reads
# via `cluster_df.mmseqs_cluster_label` -- re-running that cell after this one may fail; confirm cell order.
dataset_subset = Dataset.from_hdf('../data/dataset_subset.h5', feature_type='esm_650m_gap')
clusterer = Clusterer(n_clusters=1000)
clusterer.fit(dataset_subset)
cluster_df = clusterer.to_df(dataset=dataset_subset)
# Drop any columns that share a name with the cluster table before the index-on-index merge,
# so that re-running the cell does not collide with columns added by a previous run.
dataset_subset_df = (
    dataset_subset_df
    .drop(columns=cluster_df.columns, errors='ignore')
    .merge(cluster_df, left_index=True, right_index=True)
)

  


In [206]:
# Sanity check: display whether every index label in dataset_df is distinct.
dataset_df.index.is_unique

True