In [13]:
import pandas as pd 
import numpy as np 
import os
import itertools
from aerobot.utils import DATA_DIR
import glob
# Show the data directory exposed by aerobot.utils and list what it currently contains.
print(DATA_DIR)
print(os.listdir(DATA_DIR))

# Single-letter residue codes accepted as valid amino acids (note that 'U' is included).
AMINO_ACIDS = list('ARNDCEQGHILKMFPSTWYVU')
# Single-letter nucleotide codes.
NUCLEOTIDES = list('ACTG')

# Names of the k-mer feature tables: nucleotide 1- to 5-mers, then amino-acid 1- to 3-mers.
FEATURE_TYPES = [f'nt_{k}mer' for k in range(1, 6)]
FEATURE_TYPES += [f'aa_{k}mer' for k in range(1, 4)]

/home/prichter/miniconda3/envs/aerobot-tool/lib/python3.12/site-packages/aerobot/data
['__pycache__', '__init__.py']


In [12]:
# Remove every regular file directly inside DATA_DIR except the package's __init__.py.
# Sub-directories (e.g. __pycache__) are left alone because they fail the isfile check.
for entry_name in os.listdir(DATA_DIR):
    entry_path = os.path.join(DATA_DIR, entry_name)
    if entry_name == '__init__.py' or not os.path.isfile(entry_path):
        continue
    os.remove(entry_path)

In [14]:
# List the tables stored in the packaged training dataset.
# - The path is built from DATA_DIR instead of repeating a machine-specific absolute path.
# - mode='r' makes a missing file raise; the default append mode would silently create an
#   empty store in the package directory.
# - The context manager closes the file handle even if listing the keys fails.
with pd.HDFStore(os.path.join(DATA_DIR, 'training_datasets.h5'), mode='r') as store:
    print(list(store.keys()))

['/aa_1mer', '/aa_2mer', '/aa_3mer', '/labels', '/nt_1mer', '/nt_2mer', '/nt_3mer', '/nt_4mer', '/nt_5mer']


In [None]:
# Subset the datasets used in the original paper to only include the features used in the tool.
# For each split, the output store receives one table per entry in FEATURE_TYPES plus a
# 'labels' table with a 'binary' and a 'ternary' column, all indexed by 'genome_id'.

# NOTE: machine-specific location of the paper's HDF5 files; adjust when running elsewhere.
paper_data_dir = '/home/prichter/Documents/aerobot-paper/data'
subset_data_dir = './data'
# Opening a store with mode='w' fails if its parent directory does not exist.
os.makedirs(subset_data_dir, exist_ok=True)

for file_name in ['training_datasets.h5', 'testing_datasets.h5', 'validation_datasets.h5']:
    # The source store is only read, so open it read-only. The context managers guarantee both
    # files are closed even if a table is missing or a write fails part-way through.
    with pd.HDFStore(os.path.join(paper_data_dir, file_name), mode='r') as store:
        with pd.HDFStore(os.path.join(subset_data_dir, file_name), mode='w') as subset_store:
            for feature_type in FEATURE_TYPES:
                # rename_axis returns a new frame whose index is named 'genome_id'.
                feature_data = store.get(f'/{feature_type}').rename_axis('genome_id')
                subset_store.put(feature_type, feature_data)

            metadata_df = store.get('/metadata')
            ternary_labels = metadata_df.physiology.str.lower()
            # Collapse the three physiology classes into two.
            binary_labels = ternary_labels.replace({'aerobe':'tolerant', 'facultative':'tolerant', 'anaerobe':'intolerant'}).values
            labels_df = pd.DataFrame({'binary':binary_labels, 'ternary':ternary_labels, 'genome_id':metadata_df.index}).set_index('genome_id')
            subset_store.put('labels', labels_df)

In [3]:
# Count how many amino-acid 3-mer feature columns consist solely of characters in AMINO_ACIDS,
# and report which residues occur in those valid 3-mers.

# Only the table is needed afterwards, so the store is closed as soon as it has been read
# (and is closed even if the read raises).
with pd.HDFStore(os.path.join('./data', 'testing_datasets.h5'), mode='r') as subset_store:
    aa_3mers_df = subset_store.get('aa_3mer')

# Computed once and reused for both the count and the residue inventory below.
valid_aa_3mers = [c for c in aa_3mers_df.columns if np.all(np.isin(list(c), AMINO_ACIDS))]
print('Number of 3-mers:', len(aa_3mers_df.columns))
print('Number of valid 3-mers:', len(valid_aa_3mers))

# Flatten the valid 3-mers into their individual characters (duplicates kept).
amino_acids = [residue for aa_3mer in valid_aa_3mers for residue in aa_3mer]
print(f'Amino acids present (n={len(set(amino_acids))}):', set(amino_acids))

Number of 3-mers: 9868
Number of valid 3-mers: 8134
Amino acids present (n=21): {'Y', 'I', 'T', 'C', 'P', 'H', 'S', 'F', 'R', 'A', 'V', 'Q', 'D', 'G', 'L', 'K', 'M', 'W', 'U', 'N', 'E'}


In [9]:

# Test to make sure the subsetting worked: print the number of rows in the 'aa_1mer' table
# of each subset store.
for file_name in ['training_datasets.h5', 'testing_datasets.h5', 'validation_datasets.h5']:
    # The context manager closes the store even if the 'aa_1mer' table is missing.
    with pd.HDFStore(os.path.join('./data', file_name), mode='r') as subset_store:
        print(file_name, len(subset_store.get('aa_1mer')))

training_datasets.h5 2084
testing_datasets.h5 587
validation_datasets.h5 465


In [5]:
# Number of ordered 3-character combinations (with repetition) that can be drawn from AMINO_ACIDS.
sum(1 for triple in itertools.product(AMINO_ACIDS, repeat=3))

9261

In [6]:
# count = 0
# for i in itertools.product(set(amino_acids), repeat=3):
#     print(''.join(i))
#     count += 1
#     if count > 20:
#         break