# Play with FASTA Files and Hepfiles

In [1]:
import awkward as ak
import hepfile as hf

In [2]:
def read_fasta(filepath:str) -> list[list[str]]:
    '''
    Read a FASTA file and split it into its individual records.

    Every record starts at a header line (a line beginning with '>') and
    runs until the next header line or the end of the file. Leading and
    trailing whitespace is stripped from every line and blank lines are
    skipped. Any lines that appear before the first header are ignored.

    Args:
        filepath [str]: path to the fasta file

    Returns:
        list of records; each record is a list of strings whose first
        element is the '>' header line and whose remaining elements are
        the sequence lines of that record. An empty file gives [].
    '''

    with open(filepath, 'r') as f:
        # strip() also removes the trailing newline; empty lines are dropped
        # so that a blank line can never be mistaken for record content
        stripped_lines = (line.strip() for line in f)
        lines = [line for line in stripped_lines if line]

    split_fasta = []
    for line in lines:
        if line.startswith('>'):
            # a header opens a new record
            split_fasta.append([line])
        elif split_fasta:
            # sequence line: belongs to the most recently opened record,
            # including the very last line of the file
            split_fasta[-1].append(line)

    return split_fasta

def parse_sequence(seq:str) -> dict:
    '''
    Turn one FASTA record into a dictionary.

    Args:
        seq (str): one record in fasta format. The first element is the
            '>' header and the remaining elements are the sequence lines
            (in this notebook the records are the lists of lines returned
            by read_fasta).

    Returns:
        dictionary with the keys
            'name': first word of the header without the leading '>'
            'meta': list of the remaining header words
            'data': list with one entry per character of the sequence
    '''

    # header: the first word is the identifier, the rest is the description
    header_words = seq[0].split()

    # body: glue all sequence lines together before splitting into characters
    residues = ''.join(seq[1:])

    return {
        'name': header_words[0][1:],
        'meta': header_words[1:],
        'data': list(residues),
    }

In [3]:
# NOTE(review): absolute path on the original author's machine; point this
# at your own copy of test.fasta before running the notebook
filepath = '/home/nfranz/research/hepfile/docs/example_nb/test.fasta'
# `split` holds one entry per record returned by read_fasta
split = read_fasta(filepath)

## Entering the data as singletons

In [4]:
# parse every record into a dictionary with the keys 'name', 'meta' and 'data'
data = [parse_sequence(seq) for seq in split]

print(data)

[{'name': 'crab_anapl', 'meta': ['ALPHA', 'CRYSTALLIN', 'B', 'CHAIN', '(ALPHA(B)-CRYSTALLIN).'], 'data': ['M', 'D', 'I', 'T', 'I', 'H', 'N', 'P', 'L', 'I', 'R', 'R', 'P', 'L', 'F', 'S', 'W', 'L', 'A', 'P', 'S', 'R', 'I', 'F', 'D', 'Q', 'I', 'F', 'G', 'E', 'H', 'L', 'Q', 'E', 'S', 'E', 'L', 'L', 'P', 'A', 'S', 'P', 'S', 'L', 'S', 'P', 'F', 'L', 'M', 'R', 'S', 'P', 'I', 'F', 'R', 'M', 'P', 'S', 'W', 'L', 'E', 'T', 'G', 'L', 'S', 'E', 'M', 'R', 'L', 'E', 'K', 'D', 'K', 'F', 'S', 'V', 'N', 'L', 'D', 'V', 'K', 'H', 'F', 'S', 'P', 'E', 'E', 'L', 'K', 'V', 'K', 'V', 'L', 'G', 'D', 'M', 'V', 'E', 'I', 'H', 'G', 'K', 'H', 'E', 'E', 'R', 'Q', 'D', 'E', 'H', 'G', 'F', 'I', 'A', 'R', 'E', 'F', 'N', 'R', 'K', 'Y', 'R', 'I', 'P', 'A', 'D', 'V', 'D', 'P', 'L', 'T', 'I', 'T', 'S', 'S', 'L', 'S', 'L', 'D', 'G', 'V', 'L', 'T', 'V', 'S', 'A', 'P', 'R', 'K', 'Q', 'S', 'D', 'V', 'P', 'E', 'R', 'S', 'I', 'P', 'I', 'T', 'R', 'E', 'E', 'K', 'P', 'A', 'I', 'A', 'G', 'A', 'Q', 'R', 'K']}, {'name': 'crab_bovin',

In [5]:
# hand the list of record dictionaries to hepfile's dictionary converter
# NOTE(review): write_hepfile=False presumably means 'out-fasta.h5' is not
# written to disk and only the in-memory structure is returned — confirm
# against the hepfile documentation
awk = hf.dict_tools.dictlike_to_hepfile(data, 'out-fasta.h5', write_hepfile=False)

Adding dataset [1mname[0m to the dictionary as a SINGLETON.
Adding dataset [1mmeta[0m to the dictionary as a SINGLETON.
Adding dataset [1mdata[0m to the dictionary as a SINGLETON.




## Examples using the hepfile structure

In [6]:
# get the names of all the sequences (the 'name' field of every record)
awk.name

In [7]:
# get all of the data flattened: the per-sequence lists in the 'data' field
# are merged into a single array
ak.flatten(awk.data)

In [8]:
# build a boolean mask on the name field, then keep only the entries
# that correspond to 'crab_anapl'
is_anapl = awk.name == 'crab_anapl'
anapl = awk[is_anapl]
anapl.show()

[{name: 'crab_anapl', meta: ['ALPHA', ...], data: ['M', ...]}]


In [9]:
# get just the crab_anapl data (the 'data' field built by parse_sequence:
# one entry per character of the sequence)
anapl.data

In [10]:
# get just the crab_anapl metadata (the 'meta' field built by parse_sequence:
# the header words that follow the name)
anapl.meta

## Entering the data using groups and datasets

In [11]:
hepfile = hf.initialize()

# parse every record into a dictionary with the keys 'name', 'meta' and 'data'
for_hepfile = [parse_sequence(seq) for seq in split]

# first pass: declare the layout, one group per sequence
for record in for_hepfile:
    group = record['name']

    # the group is named after the sequence, gets a counter called n_<name>
    # and carries the header description as its metadata
    hf.create_group(hepfile, group, counter=f'n_{group}')
    hf.add_group_meta(hepfile, group, record['meta'])

    # every group holds a single string dataset called 'sequence'
    hf.create_dataset(hepfile, 'sequence', group=group, dtype=str)

# second pass: put every sequence into one bucket
bucket = hf.create_single_bucket(hepfile)
for record in for_hepfile:
    group = record['name']
    bucket[f'{group}/sequence'] = record['data']

# pack the filled bucket into the hepfile structure
return_value = hf.pack(hepfile, bucket, STRICT_CHECKING=True, verbose=False)

Adding group [1mcrab_anapl[0m
Adding a counter for [1mcrab_anapl[0m as [1mn_crab_anapl[0m
Adding dataset [1msequence[0m to the dictionary under group [1mcrab_anapl[0m.
Adding group [1mcrab_bovin[0m
Adding a counter for [1mcrab_bovin[0m as [1mn_crab_bovin[0m
Adding dataset [1msequence[0m to the dictionary under group [1mcrab_bovin[0m.
Adding group [1mcrab_chick[0m
Adding a counter for [1mcrab_chick[0m as [1mn_crab_chick[0m
Adding dataset [1msequence[0m to the dictionary under group [1mcrab_chick[0m.
Adding group [1mcrab_human[0m
Adding a counter for [1mcrab_human[0m as [1mn_crab_human[0m
Adding dataset [1msequence[0m to the dictionary under group [1mcrab_human[0m.
Adding group [1mcrab_mesau[0m
Adding a counter for [1mcrab_mesau[0m as [1mn_crab_mesau[0m
Adding dataset [1msequence[0m to the dictionary under group [1mcrab_mesau[0m.
Adding group [1mcrab_mouse[0m
Adding a counter for [1mcrab_mouse[0m as [1mn_crab_mouse[0m
Adding dataset

In [12]:
# write the packed structure to an HDF5 file
# NOTE(review): this rebinds `filepath`, which earlier in the notebook held
# the path of the FASTA input file
filepath = 'test-fasta-out.h5'
hf.write_to_file(filepath, hepfile)

_SINGLETONS_GROUP_/COUNTER       has 1            entries
crab_anapl/n_crab_anapl          has 1            entries
crab_bovin/n_crab_bovin          has 1            entries
crab_chick/n_crab_chick          has 1            entries
crab_human/n_crab_human          has 1            entries
crab_mesau/n_crab_mesau          has 1            entries
crab_mouse/n_crab_mouse          has 1            entries
crab_rabit/n_crab_rabit          has 1            entries
crab_rat/n_crab_rat              has 1            entries
crab_squac/n_crab_squac          has 1            entries
Metadata added


<Closed HDF5 file>

In [13]:
# show every group together with the names of the datasets it contains
hepfile['_GROUPS_']

{'_SINGLETONS_GROUP_': ['COUNTER'],
 'crab_anapl': ['n_crab_anapl', 'sequence'],
 'crab_bovin': ['n_crab_bovin', 'sequence'],
 'crab_chick': ['n_crab_chick', 'sequence'],
 'crab_human': ['n_crab_human', 'sequence'],
 'crab_mesau': ['n_crab_mesau', 'sequence'],
 'crab_mouse': ['n_crab_mouse', 'sequence'],
 'crab_rabit': ['n_crab_rabit', 'sequence'],
 'crab_rat': ['n_crab_rat', 'sequence'],
 'crab_squac': ['n_crab_squac', 'sequence']}

In [14]:
# read the file written above back in
# NOTE(review): this rebinds `data` and `bucket`, which earlier cells used for
# the parsed records and for the bucket that was packed
data, meta, bucket = hf.load(filepath)

Building the indices...

Built the indices!
Data is read in and input file is closed.


In [15]:
# inspect the data dictionary returned by hf.load
data

{'_MAP_DATASETS_TO_COUNTERS_': {'_SINGLETONS_GROUP_': '_SINGLETONS_GROUP_/COUNTER',
  'crab_anapl': 'crab_anapl/n_crab_anapl',
  'crab_anapl/sequence': 'crab_anapl/n_crab_anapl',
  'crab_bovin': 'crab_bovin/n_crab_bovin',
  'crab_bovin/sequence': 'crab_bovin/n_crab_bovin',
  'crab_chick': 'crab_chick/n_crab_chick',
  'crab_chick/sequence': 'crab_chick/n_crab_chick',
  'crab_human': 'crab_human/n_crab_human',
  'crab_human/sequence': 'crab_human/n_crab_human',
  'crab_mesau': 'crab_mesau/n_crab_mesau',
  'crab_mesau/sequence': 'crab_mesau/n_crab_mesau',
  'crab_mouse': 'crab_mouse/n_crab_mouse',
  'crab_mouse/sequence': 'crab_mouse/n_crab_mouse',
  'crab_rabit': 'crab_rabit/n_crab_rabit',
  'crab_rabit/sequence': 'crab_rabit/n_crab_rabit',
  'crab_rat': 'crab_rat/n_crab_rat',
  'crab_rat/sequence': 'crab_rat/n_crab_rat',
  'crab_squac': 'crab_squac/n_crab_squac',
  'crab_squac/sequence': 'crab_squac/n_crab_squac'},
 '_MAP_DATASETS_TO_INDEX_': {'_SINGLETONS_GROUP_': '_SINGLETONS_GROUP_/C

In [16]:
# inspect the file-level metadata returned by hf.load
meta

{'_NUMBER_OF_BUCKETS_': 1,
 'date': '2023-06-16 10:39:39.433175',
 'h5py_version': '3.7.0',
 'hepfile_version': '0.1.3',
 'numpy_version': '1.21.5',
 'python_version': '3.9.16 (main, Mar  1 2023, 18:22:10) \n[GCC 11.2.0]'}