# Play with FASTA Files and Hepfiles

In [1]:
import awkward as ak
import hepfile as hf

In [2]:
def read_fasta(filepath:str) -> list[list[str]]:
    '''
    Function to read in a fasta file and split it into its individual records
    
    Args:
        filepath [str]: path to the fasta file
        
    Returns:
        list of records; each record is a list of stripped lines whose first
        element is the '>' header line, followed by that record's sequence lines.
        Lines appearing before the first header are ignored.
    '''
    
    with open(filepath, 'r') as f:
        # strip surrounding whitespace/newlines and drop blank lines, which
        # would otherwise break the header check below
        lines = [stripped for stripped in (line.strip() for line in f) if stripped]
    
    # every record starts at a header line, i.e. one beginning with '>'
    idxs = [idx for idx, line in enumerate(lines) if line.startswith('>')]
    
    # a record runs up to the next header; the last record runs to the end of
    # the file (slicing to -1 would silently drop its final sequence line)
    stops = idxs[1:] + [len(lines)]
    return [lines[start:stop] for start, stop in zip(idxs, stops)]

def parse_sequence(seq:list[str]) -> dict:
    '''
    Parses a single fasta record and returns a dictionary of the information
    
    Args:
        seq (list[str]): one record in fasta format, given as a list of lines.
                         The first element is the '>' header line and the
                         remaining elements are the sequence lines.
    
    Returns:
        dictionary of the record with the keys
            'name': first whitespace-separated token of the header, without
                    its leading '>' character
            'meta': list of the remaining header tokens
            'data': list of the single characters of all sequence lines
                    joined together
    '''
    
    # first deal with the metadata: the header is tokenized on whitespace
    meta = seq[0].split()
    name = meta[0][1:]  # drop the leading '>' from the identifier
    descr = meta[1:]
    
    # then concatenate the rest of the data into one long string
    data = ''.join(seq[1:])
    
    # pack this all into a dictionary, one list element per residue character
    return {'name': name, 'meta': descr, 'data': list(data)}

In [14]:
# Relative path instead of a machine-specific absolute one, so the example is
# reproducible elsewhere. Assumes the notebook is run from the directory that
# contains test.fasta (docs/example_nb/ in the hepfile repository).
filepath = 'test.fasta'
split = read_fasta(filepath)

## Entering the data as singletons

In [15]:
# turn every fasta record into a {'name', 'meta', 'data'} dictionary
data = [parse_sequence(record) for record in split]

print(data)

[{'name': 'crab_anapl', 'meta': ['ALPHA', 'CRYSTALLIN', 'B', 'CHAIN', '(ALPHA(B)-CRYSTALLIN).'], 'data': ['M', 'D', 'I', 'T', 'I', 'H', 'N', 'P', 'L', 'I', 'R', 'R', 'P', 'L', 'F', 'S', 'W', 'L', 'A', 'P', 'S', 'R', 'I', 'F', 'D', 'Q', 'I', 'F', 'G', 'E', 'H', 'L', 'Q', 'E', 'S', 'E', 'L', 'L', 'P', 'A', 'S', 'P', 'S', 'L', 'S', 'P', 'F', 'L', 'M', 'R', 'S', 'P', 'I', 'F', 'R', 'M', 'P', 'S', 'W', 'L', 'E', 'T', 'G', 'L', 'S', 'E', 'M', 'R', 'L', 'E', 'K', 'D', 'K', 'F', 'S', 'V', 'N', 'L', 'D', 'V', 'K', 'H', 'F', 'S', 'P', 'E', 'E', 'L', 'K', 'V', 'K', 'V', 'L', 'G', 'D', 'M', 'V', 'E', 'I', 'H', 'G', 'K', 'H', 'E', 'E', 'R', 'Q', 'D', 'E', 'H', 'G', 'F', 'I', 'A', 'R', 'E', 'F', 'N', 'R', 'K', 'Y', 'R', 'I', 'P', 'A', 'D', 'V', 'D', 'P', 'L', 'T', 'I', 'T', 'S', 'S', 'L', 'S', 'L', 'D', 'G', 'V', 'L', 'T', 'V', 'S', 'A', 'P', 'R', 'K', 'Q', 'S', 'D', 'V', 'P', 'E', 'R', 'S', 'I', 'P', 'I', 'T', 'R', 'E', 'E', 'K', 'P', 'A', 'I', 'A', 'G', 'A', 'Q', 'R', 'K']}, {'name': 'crab_bovin',

In [16]:
# Hand the list of per-sequence dictionaries to hepfile; every key ('name',
# 'meta', 'data') is reported as a SINGLETON dataset in the output below.
# NOTE(review): with write_hepfile=False presumably nothing is written to
# 'out-fasta.h5' and only the in-memory array is returned - confirm in the
# hepfile docs.
awk = hf.dict_tools.dictlike_to_hepfile(data, 'out-fasta.h5', write_hepfile=False)

Adding dataset [1mname[0m to the dictionary as a SINGLETON.
Adding dataset [1mmeta[0m to the dictionary as a SINGLETON.
Adding dataset [1mdata[0m to the dictionary as a SINGLETON.


## Examples using the hepfile structure

In [17]:
# get the names of all sequences (the 'name' field of every record)
awk.name

In [18]:
# get all of the data flattened: the per-sequence 'data' lists are merged
# into one array (awkward is imported as `ak` in the import cell at the top)
ak.flatten(awk.data)

In [19]:
# pick out the record(s) named 'crab_anapl' with a boolean mask over the names
is_anapl = awk.name == 'crab_anapl'
anapl = awk[is_anapl]
anapl.show()

[{name: 'crab_anapl', meta: ['ALPHA', ...], data: ['M', ...]}]


In [20]:
# get just the crab_anapl sequence data (the 'data' field of the selection above)
anapl.data

In [21]:
# get just the crab_anapl metadata (the header tokens stored under 'meta')
anapl.meta

## Entering the data using groups and datasets

In [22]:
# Build a hepfile data dictionary by hand: one group per sequence plus one
# group-less (singleton) dataset per sequence for its metadata.
hepfile = hf.initialize()
# lookup table: group (sequence) name -> name of the dataset holding its metadata
meta_mapping = {}

for d in data:
    
    # the group is named after the sequence, its metadata dataset 'meta_<name>'
    group = d['name']
    meta_name = f'meta_{group}'
    
    # register the group (counter 'n_<name>'), a 'sequence' dataset inside it,
    # and the metadata dataset without a group (reported as a SINGLETON below)
    hf.create_group(hepfile, group, counter=f'n_{group}')
    hf.create_dataset(hepfile, 'sequence', group=group)
    hf.create_dataset(hepfile, meta_name) 
    
    # the bucket is created only after the datasets above are registered
    # NOTE(review): presumably so that it already has entries for them - confirm
    bucket = hf.create_single_bucket(hepfile)
    bucket[f'{group}/sequence'] = d['data']
    bucket[meta_name] = d['meta']
    
    meta_mapping[group] = meta_name 
    
    # pack the filled bucket into the data dictionary; return_value is not
    # used anywhere in the visible part of the notebook
    # NOTE(review): STRICT_CHECKING=True presumably enables extra validation
    # of the bucket - confirm in the hepfile docs
    return_value = hf.pack(hepfile,bucket,STRICT_CHECKING=True)
    
    # reset the bucket (a new one is created at the top of the next iteration)
    hf.clear_bucket(bucket)

# register a 'meta_mapping' dataset and store the lookup table under that key;
# it is assigned directly as a plain dict rather than packed through a bucket
hf.create_dataset(hepfile, 'meta_mapping')
hepfile['meta_mapping'] = meta_mapping

    
# display the resulting data dictionary
hepfile

Adding group [1mcrab_anapl[0m
Adding a counter for [1mcrab_anapl[0m as [1mn_crab_anapl[0m
Adding dataset [1msequence[0m to the dictionary under group [1mcrab_anapl[0m.
Adding dataset [1mmeta_crab_anapl[0m to the dictionary as a SINGLETON.
Adding group [1mcrab_bovin[0m
Adding a counter for [1mcrab_bovin[0m as [1mn_crab_bovin[0m
Adding dataset [1msequence[0m to the dictionary under group [1mcrab_bovin[0m.
Adding dataset [1mmeta_crab_bovin[0m to the dictionary as a SINGLETON.
Adding group [1mcrab_chick[0m
Adding a counter for [1mcrab_chick[0m as [1mn_crab_chick[0m
Adding dataset [1msequence[0m to the dictionary under group [1mcrab_chick[0m.
Adding dataset [1mmeta_crab_chick[0m to the dictionary as a SINGLETON.
Adding group [1mcrab_human[0m
Adding a counter for [1mcrab_human[0m as [1mn_crab_human[0m
Adding dataset [1msequence[0m to the dictionary under group [1mcrab_human[0m.
Adding dataset [1mmeta_crab_human[0m to the dictionary as a SINGLET

{'_GROUPS_': {'_SINGLETONS_GROUP_': ['COUNTER',
   'meta_crab_anapl',
   'meta_crab_bovin',
   'meta_crab_chick',
   'meta_crab_human',
   'meta_crab_mesau',
   'meta_crab_mouse',
   'meta_crab_rabit',
   'meta_crab_rat',
   'meta_crab_squac',
   'meta_mapping'],
  'crab_anapl': ['n_crab_anapl', 'sequence'],
  'crab_bovin': ['n_crab_bovin', 'sequence'],
  'crab_chick': ['n_crab_chick', 'sequence'],
  'crab_human': ['n_crab_human', 'sequence'],
  'crab_mesau': ['n_crab_mesau', 'sequence'],
  'crab_mouse': ['n_crab_mouse', 'sequence'],
  'crab_rabit': ['n_crab_rabit', 'sequence'],
  'crab_rat': ['n_crab_rat', 'sequence'],
  'crab_squac': ['n_crab_squac', 'sequence']},
 '_MAP_DATASETS_TO_COUNTERS_': {'_SINGLETONS_GROUP_': '_SINGLETONS_GROUP_/COUNTER',
  'crab_anapl': 'crab_anapl/n_crab_anapl',
  'crab_anapl/sequence': 'crab_anapl/n_crab_anapl',
  'meta_crab_anapl': '_SINGLETONS_GROUP_/COUNTER',
  'crab_bovin': 'crab_bovin/n_crab_bovin',
  'crab_bovin/sequence': 'crab_bovin/n_crab_bovin',
