# Play with FASTA Files and Hepfiles

In [None]:
import hepfile as hf

In [None]:
def read_fasta(filepath:str) -> list[list[str]]:
    '''
    Read a FASTA file and split it into one list of lines per record.

    A record starts at a header line beginning with '>' and runs up to (but
    not including) the next header line, or to the end of the file for the
    last record. Surrounding whitespace is stripped from every line, blank
    lines are skipped, and any lines found before the first header are
    ignored.

    Args:
        filepath (str): path to the fasta file

    Returns:
        list with one entry per record; each entry is a list of strings whose
        first element is the header line and whose remaining elements are the
        sequence lines of that record
    '''

    records = []
    current = None  # lines of the record currently being collected

    with open(filepath, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()

            # blank lines carry no data and must not be indexed into
            if not line:
                continue

            if line.startswith('>'):
                # a header opens a new record
                current = [line]
                records.append(current)
            elif current is not None:
                # sequence line belonging to the most recent header;
                # the final record keeps its last line as well
                current.append(line)

    return records

def parse_sequence(seq:'list[str] | str') -> dict:
    '''
    Parse one FASTA record into a dictionary.

    The first line is the header: the text between the leading '>' and the
    first whitespace is the record name, and the remaining whitespace-separated
    words are kept as metadata. All following lines are concatenated into one
    sequence and split into single characters.

    Args:
        seq (list[str] or str): one record in fasta format, either as a list
            of lines (header first, as produced by read_fasta) or as a single
            string containing those lines

    Returns:
        dictionary with the keys
            'name': record name (str)
            'meta': remaining words of the header line (list of str)
            'data': the sequence, one character per element (list of str)

    Raises:
        IndexError: if the record has no lines or its header line is empty
    '''

    # a raw string is split into its non-empty lines so that both input
    # forms are handled by the same code below
    if isinstance(seq, str):
        lines = [line.strip() for line in seq.splitlines() if line.strip()]
    else:
        lines = seq

    # first deal with the metadata
    header_words = lines[0].split()
    name = header_words[0][1:]  # drop the leading '>'
    descr = header_words[1:]

    # then concatenate the rest of the data
    sequence = ''.join(lines[1:])

    # pack this all into a dictionary
    return {'name': name, 'meta': descr, 'data': list(sequence)}

In [None]:
# Use a path relative to the notebook instead of a user-specific absolute
# path so the example runs on other machines.
# NOTE(review): assumes test.fasta sits next to this notebook and that the
# notebook is run from its own directory - confirm.
filepath = 'test.fasta'
split = read_fasta(filepath)

## Entering the data as singletons

In [None]:
# parse every raw record into a dictionary with 'name', 'meta' and 'data'
data = [parse_sequence(record) for record in split]

print(data)

In [None]:
# convert the list of record dictionaries with hepfile's dict tools;
# write_hepfile=False presumably means 'out-fasta.h5' is not written to
# disk - TODO confirm against the hepfile documentation
awk = hf.dict_tools.dictlike_to_hepfile(
    data,
    'out-fasta.h5',
    write_hepfile=False,
)

## Examples using the hepfile structure

In [None]:
# get the 'name' field of every record, i.e. all of the sequence names
awk.name

In [None]:
# get all of the data flattened: ak.flatten removes one level of nesting,
# so the per-record sequences are merged into one array
import awkward as ak
ak.flatten(awk.data)

In [None]:
# get information corresponding to 'crab_anapl': build a boolean mask on the
# name field, keep only the matching record(s) and display them
anapl = awk[awk.name == 'crab_anapl']
anapl.show()

In [None]:
# get just the crab_anapl data (the sequence, one character per element)
anapl.data

In [None]:
# get just the crab_anapl metadata (the words following the name on the header line)
anapl.meta

## Entering the data using groups and datasets

In [None]:
# start from a freshly initialized hepfile data structure
hepfile = hf.initialize()

# parse every FASTA record up front
for_hepfile = [parse_sequence(seq) for seq in split]

# first pass: declare one group per record, attach that record's metadata
# to the group and create a 'sequence' dataset underneath it
for record in for_hepfile:
    group_name = record['name']
    hf.create_group(hepfile, group_name, counter=f'n_{group_name}')
    hf.add_group_meta(hepfile, group_name, record['meta'])
    hf.create_dataset(hepfile, 'sequence', group=group_name, dtype=str)

# second pass: the bucket is created only after every group and dataset has
# been declared, then each record's sequence is stored under its own group
bucket = hf.create_single_bucket(hepfile)
for record in for_hepfile:
    bucket[f"{record['name']}/sequence"] = record['data']

# pack the filled bucket into the hepfile structure
return_value = hf.pack(hepfile, bucket, STRICT_CHECKING=True, verbose=False)

In [None]:
# write the packed hepfile structure to disk; the relative path places the
# output in the current working directory (this rebinds `filepath`)
filepath = 'test-fasta-out.h5'
hf.write_to_file(filepath, hepfile)

In [None]:
# inspect the '_GROUPS_' entry of the hepfile structure
hepfile['_GROUPS_']

In [None]:
# read the file back in; note that this rebinds `data` and `bucket`
data, meta, bucket = hf.load(filepath)

In [None]:
# display the first value returned by hf.load
data

In [None]:
# display the second value returned by hf.load
meta