In [3]:
import os
import sys
import pandas
import glob
from Bio import SeqIO
from Bio import SeqRecord
from Bio import SeqFeature

In [4]:
# Paths for the assemblies and their annotations.
# NOTE(review): this is an absolute, machine-specific path; adjust it (or make it
# configurable) before running the notebook anywhere else.
assemblies_path = '/Users/mf019/bioinformatics/longread_GWAS/assemblies'
annotations_path = f'{assemblies_path}/paired_assemblies/annotation'
# Collect every GenBank annotation file we have. glob returns matches in an
# arbitrary, filesystem-dependent order, so sort them to make the processing
# order (and anything keyed on it downstream) reproducible between runs.
list_of_genbank_paths = sorted(glob.glob(f'{annotations_path}/genbank/*.gbff'))

### This is the dictionary structure we want to end up with.
```
dict_format: dict = {
    isolate : {
        'shortread' : { # UNY123
            assembly_path : 'path/to/assembly',
          annotation_path : 'path/to/annotation',
                  seq_obj : 'seq_dict' # see below
            },
         'longread' : { # UNY123H/P
            assembly_path : 'path/to/assembly',
          annotation_path : 'path/to/annotation',
                  seq_obj : 'seq_dict' # see below
            }
        }
}

seq_dict: dict = {
    'metadata' : {
            'file' : 'path/to/file', # put the path to the file here
        'filedate' : 'date', # date of the file creation
       'parsedate' : 'date', # date of the file parsing (or the date of the creation of this object)
      'filesource' : 'source', # where in the world did this file come from?
          'origin' : 'origin', # where in the world did this organism come from?
        'organism' : 'organism', # species name
          'strain' : 'strain', # strain name
          'length' : 'length', # how long is everything all together?
     'num_records' : 'number of records', # count our contigs or replicons or chromosomes or plasmids or whatever a record is to you.
    'num_features' : 'number of features', # count our features (genes and whatnot)
      'additional' : [], # here is where we can put whatever else is needed.
    },
    'records' : {
        'record_id' : { # this is contigs, replicons, chromosomes, plasmids, etc.
              'seq' : 'actual_sequence', # the nucleotide sequence of the forward strand; this is a Seq() object, which is basically a string with some extra methods :)
         'features' : { # this is a dictionary of features parsed with biopython
                  'name' : 'feature_name', # name of the feature
              'location' : 'feature_location', # coordinates of the feature (THIS SHOULD BE DIRECTLY USABLE BY BIOPYTHON METHODS SUCH AS .extract(), .reverse_complement(), .complement(), .translate() or .seq!)
                  'type' : 'feature_type', # what type of genetic feature is this (CDS, gene, rRNA, tRNA, etc)
            'qualifiers' : 'feature_qualifiers' # any additional information about the feature from the genbank file? (Perhaps I could put my own notes here from lipopredict or something like that?)
         }
        },
    },
}
```

# OK, let's pull out our VLS now.

In [5]:
# Let's go through our annotation files: create one (initially empty) entry in
# `all_assemblies` per GenBank file, keyed by the file name without its
# extension, then walk the records of each file.
# NOTE(review): this cell appears to continue beyond the lines visible here;
# `features` is assigned below but not yet used in the visible part.
all_assemblies = {}
for file in list_of_genbank_paths:
    basename = os.path.basename(file) # drop the directory part of the path
    basename = os.path.splitext(basename)[0] # and strip the (last) extension
    all_assemblies[basename] = {}
    records = SeqIO.parse(file, "genbank") # parse the multirecord genbank file!
    for record in records:
        # keep only the features of this record whose type is 'CDS'
        features = [f for f in record.features if f.type == 'CDS']