In [41]:
from Bio import SeqIO
import pathlib
from collections import defaultdict
import pandas

In [42]:
b31_ref = pathlib.Path("/home/mf019/longread_pangenome/expanded_dataset_analysis/ref/B31/GCF_000008685.2_ASM868v2_genomic.gff")

In [43]:
id2name = defaultdict()

with open(pathlib.Path("/home/mf019/longread_pangenome/ref/ID_to_name.csv"), 'r') as infile:
    lines = infile.readlines()
    for line in lines[1::]:
        fields = line.strip().split(',')
        for field in fields:
            id2name[fields[0].strip("'")] = fields[1].strip("'")

In [44]:
with open(b31_ref, 'r') as infile:
    lines = infile.readlines()

In [48]:
gff3_headers = [
    "seqid",      # name of the sequence/chromosome
    "source",     # program or database that generated the feature
    "type",       # type of feature (gene, CDS, exon, etc.)
    "start",      # starting position (1-based)
    "end",        # ending position (inclusive)
    "score",      # floating point confidence value
    "strand",     # DNA strand (+, -, or .)
    "phase",      # for CDS features, where next codon begins (0,1,2 or .)
    "attributes"  # semicolon-separated tag-value pairs
]

values = defaultdict(list)
attribute_keys = set()  # To collect all possible attribute keys

# First pass: collect all possible attribute keys
for line in lines:
    if line.startswith('#'):
        continue
    fields = line.strip().split('\t')
    if len(fields) >= 9:
        attr_pairs = fields[8].strip().split(';')
        for pair in attr_pairs:
            if '=' in pair:
                key, _ = pair.split('=', 1)
                attribute_keys.add(key)

# Initialize values dict with attribute keys
for key in attribute_keys:
    values[key] = []

# Second pass: populate all fields including expanded attributes
for line in lines:
    if line.startswith('#'):
        continue
    fields = line.strip().split('\t')
    
    # Process the first 8 fields
    for idx, field in enumerate(gff3_headers[:-1]):
        values[field].append(fields[idx])
    
    # Parse and store the full attributes field
    values['attributes'].append(fields[8] if len(fields) >= 9 else '')
    
    # Parse individual attributes
    attr_dict = {}
    if len(fields) >= 9:
        attr_pairs = fields[8].strip().split(';')
        for pair in attr_pairs:
            if '=' in pair:
                key, value = pair.split('=', 1)
                attr_dict[key] = value
    
    # Add values for each attribute key (using empty string if not present)
    for key in attribute_keys:
        values[key].append(attr_dict.get(key, ''))

# Now values contains both the original GFF3 fields and separate columns for each attribute

# Example usage:
print(f"Available fields:  {list(values.keys())}")
print(f"Number of records: {len(values['seqid'])}")
print(f"Example ID values: {values['ID'][:5]}")

Available fields:  ['transl_table', 'plasmid-name', 'old_locus_tag', 'go_function', 'protein_id', 'genome', 'strain', 'anticodon', 'type-material', 'mol_type', 'Note', 'Ontology_term', 'go_process', 'gene', 'Dbxref', 'exception', 'gene_biotype', 'ID', 'inference', 'product', 'start_range', 'locus_tag', 'Is_circular', 'partial', 'pseudo', 'Name', 'end_range', 'Parent', 'go_component', 'gbkey', 'seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
Number of records: 3171
Example ID values: ['NC_001318.1:1..910724', 'gene-BB_RS00005', 'cds-WP_002658391.1', 'gene-BB_RS00010', 'cds-WP_002658389.1']


In [49]:
df = pandas.DataFrame(values)

In [50]:
pf32_df = df[
    (df['type'] == 'CDS') & 
    (df['product'].str.contains('ParA', na=False))
]

In [51]:
pf32_df.columns

Index(['transl_table', 'plasmid-name', 'old_locus_tag', 'go_function',
       'protein_id', 'genome', 'strain', 'anticodon', 'type-material',
       'mol_type', 'Note', 'Ontology_term', 'go_process', 'gene', 'Dbxref',
       'exception', 'gene_biotype', 'ID', 'inference', 'product',
       'start_range', 'locus_tag', 'Is_circular', 'partial', 'pseudo', 'Name',
       'end_range', 'Parent', 'go_component', 'gbkey', 'seqid', 'source',
       'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'],
      dtype='object')

In [52]:
pf32_df = pf32_df[['seqid', 'locus_tag', 'start', 'end', 'strand', 'product', 'ID']]

In [53]:
pf32_df['plasmid_name'] = pf32_df['seqid'].apply(lambda x: id2name[x])

In [54]:
pf32_df = pf32_df[['seqid', 'plasmid_name', 'locus_tag', 'start', 'end', 'strand', 'product', 'ID']]

In [55]:
pf32_df

Unnamed: 0,seqid,plasmid_name,locus_tag,start,end,strand,product,ID
542,NC_001318.1,chromosome,BB_RS01335,281800,282687,-,MinD/ParA family protein,cds-WP_002556868.1
714,NC_001318.1,chromosome,BB_RS01775,368885,370027,+,MinD/ParA family protein,cds-WP_002657813.1
873,NC_001318.1,chromosome,BB_RS02160,449419,450171,+,ParA family protein,cds-WP_002656688.1
1499,NC_001318.1,chromosome,BB_RS03680,764371,765342,+,MinD/ParA family protein,cds-WP_002557312.1
1816,NC_001849.2,lp17,BB_RS05680,13339,14079,+,ParA family protein,cds-WP_010257677.1
1829,NC_000955.2,lp21,BB_RS07340,2868,3656,+,ParA family protein,cds-WP_010883900.1
1860,NC_001850.1,lp25,BB_RS04450,12096,12854,-,ParA family protein,cds-WP_010258956.1
1905,NC_001903.1,cp26,BB_RS05535,9275,10036,+,ParA family protein,cds-WP_010890586.1
1960,NC_001851.2,lp28-1,BB_RS05745,6632,7381,-,ParA family protein,cds-WP_014540418.1
1974,NC_001851.2,lp28-1,BB_RS05780,13029,13793,-,ParA family protein,cds-WP_010890276.1


In [39]:
pf32_df.to_csv('B31_pf32_coordinates.csv', index=False)