# Biosample parser

This notebook contains the code for parsing and extracting metadata from an XML file containing biosample information. The parsed file can be the result of a search on NCBI's `Biosample` database. If the file contains data for multiple organisms, the species of interest has to be specified (variable `species`). Cells that can be edited begin with the comment `# Edit this cell` or `# Can be edited`.

The folder `data` contains an example of a file that was obtained by searching the database using `Klebsiella pneumoniae` as the search keywords. Running the notebook should result in a file named `Klebsiella_metadata.csv` in the same folder.

In [None]:
import xml.etree.ElementTree as ET # for parsing xml files
import csv

In [None]:
# Edit this cell
all_biosamples = 'data/biosample_result.xml'  # Path of the XML search file to parse
species = 'Klebsiella pneumoniae'  # Organism whose records are kept


In [None]:
# Both output files are named after the first word of `species`
# (the genus, for a binomial name such as 'Klebsiella pneumoniae').
genus = species.split(" ")[0]
species_biosamples = f'data/{genus}_biosamples.xml'  # filtered copy of the XML
metadata = f'data/{genus}_metadata.csv'  # extracted metadata table

In [None]:
# Load the whole search result. `myroot` is the root element; the filtering
# cell removes the unwanted records from it.
tree = ET.parse(all_biosamples)
myroot = tree.getroot()

In [None]:
# Keep only the records whose organism name contains `species`, then save the
# filtered tree to `species_biosamples`.
count_all_samples = 0  # number of records in the unfiltered file

# findall() returns a list, so removing records from `myroot` inside the loop
# does not disturb the iteration.
for record in tree.findall('BioSample'):
    count_all_samples += 1
    # Reset for every record: a record without an <Organism> element (or
    # without a taxonomy_name) must not inherit the name of the previous
    # record. Such records are treated as not matching and are removed.
    name = ''
    for elt in record.iter('Organism'):
        name = elt.get('taxonomy_name') or ''
    if species not in name:
        myroot.remove(record)

tree.write(species_biosamples)

In [None]:
# Reload the filtered file and collect the accession of every record in it.
new_tree = ET.parse(species_biosamples)
new_root = new_tree.getroot()

# Accession numbers of records for the species.
# dict.fromkeys() drops duplicates while keeping first-seen order, which avoids
# scanning the whole list once per record.
IDS = list(dict.fromkeys(
    record.get('accession') for record in new_tree.findall('BioSample')
))

print(f'Total biosample records: {count_all_samples}')
print(f'Biosample records for {species}: {len(IDS)}')

In [None]:
# Make a list of all attribute names, one entry per occurrence in any record
# (duplicates are kept on purpose: the list is used for counting)
atribute_list = []
for record in new_tree.findall('BioSample'):
    atribute_list.extend(
        attribute.attrib['attribute_name']
        for attribute in record.iter('Attribute')
    )

# Make a list of unique attribute names, in first-seen order.
# dict.fromkeys() avoids re-scanning the unique list for every occurrence.
uniq_atributes = list(dict.fromkeys(atribute_list))

# Print the unique names alphabetically. Use this output to find the names
# that describe geographic location, collection date, host, etc.
for att in sorted(uniq_atributes):
    print(att)


In [None]:
# Can be edited

# Attribute groups chosen from the printed attribute names.
# Each key becomes one column of the metadata table; its list holds every
# attribute name that is accepted as a source for that column.
select_attributes = {
    'location': [
        'country',
        'geographic location (country and/or sea)',
        'geographic location (country and/or sea, region)',
        'geographic location (country and/or sea,region)',
        'geographic location (region and locality)',
        'geo_loc_name',
        'Isolation_country',
    ],
    'collection_date': [
        'collection date',
        'collection_date',
        'collection-date',
        'time of collection',
    ],
    'hosts': [
        'host',
        'host scientific name',
        'specific host',
        'specific_host',
    ],
    'sources': [
        'source',
        'isolation site',
        'isolation source',
        'isolation-source',
        'isolation_source',
    ],
    'source_type': [
        'source type',
        'source_type',
    ],
    'serotype': [
        'serotype',
        'serovar',
    ],
    'disease': [
        'host disease',
        'host health state',
        'host-disease',
        'host_disease',
        'host_health_state',
    ],
    'package': [
        'package',
        'atribute_package',
        'attribute package',
        'attribute_package',
    ],
    'collector': [
        'collected by',
        'collected-by',
        'collected_by',
    ],
}

In [None]:
# Preview the column titles derived from the attribute groups:
# first letter upper-cased, underscores shown as spaces.
for header in select_attributes:
    title = header.capitalize()
    print(title.replace('_', ' '))

In [None]:
# Get the numbers of records that carry at least one attribute from each of
# the groups below.
# Counting is done per record, not per attribute occurrence: a record that has
# e.g. both 'host' and 'specific host' is counted once, so no count can
# exceed the number of records.
counted_groups = ('location', 'collection_date', 'hosts', 'sources')
attributes = dict.fromkeys(counted_groups, 0)

for record in new_tree.findall('BioSample'):
    # All attribute names present in this record
    record_names = {
        attribute.attrib.get('attribute_name')
        for attribute in record.iter('Attribute')
    }
    for group in counted_groups:
        if not record_names.isdisjoint(select_attributes[group]):
            attributes[group] += 1

# The individual counters are kept as separate names as well
n_loc = attributes['location']
n_date = attributes['collection_date']
n_host = attributes['hosts']
n_source = attributes['sources']

print('Total number of files: ', len(IDS))

print('With location: ', attributes['location'])
print('With collection date: ', attributes['collection_date'])
print('With host info: ', attributes['hosts'])
print('With source info: ', attributes['sources'])


In [None]:
# Write the metadata table to `metadata`: one row per record, holding the
# accession followed by one column per key of `select_attributes`.

# newline='' is what the csv module asks for on files it writes to (without it
# extra blank lines can appear between rows on Windows). The explicit encoding
# keeps non-ASCII values writable whatever the platform default is.
with open(metadata, 'w', newline='', encoding='utf-8') as meta:
    # NOTE: the excel_tab dialect writes tab-separated values
    fwriter = csv.writer(meta, dialect=csv.excel_tab)
    headers = ['Biosample'] + [
        header.capitalize().replace('_', ' ') for header in select_attributes
    ]
    fwriter.writerow(headers)

    for record in new_tree.findall('BioSample'):
        record_attributes = {}  # This will hold the attributes to be added to the metadata

        accession = record.get('accession')

        for attribute in record.iter('Attribute'):
            attribute_name = attribute.attrib['attribute_name']
            # An element without content has text None; treat it as an empty
            # value so the string operations below cannot fail.
            text = attribute.text or ''
            for value in select_attributes:
                if attribute_name not in select_attributes[value]:
                    continue
                # If several attributes of a record fall in the same group,
                # the last one read wins.
                if value == 'location':
                    # The location value has a ':' separator - we want just the country
                    record_attributes[value] = text.split(':')[0].strip()
                elif value == 'collection_date':
                    # We just want the year - first four characters
                    # (assumes the date starts with the year - TODO confirm)
                    record_attributes[value] = text[:4]
                else:
                    record_attributes[value] = text

        # Here you can make a sublist from the select_attributes if you dont want them all.
        final_values = [accession] + [
            record_attributes.get(element, 'Not available')
            for element in select_attributes
        ]

        fwriter.writerow(final_values)