In [2]:
def transform_entities(data):
    """Flatten per-category NER spans into a non-overlapping entity list.

    Args:
        data: Mapping with at least these keys:
            ``'entities'``: category name -> list of ``{'start', 'end'}``
                spans, where ``end`` is the index of the last character
                (inclusive).
            ``'prob'``: category name -> list of ``[span, score]`` pairs,
                positionally aligned with the spans in ``'entities'``.
            ``'abstract'``: the text the span offsets index into.

    Returns:
        A list of dicts with keys ``entity_group`` (the capitalized
        category name), ``score``, ``word``, ``start`` and ``end``, ordered
        by position in the text. When spans overlap, only the one with the
        highest score is kept.

    Categories with no probability entry (missing or empty) are skipped.
    """
    text = data['abstract']
    candidates = []

    # Pair every span with its score, category by category.
    for category, spans in data['entities'].items():
        scored_spans = data['prob'].get(category)
        if not scored_spans:
            continue
        for span, scored in zip(spans, scored_spans):
            start, end = span['start'], span['end']
            candidates.append({
                'entity_group': category.capitalize(),
                'score': scored[1],
                # ``end`` is inclusive, hence the +1 on the slice bound.
                'word': text[start:end + 1],
                'start': start,
                'end': end,
            })

    # Visit the most confident candidates first so that, in any group of
    # overlapping spans, the highest-scoring one is the one that survives.
    # Position is only a deterministic tie-breaker here.
    candidates.sort(key=lambda e: (-e['score'], e['start'], e['end']))

    kept = []
    for candidate in candidates:
        # Both ends are inclusive, so spans that share even a single
        # character (start == other end) count as overlapping.
        overlaps = any(
            candidate['start'] <= other['end']
            and other['start'] <= candidate['end']
            for other in kept
        )
        if not overlaps:
            kept.append(candidate)

    # Hand the survivors back in reading order.
    kept.sort(key=lambda e: (e['start'], e['end']))
    return kept

# Sample NER response: span offsets per category, with inclusive 'end' indices
# into 'abstract', plus a parallel 'prob' table of [span, score] pairs.
data = {
    'pmid': '6b3838a5a74c4caa39f6071125ac7149f3c15cee940ed1034bec68b6',
    'entities': {
        'disease': [
            {'start': 45, 'end': 51},
        ],
        'drug': [],
        'gene': [
            {'start': 0, 'end': 3},
        ],
        'species': [],
        'cell_line': [],
        'DNA': [
            {'start': 0, 'end': 3},
            {'start': 10, 'end': 23},
        ],
        'RNA': [],
        'cell_type': [],
    },
    'title': '',
    'abstract': 'kras is a proto-oncogene involved in various cancers.',
    'prob': {
        'disease': [
            [{'start': 45, 'end': 51}, 0.9998258948326111],
        ],
        'drug': [],
        'gene': [
            [{'start': 0, 'end': 3}, 0.9947220087051392],
        ],
        'species': [],
        'cell_line': [],
        'DNA': [
            [{'start': 0, 'end': 3}, 0.8248917460441589],
            [{'start': 10, 'end': 23}, 0.8577249050140381],
        ],
        'RNA': [],
        'cell_type': [],
    },
    'num_entities': 4,
    'elapse_time': {
        'biomedner_elapse_time': 0.3346831798553467,
        'ner_elapse_time': 0.33543848991394043,
    },
    'error_code': 0,
    'error_message': '',
}

# Run the sample through the transformer and show the flattened entities.
transformed_data = transform_entities(data)
print(transformed_data)


[{'entity_group': 'Gene', 'score': 0.9947220087051392, 'word': 'kras', 'start': 0, 'end': 3}, {'entity_group': 'Dna', 'score': 0.8577249050140381, 'word': 'proto-oncogene', 'start': 10, 'end': 23}, {'entity_group': 'Disease', 'score': 0.9998258948326111, 'word': 'cancers', 'start': 45, 'end': 51}]
