In [None]:
def transform_entities(data):
    """Flatten per-category NER spans into one list of non-overlapping entities.

    Args:
        data: Mapping with at least these keys:
            'entities': category name -> list of {'start', 'end'} spans.
            'prob': category name -> list of [span, score] pairs, paired
                positionally with the spans under 'entities'.
            'abstract': the text that the span offsets index into.

    Returns:
        A list of dicts with keys 'entity_group', 'score', 'word', 'start'
        and 'end', ordered by ('start', 'end'). Whenever two spans overlap,
        only the higher-scoring one is kept.

    Raises:
        KeyError: if a category listed under 'entities' has no entry under
            'prob', or a span lacks 'start'/'end'.
    """
    candidates = []

    # Collect every span that has a probability, tagged with its category.
    for category, spans in data['entities'].items():
        scored_spans = data['prob'][category]
        if not scored_spans:
            continue
        for span, scored in zip(spans, scored_spans):
            candidates.append({
                # str.capitalize() also lowercases the remainder ('DNA' -> 'Dna').
                'entity_group': category.capitalize(),
                'score': scored[1],
                # 'end' is an inclusive offset, hence the +1 when slicing.
                'word': data['abstract'][span['start']:span['end'] + 1],
                'start': span['start'],
                'end': span['end'],
            })

    # Visit the most confident spans first so that, on any overlap, the
    # higher score wins; ties fall back to position for a deterministic result.
    candidates.sort(key=lambda e: (-e['score'], e['start'], e['end']))

    kept = []
    for candidate in candidates:
        # Both offsets are inclusive, so two spans overlap as soon as each
        # one starts at or before the other's end (sharing one character counts).
        overlaps = any(
            candidate['start'] <= other['end'] and other['start'] <= candidate['end']
            for other in kept
        )
        if not overlaps:
            kept.append(candidate)

    # Hand the survivors back in reading order.
    kept.sort(key=lambda e: (e['start'], e['end']))
    return kept

# Sample NER response used to exercise transform_entities: one abstract,
# the spans found per category, and a [span, score] pair for each span.
data = {
    "pmid": "6b3838a5a74c4caa39f6071125ac7149f3c15cee940ed1034bec68b6",
    "entities": {
        "disease": [{"start": 45, "end": 51}],
        "drug": [],
        "gene": [{"start": 0, "end": 3}],
        "species": [],
        "cell_line": [],
        "DNA": [
            {"start": 0, "end": 3},
            {"start": 10, "end": 23},
        ],
        "RNA": [],
        "cell_type": [],
    },
    "title": "",
    "abstract": "kras is a proto-oncogene involved in various cancers.",
    "prob": {
        "disease": [
            [{"start": 45, "end": 51}, 0.9998258948326111],
        ],
        "drug": [],
        "gene": [
            [{"start": 0, "end": 3}, 0.9947220087051392],
        ],
        "species": [],
        "cell_line": [],
        "DNA": [
            [{"start": 0, "end": 3}, 0.8248917460441589],
            [{"start": 10, "end": 23}, 0.8577249050140381],
        ],
        "RNA": [],
        "cell_type": [],
    },
    "num_entities": 4,
    "elapse_time": {
        "biomedner_elapse_time": 0.3346831798553467,
        "ner_elapse_time": 0.33543848991394043,
    },
    "error_code": 0,
    "error_message": "",
}

# Run the sample through the transformer and show the flattened entities
transformed_data = transform_entities(data)
print(transformed_data)


In [6]:
from Bio import Entrez

# Always provide a contact email address to NCBI (replace this placeholder)
Entrez.email = "your_email@example.com"

# Database and search term to enumerate
db = "gene"
search_term = "Homo sapiens[orgn]"

# Run the search once with history enabled. Only the record count and the
# history handles are needed from it, so a single ID is requested (retmax=1).
search_handle = Entrez.esearch(db=db, term=search_term, usehistory="y", retmax=1)
try:
    search_results = Entrez.read(search_handle)
finally:
    # Release the handle even if parsing the response fails
    search_handle.close()

# Total count of records found
count = int(search_results['Count'])
print(f"Total records: {count}")

# WebEnv and QueryKey identify the stored result set for the batched fetches
webenv = search_results['WebEnv']
query_key = search_results['QueryKey']

# Records requested per efetch call
# NOTE(review): confirm this against NCBI's documented per-request retmax limit
batch_size = 1000
gene_ids = []

# Retrieve all records in batches
for start in range(0, count, batch_size):
    fetch_handle = Entrez.efetch(db=db, rettype="uilist", retmode="text", retstart=start,
                                 retmax=batch_size, webenv=webenv, query_key=query_key)
    try:
        # The response is split on whitespace into individual IDs
        batch_ids = fetch_handle.read().split()
    finally:
        # Release the handle even if reading the response fails
        fetch_handle.close()
    gene_ids.extend(batch_ids)
    # The final batch may be short, so cap the reported upper index at the last record
    print(f"Fetched batch from {start} to {min(start + batch_size, count) - 1}")

# Output the total number of gene IDs collected
print(f"Collected a total of {len(gene_ids)} Gene IDs.")


Total records: 359884
Fetched batch from 0 to 999
Fetched batch from 1000 to 1999
Fetched batch from 2000 to 2999
Fetched batch from 3000 to 3999
Fetched batch from 4000 to 4999
Fetched batch from 5000 to 5999
Fetched batch from 6000 to 6999
Fetched batch from 7000 to 7999
Fetched batch from 8000 to 8999
Fetched batch from 9000 to 9999
Fetched batch from 10000 to 10999
Fetched batch from 11000 to 11999
Fetched batch from 12000 to 12999
Fetched batch from 13000 to 13999
Fetched batch from 14000 to 14999
Fetched batch from 15000 to 15999
Fetched batch from 16000 to 16999
Fetched batch from 17000 to 17999
Fetched batch from 18000 to 18999
Fetched batch from 19000 to 19999
Fetched batch from 20000 to 20999
Fetched batch from 21000 to 21999
Fetched batch from 22000 to 22999
Fetched batch from 23000 to 23999
Fetched batch from 24000 to 24999
Fetched batch from 25000 to 25999
Fetched batch from 26000 to 26999
Fetched batch from 27000 to 27999
Fetched batch from 28000 to 28999
Fetched batch fr

In [34]:
import numpy as np
# Position(s) of Gene ID '6482' within the collected ID list
np.nonzero(np.asarray(gene_ids) == '6482')

(array([6540]),)

In [1]:
from convert import pubtator2dict_list

In [4]:
# Show the 'abstract' field of the first entry returned for this PubTator file
pubtator2dict_list(
    "input/0a6edf3dc68091397084504be9bf9ff50acebcd142c1a949e4fc4654.PubTator.PubTator"
)[0]["abstract"]

'cancer is a disease.'