## **Recolha e Pré-processamento**

In [3]:
from Bio import Entrez
import pandas as pd

Entrez.email = "conhecimentolinguagem@gmail.com"
term = '("disease"[MeSH Terms]) AND ("symptom"[Title/Abstract] OR "treatment"[Title/Abstract]) AND ("2020"[Date - Publication] : "2025"[Date - Publication])'


handle = Entrez.esearch(db="pubmed", term=term, retmax=100)
record = Entrez.read(handle)
ids = record["IdList"]

articles = []
for pmid in ids:
    fetch = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
    text = fetch.read()
    articles.append({"pmid": pmid, "text": text})

df = pd.DataFrame(articles)

# Save the articles to a CSV file
df.to_csv("articles.csv", index=False)

## **Extração de Entidades**

- Criar um ambiente virtual novo
- pip install scapy==3.7.4
- pip install scispacy==0.5.1
- Download de "en_ner_bc5cdr_md" em https://allenai.github.io/scispacy/
- pip install "location"


   ### **Spacy e Scispacy**  

In [4]:
import pandas as pd
import spacy

df = pd.read_csv("articles.csv")

nlp = spacy.load("en_ner_bc5cdr_md") # carrega o modelo do scispaCy

doc = nlp(df.iloc[0]['text'])
for ent in doc.ents:
    print(ent.text, ent.label_)

Acute vestibular syndrome DISEASE
Agger-Nielsen CHEMICAL
Gødstrup CHEMICAL
Acute vestibular syndrome DISEASE
AVS DISEASE
stroke DISEASE
neuritis DISEASE
nystagmus DISEASE
strokes DISEASE


#### **Transformers**

In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd

# Carrega o modelo NER específico
tokenizer = AutoTokenizer.from_pretrained("kamalkraj/bio-med-ner")
model = AutoModelForTokenClassification.from_pretrained("kamalkraj/bio-med-ner")

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Lê CSV com artigos
df = pd.read_csv("articles.csv")

entities_list = []

for index, row in df.iterrows():
    text = row['text']
    ner_results = ner_pipeline(text)
    for ent in ner_results:
        entities_list.append({
            "pmid": row['pmid'],
            "entity": ent['word'],
            "label": ent['entity_group']  # Disease, Symptom, Drug
        })

entities_df = pd.DataFrame(entities_list)
print(entities_df.head())


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


OSError: kamalkraj/bio-med-ner is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`