실제 파이프라인 예시 (precision 지향형)

In [4]:
from transformers import pipeline

# 1️⃣ NER 파이프라인 로드
ner_pipeline = pipeline(
    "ner",
    model="d4data/biomedical-ner-all",
    aggregation_strategy="simple"
)

# 2️⃣ 예시 문장
text = "Patients were treated with nivolumab and pembrolizumab in this study."

# 3️⃣ 추출 결과 확인
entities = ner_pipeline(text)
print(entities)


Device set to use cuda:0


[{'entity_group': 'Medication', 'score': np.float32(0.99990416), 'word': 'ni', 'start': 27, 'end': 29}, {'entity_group': 'Medication', 'score': np.float32(0.9997353), 'word': '##vo', 'start': 29, 'end': 31}, {'entity_group': 'Medication', 'score': np.float32(0.9989798), 'word': '##lumab', 'start': 31, 'end': 36}, {'entity_group': 'Medication', 'score': np.float32(0.99985707), 'word': 'pe', 'start': 41, 'end': 43}, {'entity_group': 'Medication', 'score': np.float32(0.9726839), 'word': '##mbrolizumab', 'start': 43, 'end': 54}]


In [5]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# PDF 로드
loader = PyMuPDFLoader("/data1/workspace/pdfs/1058.full.pdf")
docs = loader.load()
print(f"총 {len(docs)} 페이지 로드됨")

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = text_splitter.split_documents(docs)
print(f"분할된 chunk 수: {len(split_docs)}")
print(split_docs[0].page_content[:300])  # 첫 chunk 일부 미리보기


총 6 페이지 로드됨
분할된 chunk 수: 60
Abstract. Background/Aim: Colorectal cancer (CRC) is the
third-leading cause of death in the world. Although the
prognosis has improved due to improvement of chemotherapy,
metastatic CRC is still a recalcitrant disease, with a 5-year
survival of only 13%. Irinotecan (IRN) is used as first-line
chemo


In [6]:
def extract_drugs(text):
    """텍스트에서 약물명만 추출"""
    ents = ner_pipeline(text)
    drugs = [e['word'] for e in ents if e['entity_group'].lower() == 'drug']
    return list(set(drugs))  # 중복 제거

for i, doc in enumerate(split_docs[:5]):  # 처음 5개만 예시 출력
    drugs = extract_drugs(doc.page_content)
    doc.metadata["ner_drugs"] = drugs
    print(f"Chunk {i} → {drugs}")


Chunk 0 → []
Chunk 1 → []
Chunk 2 → []
Chunk 3 → []
Chunk 4 → []


In [7]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

model_name = "Jean-Baptiste/roberta-large-ner-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [8]:
from transformers import pipeline

ner_pipe = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
text = "Lapatinib and trastuzumab are used in breast cancer treatment."
result = ner_pipe(text)
print(result)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'entity_group': 'MISC', 'score': np.float32(0.6981254), 'word': 'La', 'start': 0, 'end': 2}]


In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "Jean-Baptiste/roberta-large-ner-english"
cache_dir = "/data1/workspace/hf_cache"  # user1 권한 있는 경로

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
model = AutoModelForTokenClassification.from_pretrained(model_name, cache_dir=cache_dir, device_map="cpu")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import pipeline

ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0)  # GPU 0번 사용
text = "Lapatinib and trastuzumab are used in breast cancer treatment."
result = ner_pipe(text)
print(result)


ValueError: The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please discard the `device` argument when creating your pipeline object.

In [3]:
from transformers import pipeline

ner_pipe = pipeline(
    "ner",
    model=model,         # 이미 accelerate로 로드됨
    tokenizer=tokenizer,
    aggregation_strategy="simple"
    # device 인자 제거!
)


Device set to use cpu


In [4]:
from transformers import pipeline

ner_pipe = pipeline(
    "ner",
    model=model,        # 이미 accelerate로 GPU/CPU 배치됨
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)


Device set to use cpu


In [5]:
from transformers import pipeline

# model과 tokenizer는 device_map="auto"로 로드 완료
ner_pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

# text 분석
text = "Lapatinib and trastuzumab are used in breast cancer treatment."
result = ner_pipe(text)
print(result)

Device set to use cpu


[]


In [6]:
from transformers import pipeline

ner_pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="none"  # 토큰 단위 결과 확인
)

text = "Lapatinib and trastuzumab are used in breast cancer treatment."
result = ner_pipe(text)
print(result)


Device set to use cpu


[]


In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# 모델 로드
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# NER pipeline
ner_pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

text = "Lapatinib and trastuzumab are used in breast cancer treatment."
result = ner_pipe(text)
print(result)


Device set to use cuda:0


[{'entity_group': 'Medication', 'score': 0.91673374, 'word': 'lapatinib', 'start': 0, 'end': 9}, {'entity_group': 'Medication', 'score': 0.99988055, 'word': 'tr', 'start': 14, 'end': 16}, {'entity_group': 'Medication', 'score': 0.9977405, 'word': '##ast', 'start': 16, 'end': 19}, {'entity_group': 'Medication', 'score': 0.89958596, 'word': '##uzumab', 'start': 19, 'end': 25}, {'entity_group': 'Biological_structure', 'score': 0.9155983, 'word': 'breast', 'start': 38, 'end': 44}]


In [8]:
def merge_wordpieces(ner_results):
    drugs = []
    current_word = ""
    for token in ner_results:
        if token['entity_group'] == 'Medication':
            if token['word'].startswith("##"):
                current_word += token['word'][2:]
            else:
                if current_word:
                    drugs.append(current_word)
                current_word = token['word']
    if current_word:
        drugs.append(current_word)
    return drugs

merged_drugs = merge_wordpieces(result)
print(merged_drugs)  # ['lapatinib', 'trastuzumab']


['lapatinib', 'trastuzumab']
