In [1]:
import glob
import os
import re
from collections import Counter, defaultdict

import pandas as pd
from gliner import GLiNER

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained multilingual GLiNER checkpoint used for entity extraction
MODEL_NAME = "urchade/gliner_multi"
model = GLiNER.from_pretrained(MODEL_NAME)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 43351.98it/s]


In [3]:
# Locate every parquet file inside the scraped-news folder
news_folder = '/workspaces/ner_news_malay/scraper/news_id'
parquet_pattern = os.path.join(news_folder, '*.parquet')
parquet_files = glob.glob(parquet_pattern)

# Report how many files the glob matched
print(f"found {len(parquet_files)} parquet files")

found 1 parquet files


In [4]:
# Build the corpus: lower-cased, non-null 'Title' values followed by
# 'Summary' values, file by file
corpus_text = []
for file_path in parquet_files:
    try:
        df = pd.read_parquet(file_path)
        # Columns are handled in order; if a later column fails, rows already
        # appended from an earlier column of the same file are kept.
        for column in ('Title', 'Summary'):
            column_values = df[column].dropna().str.lower().tolist()
            corpus_text.extend(column_values)
    except Exception as e:
        # Best-effort: report the failing file and carry on with the rest
        print(f"error processing {file_path}: {str(e)}")

In [5]:
# Persist the corpus as UTF-8 text, one entry per line
corpus_file = '/workspaces/ner_news_malay/model_gliner/malay_news_corpus.txt'

with open(corpus_file, 'w', encoding='utf-8') as out:
    # Each entry is terminated with a newline
    out.writelines(sentence + '\n' for sentence in corpus_text)

print(f"corpus size: {len(corpus_text)} sentences")

corpus size: 8 sentences


In [None]:
# Run GLiNER over the saved corpus in overlapping word-window chunks, map the
# predicted entity offsets back onto the full corpus text, de-duplicate the
# entities found twice in overlapping regions, then print them with per-label
# counts.

# Read the corpus back from the exact path the save cell wrote to, so this
# cell does not depend on the kernel's current working directory.
with open(corpus_file, "r", encoding="utf-8") as file:
    text = file.read()

# Define target labels
labels = ["GPE","PERSON","ORG","FAC","MONEY","NORP","LOC","PRODUCT","EVENT",
          "PERCENT","WORK_OF_ART","TIME","ORDINAL","CARDINAL","QUANTITY","LAW"]

# Chunk processing parameters
CHUNK_SIZE = 300  # Max words (WORD_PATTERN matches) per chunk
OVERLAP = 50      # Words shared between consecutive chunks
WORD_PATTERN = r'\b\w+\b'  # Word boundary pattern

# The window advances by CHUNK_SIZE - OVERLAP words, so that must be positive.
if not 0 <= OVERLAP < CHUNK_SIZE:
    raise ValueError("OVERLAP must satisfy 0 <= OVERLAP < CHUNK_SIZE")

# Locate every word together with its character span in `text`. Keeping the
# spans lets each chunk be an exact slice of the original text, so entity
# offsets can be shifted back precisely instead of searching for a
# re-joined (punctuation-stripped) string that rarely matches.
word_spans = [match.span() for match in re.finditer(WORD_PATTERN, text)]
words = [text[span_start:span_end] for span_start, span_end in word_spans]
all_entities = []

# Process text in chunks with overlap
step = CHUNK_SIZE - OVERLAP
for start in range(0, len(word_spans), step):
    end = min(start + CHUNK_SIZE, len(word_spans))

    # Character range covered by words[start:end] in the original text.
    # NOTE(review): the slice keeps punctuation and newlines between words, so
    # the model may see more tokens than CHUNK_SIZE — confirm this stays
    # within the model's input limit.
    chunk_start_idx = word_spans[start][0]
    chunk_end_idx = word_spans[end - 1][1]
    chunk_text = text[chunk_start_idx:chunk_end_idx]

    # Process chunk with model
    entities = model.predict_entities(chunk_text, labels)

    # Entity offsets are relative to the chunk; shift them to the full text
    for entity in entities:
        entity["start"] += chunk_start_idx
        entity["end"] += chunk_start_idx
        all_entities.append(entity)

    # The final window reached the last word: stop here, otherwise the
    # remaining overlap-only windows would just re-scan the same tail.
    if end == len(word_spans):
        break

# Remove duplicates while preserving order
seen_entities = set()
unique_entities = []
for entity in all_entities:
    # An entity found in two overlapping chunks has the same absolute span
    # and label, so this key collapses it to a single entry.
    identifier = (entity["start"], entity["end"], entity["label"])
    if identifier not in seen_entities:
        seen_entities.add(identifier)
        unique_entities.append(entity)

# Sort entities by position in original text
unique_entities.sort(key=lambda x: x["start"])

# Print results
for entity in unique_entities:
    entity_text = text[entity["start"]:entity["end"]]
    print(f"{entity_text} => {entity['label']}")

# Print statistics
print(f"\nTotal entities detected: {len(unique_entities)}")
label_counts = Counter(entity["label"] for entity in unique_entities)

print("\nEntity counts by type:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
