In [1]:
from gliner import GLiNER

import glob
import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face checkpoint id of the GLiNER model used for entity extraction.
MODEL_NAME = "urchade/gliner_multi"
model = GLiNER.from_pretrained(MODEL_NAME)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 34030.86it/s]


In [3]:
# Locate the news data: every *.parquet file directly inside the news folder.
news_folder = '/workspaces/ner_news_malay/model_gliner'
# glob.glob() yields paths in arbitrary (filesystem-dependent) order. The corpus
# is assembled in this order and the entity character offsets computed later are
# cumulative over it, so sort the paths to make the results reproducible.
parquet_files = sorted(glob.glob(os.path.join(news_folder, '*.parquet')))

In [4]:
# Build the corpus: for each parquet file, append its lower-cased, non-null
# 'Title' values followed by its lower-cased, non-null 'Summary' values.
corpus_text = []
for file_path in parquet_files:
    try:
        news_df = pd.read_parquet(file_path)
        for column in ('Title', 'Summary'):
            lowered = news_df[column].dropna().str.lower()
            corpus_text.extend(lowered.tolist())
    except Exception as e:
        # Best-effort: report the failing file and carry on with the next one.
        print(f"error processing {file_path}: {str(e)}")

In [5]:
# Persist the corpus as UTF-8 text, writing a newline after every entry.
corpus_file = '/workspaces/ner_news_malay/model_gliner/malay_news_corpus.txt'

with open(corpus_file, 'w', encoding='utf-8') as corpus_out:
    corpus_out.writelines(text + '\n' for text in corpus_text)
print(f"Corpus size: {len(corpus_text)} sentences")

Corpus size: 2252 sentences


In [6]:
# Entity types requested from the model (passed to predict_entities below).
labels = [
    "GPE", "PERSON", "ORG", "FAC",
    "MONEY", "NORP", "LOC", "PRODUCT",
    "EVENT", "PERCENT", "WORK_OF_ART", "TIME",
    "ORDINAL", "CARDINAL", "QUANTITY", "LAW",
]

In [7]:
# Run the model on each corpus entry separately and collect every predicted
# entity, with its start/end shifted from entry-local to corpus-wide offsets.
all_entities = []
current_position = 0

for sentence in corpus_text:
    # Whitespace-only entries are not sent to the model, but they still
    # advance the running offset below.
    if sentence.strip():
        for span in model.predict_entities(sentence, labels):
            span["start"] += current_position
            span["end"] += current_position
            all_entities.append(span)

    # +1 accounts for the single newline written after each entry when the
    # corpus file was saved.
    current_position += len(sentence) + 1

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
# Drop repeated entities, keeping the first occurrence of each
# (text, label, start, end) combination in its original order.
seen = set()
unique_entities = []
for entity in all_entities:
    key = tuple(entity[field] for field in ("text", "label", "start", "end"))
    if key in seen:
        continue
    seen.add(key)
    unique_entities.append(entity)

In [9]:
# Tabulate the de-duplicated entities (one row per entity dict) and preview
# the first five rows.
df = pd.DataFrame(data=unique_entities)
df.head(n=5)

Unnamed: 0,start,end,text,label,score
0,0,13,operasi kesan,EVENT,0.630561
1,36,42,ec120b,PRODUCT,0.817231
2,62,67,ipcmc,ORG,0.962938
3,110,125,utusan malaysia,ORG,0.784653
4,170,180,ezad lazim,PERSON,0.918969


In [10]:
# Summarise the entity table: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15926 entries, 0 to 15925
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   start   15926 non-null  int64  
 1   end     15926 non-null  int64  
 2   text    15926 non-null  object 
 3   label   15926 non-null  object 
 4   score   15926 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 622.2+ KB


In [11]:
# Export the entity table as CSV (relative to the working directory),
# leaving out the DataFrame's row index.
RESULTS_CSV = "results_main.csv"
df.to_csv(RESULTS_CSV, index=False)