In [12]:
from gliner import GLiNER

import glob
import os
import pandas as pd

In [13]:
# Load the pretrained "urchade/gliner_multi" GLiNER checkpoint used for all predictions below.
# NOTE(review): presumably downloaded from the Hugging Face Hub on first use; no revision is pinned — confirm.
model = GLiNER.from_pretrained("urchade/gliner_multi")

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 46474.28it/s]


In [14]:
# Locate the news data (parquet files).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
news_folder = '/workspaces/ner_news_malay/model_gliner'
# glob returns matches in a filesystem-dependent order; sort them so the corpus
# order (and the character offsets computed from it later) is reproducible.
parquet_files = sorted(glob.glob(os.path.join(news_folder, '*.parquet')))

In [15]:
# Build the corpus: lower-cased titles and summaries from every parquet file.
corpus_text = []
for file_path in parquet_files:
    try:
        df = pd.read_parquet(file_path)
        # Per file: all titles first, then all summaries; missing values are dropped.
        for column in ('Title', 'Summary'):
            corpus_text.extend(df[column].dropna().str.lower().tolist())
    except Exception as e:
        # Best effort: report the problematic file and continue with the next one.
        print(f"error processing {file_path}: {str(e)}")

In [16]:
# Persist the corpus as a UTF-8 text file, one text per line.
corpus_file = '/workspaces/ner_news_malay/model_gliner/malay_news_corpus.txt'

with open(corpus_file, mode='w', encoding='utf-8') as f:
    # Each entry is terminated by a single newline.
    f.writelines(text + '\n' for text in corpus_text)
print(f"Corpus size: {len(corpus_text)} sentences")

Corpus size: 2040 sentences


In [17]:
# Entity types to extract; this list is passed to model.predict_entities.
labels = [
    "GPE", "PERSON", "ORG", "FAC",
    "MONEY", "NORP", "LOC", "PRODUCT",
    "EVENT", "PERCENT", "WORK_OF_ART", "TIME",
    "ORDINAL", "CARDINAL", "QUANTITY", "LAW",
]

In [None]:
# Run the model on each corpus entry and collect entities with corpus-wide offsets.
all_entities = []
current_position = 0  # offset of the current entry within the concatenated corpus

for sentence in corpus_text:
    # Blank / whitespace-only entries are not sent to the model,
    # but they still advance the running offset below.
    if sentence.strip():
        entities = model.predict_entities(sentence, labels)

        # Shift the per-entry offsets so they index into the whole corpus.
        for entity in entities:
            entity.update(
                start=entity["start"] + current_position,
                end=entity["end"] + current_position,
            )
        all_entities.extend(entities)

    # +1 accounts for one separator character after every entry
    # (the saved corpus file ends each entry with a newline).
    current_position += len(sentence) + 1

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.




In [None]:
# Drop duplicate entities, keeping the first occurrence of each
# (text, label, start, end) combination and preserving the original order.
seen = set()
unique_entities = []
for entity in all_entities:
    identifier = tuple(entity[key] for key in ("text", "label", "start", "end"))
    if identifier in seen:
        continue
    seen.add(identifier)
    unique_entities.append(entity)

In [None]:
# Convert the de-duplicated entity dicts into a DataFrame, one row per entity.
# NOTE(review): `df` was also used for the per-file frames in the extraction loop;
# this rebinds the name to the entity table.
df = pd.DataFrame(unique_entities)
# Preview the first rows.
df.head()

Unnamed: 0,start,end,text,label,score
0,6,9,jkm,ORG,0.951072
1,60,75,utusan malaysia,ORG,0.684363
2,76,82,soniia,PERSON,0.960096
3,94,105,bukit kiara,LOC,0.654027
4,108,123,utusan malaysia,LOC,0.667498


In [None]:
# Summarize the entity table: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10930 entries, 0 to 10929
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   start   10930 non-null  int64  
 1   end     10930 non-null  int64  
 2   text    10930 non-null  object 
 3   label   10930 non-null  object 
 4   score   10930 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 427.1+ KB


In [None]:
# Save the entity table as CSV (relative path, so it lands in the current working directory).
# index=False leaves the DataFrame's row index out of the file.
df.to_csv("results_main.csv", index=False)