In [11]:
from gliner import GLiNER

import glob
import os
import pandas as pd

In [12]:
# Load the pretrained GLiNER checkpoint used for entity prediction below.
gliner_checkpoint = "urchade/gliner_multi"
model = GLiNER.from_pretrained(gliner_checkpoint)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 50686.45it/s]


In [13]:
# Locate the news data: every *.parquet file directly inside news_folder.
news_folder = '/workspaces/ner_news_malay/model_gliner'
# glob.glob() returns matches in a filesystem-dependent order. The corpus order
# (and every character offset computed from it later) follows this list, so
# sort it to make re-runs reproducible across machines.
parquet_files = sorted(glob.glob(os.path.join(news_folder, '*.parquet')))

In [14]:
# Build the corpus: for each parquet file, append its non-null titles and then
# its non-null summaries, all lowercased.
# NOTE(review): lowercasing removes capitalization before the NER step --
# confirm this is intended.
corpus_text = []
for file_path in parquet_files:
    try:
        news_df = pd.read_parquet(file_path)
        for column in ('Title', 'Summary'):
            lowered = news_df[column].dropna().str.lower()
            corpus_text.extend(lowered.tolist())
    except Exception as exc:
        # Best-effort: report the failing file and move on to the next one.
        print(f"error processing {file_path}: {str(exc)}")

In [15]:
# Write the corpus to disk as UTF-8 text, one entry per line, then report
# how many entries were collected.
corpus_file = '/workspaces/ner_news_malay/model_gliner/malay_news_corpus.txt'

with open(corpus_file, mode='w', encoding='utf-8') as out_handle:
    out_handle.writelines(text + '\n' for text in corpus_text)
print(f"Corpus size: {len(corpus_text)} sentences")

Corpus size: 930 sentences


In [16]:
# Entity labels passed to the model for prediction.
labels = [
    "GPE", "PERSON", "ORG", "FAC",
    "MONEY", "NORP", "LOC", "PRODUCT",
    "EVENT", "PERCENT", "WORK_OF_ART", "TIME",
    "ORDINAL", "CARDINAL", "QUANTITY", "LAW",
]

In [17]:
# Run the model on each corpus entry separately and collect every predicted
# entity, shifting its start/end from entry-relative to corpus-wide offsets.
all_entities = []
current_position = 0

for sentence in corpus_text:
    # Whitespace-only entries are not sent to the model.
    if sentence.strip():
        for entity in model.predict_entities(sentence, labels):
            entity["start"] += current_position
            entity["end"] += current_position
            all_entities.append(entity)

    # Advance past this entry plus one extra character (presumably the '\n'
    # written after each entry in the corpus file), blank entries included.
    current_position += len(sentence) + 1

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [18]:
# Drop repeated entities, keeping the first occurrence of each
# (text, label, start, end) combination in its original position.
first_by_key = {}
for entity in all_entities:
    key = (entity["text"], entity["label"], entity["start"], entity["end"])
    # setdefault stores only the first entity seen for a key; dicts keep
    # insertion order, so the original ordering is preserved.
    first_by_key.setdefault(key, entity)
unique_entities = list(first_by_key.values())

In [19]:
# Print every unique entity as "<text> => <label>", one per line.
for entity in unique_entities:
    text, label = entity["text"], entity["label"]
    print(f"{text} => {label}")

velodrom => LOC
dungun => LOC
bulan depan => TIME
mesir => LOC
utusan malaysia => ORG
individu => PERSON
kereta => PRODUCT
lorong => LOC
motosikal => PRODUCT
guru => PERSON
gaji => MONEY
dua tahun => TIME
24 tahun => TIME
utusan malaysia => ORG
aidilfitri => EVENT
bas ekspres => PRODUCT
lama => QUANTITY
svitolina => PERSON
suami => PERSON
liverpool => ORG
kpt => ORG
pelajar => PERSON
program tvet => PRODUCT
jokowi => PERSON
pilihan raya => EVENT
bencana banjir => EVENT
kebakaran hutan => EVENT
kanada => GPE
utusan malaysia => ORG
utusan malaysia => ORG
old trafford => LOC
utusan malaysia => ORG
mac => TIME
utusan malaysia => ORG
lilibet => PERSON
petronas => ORG
tangki simpanan => PRODUCT
lng baharu => PRODUCT
utusan malaysia => ORG
kais pagi makan pagi => EVENT
2022 => TIME
wp => ORG
mat rempit => ORG
undang-undang ketat => LAW
79 => QUANTITY
murid => PERSON
sekolah cameroon => ORG
utusan malaysia => ORG
sultanah kelantan => ORG
cemar duli => PERSON
sumbangan => MONEY
ipk kelantan => 

In [20]:
# Turn the list of entity dicts into a DataFrame (one row per entity) and
# preview the first five rows.
df = pd.DataFrame(data=unique_entities)
df.head(n=5)

Unnamed: 0,start,end,text,label,score
0,0,8,velodrom,LOC,0.682346
1,9,15,dungun,LOC,0.551318
2,31,42,bulan depan,TIME,0.643896
3,75,80,mesir,LOC,0.814649
4,83,98,utusan malaysia,ORG,0.78222


In [24]:
# Summarize the entity DataFrame: row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5156 entries, 0 to 5155
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   start   5156 non-null   int64  
 1   end     5156 non-null   int64  
 2   text    5156 non-null   object 
 3   label   5156 non-null   object 
 4   score   5156 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 201.5+ KB


In [21]:
# Export the entity table to CSV in the working directory, without the index column.
results_path = "results_main.csv"
df.to_csv(results_path, index=False)