## Import Library

In [1]:
import pandas as pd

from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification




## Dataset Information

In [2]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,judul,asal,text
0,Legenda Danau Toba,Sumatera Utara,"Di sebuah desa di wilayah Sumatera, hidup seor..."
1,Puteri Hijau,Sumatera Utara,"Sekitar abad 14 dan 15 masehi , berdiri keraja..."
2,Batu Gantung Danau Toba,Sumatera Utara,"Pada zaman dahulu kala, terdapat satu keluarga..."
3,Sampuraga Si Anak Durhaka,Sumatera Utara,"Pada zaman dulu, hiduplah seorang janda tua da..."
4,Si Buyung Besar,Sumatera Utara,Pada zaman dahulu kala penduduk pantai pun mas...


In [3]:
df = df[['judul', 'asal', 'text']]
df.head()

Unnamed: 0,judul,asal,text
0,Legenda Danau Toba,Sumatera Utara,"Di sebuah desa di wilayah Sumatera, hidup seor..."
1,Puteri Hijau,Sumatera Utara,"Sekitar abad 14 dan 15 masehi , berdiri keraja..."
2,Batu Gantung Danau Toba,Sumatera Utara,"Pada zaman dahulu kala, terdapat satu keluarga..."
3,Sampuraga Si Anak Durhaka,Sumatera Utara,"Pada zaman dulu, hiduplah seorang janda tua da..."
4,Si Buyung Besar,Sumatera Utara,Pada zaman dahulu kala penduduk pantai pun mas...


## Implementasi NER

In [4]:
pipe = pipeline("token-classification", model="cahya/xlm-roberta-large-indonesian-NER")
tokenizer = AutoTokenizer.from_pretrained("cahya/xlm-roberta-large-indonesian-NER")
model = AutoModelForTokenClassification.from_pretrained("cahya/xlm-roberta-large-indonesian-NER")

Some weights of the model checkpoint at cahya/xlm-roberta-large-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cahya/xlm-roberta-large-indonesian-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the chec

In [5]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [6]:
df.shape

(220, 3)

### Fungsi

In [7]:
manual_entities = {
    "Petani": "PER",
    "Puteri": "PER",
    "Sang": "PER",
    "Si": "PER",
    "Raja": "PER",
    "Ratu": "PER",
    "Pangeran": "PER",
    "Ayah": "PER",
    "Ibu": "PER",
}

def split_text(text, tokenizer, max_length=512):
    tokens = tokenizer.tokenize(text)
    token_chunks = [tokens[i:i+max_length] for i in range(0, len(tokens), max_length)]
    text_chunks = [tokenizer.convert_tokens_to_string(chunk) for chunk in token_chunks]
    return text_chunks

def apply_ner(text):
    text_chunks = split_text(text, tokenizer)
    results = []
    for chunk in text_chunks:
        ner_results = nlp(chunk)
        results.extend(ner_results)
    return results

def add_manual_entities(text, ner_results, manual_entities):
    words = text.split()
    for i, word in enumerate(words):
        if word in manual_entities:
            entity = {
                'word': word,
                'entity_group': manual_entities[word],
                'start': text.find(word),
                'end': text.find(word) + len(word)
            }
            ner_results.append(entity)
    return ner_results

In [8]:
df['ner_results'] = df['text'].apply(lambda x: add_manual_entities(x, apply_ner(x), manual_entities))
df.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (931 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,judul,asal,text,ner_results
0,Legenda Danau Toba,Sumatera Utara,"Di sebuah desa di wilayah Sumatera, hidup seor...","[{'entity_group': 'GPE', 'score': 0.9980665, '..."
1,Puteri Hijau,Sumatera Utara,"Sekitar abad 14 dan 15 masehi , berdiri keraja...","[{'entity_group': 'DAT', 'score': 0.68840194, ..."
2,Batu Gantung Danau Toba,Sumatera Utara,"Pada zaman dahulu kala, terdapat satu keluarga...","[{'entity_group': 'CRD', 'score': 0.9936027, '..."
3,Sampuraga Si Anak Durhaka,Sumatera Utara,"Pada zaman dulu, hiduplah seorang janda tua da...","[{'entity_group': 'PER', 'score': 0.9767194, '..."
4,Si Buyung Besar,Sumatera Utara,Pada zaman dahulu kala penduduk pantai pun mas...,"[{'entity_group': 'CRD', 'score': 0.74330723, ..."


In [9]:
df.shape

(220, 4)

In [10]:
def find_words_with_label(ner_results, label):
    return list(set(ent['word'] for ent in ner_results if ent['entity_group'] == label))

In [11]:
df['persons'] = df['ner_results'].apply(lambda x: find_words_with_label(x, 'PER'))

In [12]:
df.head(50)

Unnamed: 0,judul,asal,text,ner_results,persons
0,Legenda Danau Toba,Sumatera Utara,"Di sebuah desa di wilayah Sumatera, hidup seor...","[{'entity_group': 'GPE', 'score': 0.9980665, '...","[Pu, Kanda, teri, Petani, ni, Putera, Peta, Pu..."
1,Puteri Hijau,Sumatera Utara,"Sekitar abad 14 dan 15 masehi , berdiri keraja...","[{'entity_group': 'DAT', 'score': 0.68840194, ...","[ri Hij, Mambang Jazid, jazid, putri hijau, Su..."
2,Batu Gantung Danau Toba,Sumatera Utara,"Pada zaman dahulu kala, terdapat satu keluarga...","[{'entity_group': 'CRD', 'score': 0.9936027, '...","[Toki, Sang, uni, Ayah, Ser, seruni, Seruni]"
3,Sampuraga Si Anak Durhaka,Sumatera Utara,"Pada zaman dulu, hiduplah seorang janda tua da...","[{'entity_group': 'PER', 'score': 0.9767194, '...","[Sampura, Ibu, Sang, sampura, sampuraga, Sampu..."
4,Si Buyung Besar,Sumatera Utara,Pada zaman dahulu kala penduduk pantai pun mas...,"[{'entity_group': 'CRD', 'score': 0.74330723, ...","[Buyung, Sang, si Buyung Besar, Datuk, Bu, put..."
5,Jibau Malang,Sumatera Utara,Tersebutlah kisah di zaman dahulu kala di daer...,"[{'entity_group': 'LOC', 'score': 0.896115, 'w...","[si Jibau, si Nogong, bunda Siti Ensah, Datuk ..."
6,Tuah Burung Merbak,Sumatera Utara,"Tersebutlah kisah di zaman dahulukala, daerah ...","[{'entity_group': 'GPE', 'score': 0.93900275, ...","[Pu, Wak Pawang Merbuk, Wak Pawang Berbuk, Ibu..."
7,Partiga Tiga Sipunjung Dan Anggaranim,Sumatera Utara,Menurut yang empunya cerita kisah ini terjadi ...,"[{'entity_group': 'GPE', 'score': 0.99705803, ...","[Sang, PARTIGA TIGA SIPUNJUNG, Si, NAN SORMA, ..."
8,Cerita Raja Narasaon,Sumatera Utara,"Kira-kira pada abad ke-14 di Sibisa, sebuah ka...","[{'entity_group': 'ORD', 'score': 0.52149594, ...","[Datu Pejek, Siboru Bidinglaut, Narasaon, Sang..."
9,Sipakpak Kunal Dan Nagai Sori,Sumatera Utara,Pulau Raja ialah sebuah kota kecil di kabupate...,"[{'entity_group': 'GPE', 'score': 0.78550386, ...","[Margolang, Sipakpak Kunal, Raja Lela, Nagai S..."


In [13]:
df.to_csv('ner_results2.csv', index=False)