In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Training Model

In [2]:
import pickle
import spacy
import random
from spacy.util import minibatch, compounding
from spacy import load, displacy

In [3]:
# load datasets 

with open('/content/drive/MyDrive/CSIT_Internship/ner_spacy_fmt_datasets.pickle', 'rb') as f:
    ner_spacy_fmt_datasets = pickle.load(f)

In [4]:
nlp=spacy.blank("id")

In [5]:
nlp.add_pipe('ner')

<spacy.pipeline.ner.EntityRecognizer at 0x7fa7415f33d0>

In [6]:
nlp.begin_training()

<thinc.optimizers.Optimizer at 0x7fa73f3d8940>

In [7]:
import random
from spacy.util import minibatch, compounding

In [8]:
ner=nlp.get_pipe("ner")

In [9]:
for _, annotations in ner_spacy_fmt_datasets:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])
        break

In [10]:
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [53]:
# TRAINING MODEL
from spacy.training import Example
examples = []
for text, annots in ner_spacy_fmt_datasets:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))
nlp.initialize(lambda: examples)
for i in range(6):
    random.shuffle(examples)
    for batch in minibatch(examples, size=8):
        nlp.update(batch)

In [54]:
# save model 
from pathlib import Path

output_dir = Path('/content/drive/MyDrive/CSIT_Internship/nlp_id_checkpoint_2022_09_22_06')
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to /content/drive/MyDrive/CSIT_Internship/nlp_id_checkpoint_2022_09_22_06


## Evaluating Model

In [55]:
# load existing model 
output_dir = '/content/drive/MyDrive/CSIT_Internship/nlp_id_checkpoint_2022_09_22_06'
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)

Loading from /content/drive/MyDrive/CSIT_Internship/nlp_id_checkpoint_2022_09_22_06


In [56]:
f = open("/content/drive/MyDrive/CSIT_Internship/data/indotest.txt")
toPredict = ''
gold_words = []
gold_ents = []
for line in f:
  if line!="\n":
    #print(line)
    word, tag = line.split()
    #print(word)
    gold_words.append(word)
    newtag=tag[2:]
    if newtag=="PLACE":
      newtag="LOCATION"
      newtag=tag[0:2]+newtag
      gold_ents.append(newtag)
    elif newtag=="ORGANISATION":
      newtag="ORGANIZATION"
      newtag=tag[0:2]+newtag
      gold_ents.append(newtag)
    else:
      gold_ents.append(tag)
    toPredict=toPredict+" "+word
f.close()
toPredict=toPredict[1:]
print((gold_ents))
print(len(gold_ents))

['B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'O', 'O', 'O', 'B-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

In [57]:
# test 
doc = nlp_updated(toPredict)
first_ents=[]
first_words=[]
pred_words=[]
pred_ents=[]
#print(doc.ents)
print(len(toPredict.split()))
num=0
for ent in doc.ents:
  for i in range(len(ent.text.split())):
    if i==0:
      first_ents.append("B-"+ent.label_)
      first_words.append(ent.text.split()[i])
    else:
      first_ents.append("I-"+ent.label_)
      first_words.append(ent.text.split()[i])
wordsInText=toPredict.split()
for word in wordsInText:
  if num<len(first_words):
    if word!=first_words[num]:
      pred_words.append(word)
      pred_ents.append("O")
    else:
      pred_words.append(first_words[num])
      if first_ents[num]=="B-TIME" or first_ents[num]=="I-TIME" or first_ents[num]=="B-QUANTITY" or first_ents[num]=="I-QUANTITY":
        pred_ents.append("O")
      else:
        pred_ents.append(first_ents[num])
      num+=1
  else:
    pred_words.append(word)
    pred_ents.append("O")
      

    
print(pred_words)
print(len(pred_words))
print(pred_ents)
print(len(pred_ents))
#print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

6854
['Joetata', 'Hadihardaja', 'dan', 'dihadiri', 'oleh', 'Rektor', 'Undip', 'Prof', '.', 'Sejak', 'masih', 'duduk', 'di', 'bangku', 'sekolah', 'TK', 'Kevin', 'sudah', 'belajar', 'alat', 'musik', 'piano', 'secara', 'formal', 'dan', 'ketika', 'ia', 'menginjak', 'sekolah', 'SMP', 'pemilik', 'nama', 'asli', 'Kevin', 'Aprilio', 'Sumaatmaja', 'ini', ',', 'mulai', 'belajar', 'menulis', 'lagu', 'sendiri', '.', 'Pada', 'tanggal', '6', 'Februari', '1976', ',', 'wakil', 'ketua', 'Lockheed', 'Corporation', 'memberitahu', 'subkomite', 'Senat', 'AS', 'bahwa', 'Tanaka', 'selaku', 'PM', 'telah', 'dibayar', '(', 'disogok', ')', 'sebagai', 'ganjaran', 'pembelian', 'pesawat', 'Lockheed', 'L', '-', '1011', '.', 'Dengan', 'kondisi', 'alam', 'yang', 'sejuk', 'dan', 'curah', 'hujan', 'yang', 'tinggi', 'maka', 'didaerah', 'tersebut', 'banyak', 'didapati', 'bermacam', 'jenis', 'Flora', 'dan', 'fauna', 'seperti', ':', 'Gajah', 'yang', 'di', 'kenal', 'dengan', 'legenda', 'Pocut', 'Meurahnya', ',', 'rusa', ',',

In [58]:
from spacy.tokens import Doc
from spacy.training import Example
predicted = Doc(nlp_updated.vocab, words=pred_words, ents=pred_ents)
reference = Doc(nlp_updated.vocab, words=gold_words, ents=gold_ents)
example = Example(predicted, reference)

In [59]:
from spacy.scorer import Scorer

scorer = Scorer()
scores = scorer.score([example])
print(scores)

{'token_acc': None, 'token_p': None, 'token_r': None, 'token_f': None, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': 0.4225352112676056, 'ents_r': 0.30523255813953487, 'ents_f': 0.3544303797468354, 'ents_per_type': {'PERSON': {'p': 0.34146341463414637, 'r': 0.33816425120772947, 'f': 0.33980582524271846}, 'ORGANIZATION': {'p': 0.1523809523809524, 'r': 0.12903225806451613, 'f': 0.13973799126637554}, 'LOCATION': {'p': 0.6631016042780749, 'r': 0.3473389355742297, 'f': 0.4558823529411764}}, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}


# Testing Model by loading PDFs

In [18]:
!pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading PyPDF2-2.10.9-py3-none-any.whl (218 kB)
[K     |████████████████████████████████| 218 kB 26.9 MB/s 
Installing collected packages: PyPDF2
Successfully installed PyPDF2-2.10.9


In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk import stem
import os
import PyPDF2
from PyPDF2 import PdfFileReader
import numpy as np
from sklearn.feature_extraction import text
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [20]:
def pdf2text(pdf):
    '''Iterate over pages and extract text'''
    text = ''
    for i in range(pdf.getNumPages()):
        page = pdf.getPage(i)
        text = text + page.extractText()
    return text
 
def stem_tokenize(document):
    '''return stemmed words longer than 2 chars and all alpha'''
    tokens = [stem(w) for w in document.split() if len(w) > 2 and w.isalpha()]
    return tokens
 
def tokenize(document):
    '''return words longer than 2 chars and all alpha'''
    tokens = [w for w in document.split() if len(w) > 2 and w.isalpha()]
    return tokens
 
def build_corpus_from_dir(dir_path):
    corpus = []
    for root, dirs, filenames in os.walk(dir_path):
        for name in filenames:
            print(name)
            f  = os.path.join(root, name)
            pdf = PdfFileReader(f, 'rb')
            document = pdf2text(pdf)
            corpus.append(document)
    return corpus

In [26]:
corpus = build_corpus_from_dir('/content/drive/MyDrive/CSIT_Internship/indo_news')

indo_news1.pdf


In [27]:
test_text =''
for i in corpus:
  test_text=test_text+i
print(test_text)

Inggris dan dunia mengucapkan salam perpisahan yang terakhir kalinya kepada Ratu Elizabeth II di 
pemakaman kenegaraan, yang digelar Senin kemarin (19/09).  
 
Dihadiri para pemimpin dunia dan tentunya ratusan ribu warga di jalanan kota London, semua mata 
tertuju pada peti yang membawa jenazah mendiang Ratu Elizabeth II.  
 
Sebagai negara yang identik dengan kemewahan, pemakaman Ratu menjadi yang pemakaman 
kenegara an pertama kalinya dilakukan sejak pemakaman mantan perdana menteri Inggris Winston 
Churchill.  
 
Sebelum prosesi pemakaman, bel dibunyikan sebanyak 96 kali dalam setiap menit, menandakan usia 
Ratu Elizabeth II.  
 
Sebanyak 142 pelaut Angkatan Laut Kerajaan me narik kereta yang mengangkut peti mati ratu yang 
diselimuti bendera menuju Westminister Abbey.  
 
Di Westminister Abbey, 2.000 orang dari seluruh dunia, mulai dari pemimpin dunia hingga tenaga 
kesehatan berkumpul untuk menyaksikan upacara pemakaman.  
 
kartu d ari Raja Charles III yang ditulis tangannya send

In [28]:
doc = nlp_updated(test_text)
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)