# Natural Language Processing - Context-Free Grammar (CFG)
##### Dataset from Indonesian Treebank Corpus

This program made by Naufal Hilmiaji to accomplish my task in Natural Language Processing

In [0]:
import nltk
from nltk import Tree
import re
from collections import Counter
from nltk.grammar import CFG, Nonterminal
from nltk.parse.chart import BottomUpChartParser

In [0]:
import urllib.request  # untuk baca dari url raw github

dirty_corpus = []

for line in urllib.request.urlopen("https://raw.githubusercontent.com/famrashel/idn-treebank/master/Indonesian_Treebank.bracket"):
    dirty_corpus.append(line.decode('utf-8')) #append to list dirty_corpus[]

In [4]:
cleaned_corpus = [] #untuk menampung corpus yang sudah diubah

for x in range(200):
    fixed = ''.join(dirty_corpus[x]) #ubah list dirty_corpus[] jadi string fixed
    fixed = re.sub(r'\(([^()]*)\)', r'\1', fixed) #regex untuk menghapus kurung
    cleaned_corpus.append(fixed) #append string yg telah dihapus parentheses pada lexicon ke list cleaned_corpus[]
print(cleaned_corpus)

['(NP  (NN\t  Kera) (SBAR  (SC\t  untuk) (S  (NP-SBJ  *) (VP  (VB\t  amankan) (NP  (NN\t  pesta olahraga))))))\r\n', '(S        (NP-SBJ        (NNP\t        Pemerintah) (NNP\t        kota) (NNP\t        Delhi)) (VP        (VB\t        mengerahkan) (NP        (NN\t        monyet)) (SBAR        (SC\t        untuk) (S        (NP-SBJ        *) (VP        (VB\t        mengusir) (NP        (NP      (NN\t        monyet-monyet) (JJ\t        lain)) (SBAR        (SC\t        yang) (S        (NP-SBJ        *) (VP        (VB\t        berbadan) (ADJP        (RB\t        lebih) (JJ\t        kecil)))))) (PP        (IN\t        dari) (NP        (NN\t        arena) (NP      (NNP  Pesta Olahraga) (NNP\t        Persemakmuran)))))))) (Z\t        .))\r\n', '(S      (NP-SBJ      (CD\t      Beberapa) (NN\t      laporan)) (VP      (VB\t      menyebutkan) (SBAR      0 (S      (NP-SBJ-1    (QP      (RB\t      setidaknya) (CD\t      10)) (NN\t      monyet)) (VP      (VB\t      ditempatkan) (NP    *-1) (PP      (

## Extract Production Rules

In [5]:
prod = []
forLexicon = []

for i in range(len(cleaned_corpus)):
    t = Tree.fromstring(cleaned_corpus[i])
    forLexicon.append(t)
    prod.extend(t.productions())
print(prod)

[NP -> NN SBAR, NN -> 'Kera', SBAR -> SC S, SC -> 'untuk', S -> NP-SBJ VP, NP-SBJ -> '*', VP -> VB NP, VB -> 'amankan', NP -> NN, NN -> 'pesta' 'olahraga', S -> NP-SBJ VP Z, NP-SBJ -> NNP NNP NNP, NNP -> 'Pemerintah', NNP -> 'kota', NNP -> 'Delhi', VP -> VB NP SBAR, VB -> 'mengerahkan', NP -> NN, NN -> 'monyet', SBAR -> SC S, SC -> 'untuk', S -> NP-SBJ VP, NP-SBJ -> '*', VP -> VB NP PP, VB -> 'mengusir', NP -> NP SBAR, NP -> NN JJ, NN -> 'monyet-monyet', JJ -> 'lain', SBAR -> SC S, SC -> 'yang', S -> NP-SBJ VP, NP-SBJ -> '*', VP -> VB ADJP, VB -> 'berbadan', ADJP -> RB JJ, RB -> 'lebih', JJ -> 'kecil', PP -> IN NP, IN -> 'dari', NP -> NN NP, NN -> 'arena', NP -> NNP NNP, NNP -> 'Pesta' 'Olahraga', NNP -> 'Persemakmuran', Z -> '.', S -> NP-SBJ VP Z, NP-SBJ -> CD NN, CD -> 'Beberapa', NN -> 'laporan', VP -> VB SBAR, VB -> 'menyebutkan', SBAR -> '0' S, S -> NP-SBJ-1 VP, NP-SBJ-1 -> QP NN, QP -> RB CD, RB -> 'setidaknya', CD -> '10', NN -> 'monyet', VP -> VB NP PP, VB -> 'ditempatkan', NP 

## Top-10 Most Common Rules

In [6]:
data_set = Counter(prod)
print(data_set.most_common(10))

[(PP -> IN NP, 283), (S -> NP-SBJ VP, 199), (Z -> '.', 192), (SBAR -> SC S, 160), (NP-SBJ -> '*', 154), (VP -> VB NP, 143), (Z -> ',', 134), (NP -> NN, 120), (SC -> 'yang', 99), (VP -> MD VP, 85)]


## Generate Grammar CFG

In [7]:
prod = []
for i in range(len(cleaned_corpus)):
    t = Tree.fromstring(cleaned_corpus[i])
    prod.extend(t.productions())
grammar = CFG(Nonterminal('S'), prod)
print(grammar)

Grammar with 7705 productions (start state = S)
    NP -> NN SBAR
    NN -> 'Kera'
    SBAR -> SC S
    SC -> 'untuk'
    S -> NP-SBJ VP
    NP-SBJ -> '*'
    VP -> VB NP
    VB -> 'amankan'
    NP -> NN
    NN -> 'pesta' 'olahraga'
    S -> NP-SBJ VP Z
    NP-SBJ -> NNP NNP NNP
    NNP -> 'Pemerintah'
    NNP -> 'kota'
    NNP -> 'Delhi'
    VP -> VB NP SBAR
    VB -> 'mengerahkan'
    NP -> NN
    NN -> 'monyet'
    SBAR -> SC S
    SC -> 'untuk'
    S -> NP-SBJ VP
    NP-SBJ -> '*'
    VP -> VB NP PP
    VB -> 'mengusir'
    NP -> NP SBAR
    NP -> NN JJ
    NN -> 'monyet-monyet'
    JJ -> 'lain'
    SBAR -> SC S
    SC -> 'yang'
    S -> NP-SBJ VP
    NP-SBJ -> '*'
    VP -> VB ADJP
    VB -> 'berbadan'
    ADJP -> RB JJ
    RB -> 'lebih'
    JJ -> 'kecil'
    PP -> IN NP
    IN -> 'dari'
    NP -> NN NP
    NN -> 'arena'
    NP -> NNP NNP
    NNP -> 'Pesta' 'Olahraga'
    NNP -> 'Persemakmuran'
    Z -> '.'
    S -> NP-SBJ VP Z
    NP-SBJ -> CD NN
    CD -> 'Beberapa'
    NN -> 'l

## Parsing Example

In [8]:
#sentence = 'Binatang ini tidak bisa dibunuh karena masyarakat India menganggap mereka suci .'

parser = BottomUpChartParser(grammar)
sentence = 'Binatang ini tidak bisa dibunuh karena masyarakat India menganggap mereka suci .'.split()
parsed = list(parser.parse(sentence))
print(parsed[0])

(S
  (NP-SBJ
    (NP
      (NN Binatang)
      (S-NOM
        (NP-SBJ (PR ini))
        (VP (NEG tidak) (MD bisa) (VP (VB dibunuh)))))
    (PP
      (IN karena)
      (S-NOM
        (NP-SBJ (NP (NN masyarakat)) (NP (NNP India)))
        (VP (VB menganggap)))))
  (NP-PRD (NP (NP (PRP mereka)) (JJ suci)) (NP (NNP .))))


## Top-50 Most Common Lexicons (10 poin)

In [9]:
is_lexicon = []
isnt_lexicon = []

for i in range(len(prod)):
    if prod[i].is_lexical() == True:
        is_lexicon.append(prod[i])
    else:
        isnt_lexicon.append(prod[i])

data_set = Counter(is_lexicon)
print(data_set.most_common(50))

[(Z -> '.', 192), (NP-SBJ -> '*', 154), (Z -> ',', 134), (SC -> 'yang', 99), (IN -> 'di', 81), (CC -> 'dan', 70), (NP -> '*-1', 69), (Z -> '"', 68), (PRP -> 'nya', 61), (SBAR -> '0' S, 59), (IN -> 'dari', 40), (PR -> 'itu', 37), (SC -> 'untuk', 35), (MD -> 'akan', 33), (PR -> 'ini', 31), (NP-SBJ-1 -> '*', 27), (IN -> 'dengan', 23), (NP -> '*-2', 23), (VB -> 'mengatakan', 23), (IN -> 'dalam', 21), (NP-SBJ -> '*-1', 20), (PRP -> 'mereka', 20), (RB -> 'lebih', 19), (IN -> 'untuk', 18), (NN -> 'tahun', 18), (NEG -> 'tidak', 17), (S -> '*T*-1', 16), (NN -> 'orang', 16), (PRP -> 'dia', 16), (VB -> 'kata', 14), (IN -> 'pada', 14), (IN -> 'oleh', 14), (IN -> 'kepada', 13), (NP-SBJ-2 -> '*', 13), (NN -> 'monyet', 12), (IN -> 'sebagai', 12), (CD -> 'banyak', 11), (VB -> 'menjadi', 11), (NNP -> 'The', 11), (VB -> 'merokok', 11), (MD -> 'bisa', 10), (IN -> 'ke', 10), (Z -> '-', 10), (NN -> 'parlemen', 9), (VB -> 'adalah', 9), (SC -> 'bahwa', 9), (VB -> 'meraih', 9), (NN -> 'anggur', 9), (CD -> 'du

## Sentences and Parsing

In [10]:
lexicons = [] #list seluruh lexicon

for i in range(len(forLexicon)):
    for j in forLexicon[i].subtrees(lambda x: x.height() == 2):
        lexicons.append(j[0])
    
print(lexicons)

['Kera', 'untuk', '*', 'amankan', 'pesta', 'Pemerintah', 'kota', 'Delhi', 'mengerahkan', 'monyet', 'untuk', '*', 'mengusir', 'monyet-monyet', 'lain', 'yang', '*', 'berbadan', 'lebih', 'kecil', 'dari', 'arena', 'Pesta', 'Persemakmuran', '.', 'Beberapa', 'laporan', 'menyebutkan', 'setidaknya', '10', 'monyet', 'ditempatkan', '*-1', 'di', 'luar', 'arena', 'lomba', 'dan', 'pertandingan', 'di', 'ibukota', 'India', '.', 'Pemkot', 'Delhi', 'memiliki', '28', 'monyet', 'dan', 'berencana', '*-1', 'mendatangkan', '10', 'monyet', 'sejenis', 'dari', 'negara', 'Rajasthan', '.', 'Jumlah', 'monyet', 'di', 'ibukota', 'India', 'mencapai', 'ribuan', ',', 'sebagian', 'berada', 'di', 'kantor-kantor', 'pemerintah', 'dan', 'hewan', 'ini', 'dianggap', '*-1', 'mengganggu', 'ketertiban', 'umum', '.', 'Jenis', 'monyet', 'yang', '*', 'dikerahkan', '*-1', 'pemkot', 'berbadan', 'besar', ',', 'berekor', 'panjang', ',', 'dan', 'memiliki', 'wajah', '*', 'berwarna', 'hitam', '.', 'Monyet', 'ini', 'diikat', '*-1', 'denga

In [0]:
sent1 = 'Pemkot Delhi mengganggu warga Delhi dengan monyet .'
sent2 = 'Bill Gates meraup kekayaan dari industri bangunan baru .'
sent3 = 'Simon Clark , Presiden Amerika melakukan kejahatan di India .'
sent4 = 'Zinedine Zidane mengaku pernah menjadi pemain sepak bola di Spanyol .'
sent5 = 'Indonesia akan menjadi rumah bagi investor asing dari Cina .'

In [12]:
# Sentence 1
# Pemkot Delhi mengganggu warga Delhi dengan monyet .

parser = BottomUpChartParser(grammar)
sentence = sent1.split()
parsed = list(parser.parse(sentence))
print(parsed[0])

(S
  (NP-SBJ
    (NP (NN Pemkot))
    (S
      (S-SBJ (NP-SBJ (NNP Delhi)) (VP (VB mengganggu)))
      (NP-PRD (NP (NN warga)) (NP (NNP Delhi)))))
  (NP-PRD (NP (QP (IN dengan) (NN monyet))) (NP (NNP .))))


In [14]:
# Sentence 2
# Bill Gates meraup kekayaan dari industri bangunan baru .

parser = BottomUpChartParser(grammar)
sentence = sent2.split()
parsed = list(parser.parse(sentence))
print(parsed[0])

(S
  (NP-SBJ
    (NP (NNP Bill))
    (S
      (S-SBJ (NP-SBJ (NNP Gates)) (VP (VB meraup)))
      (NP-PRD
        (NP (NN kekayaan))
        (PP (IN dari) (NP-LGS (NN industri))))))
  (NP-1 (NN bangunan))
  (VP (MD baru) (FRAG (NP (NNP .)))))


In [13]:
# Sentence 3
# Simon Clark , Presiden Amerika melakukan kejahatan di India .

parser = BottomUpChartParser(grammar)
sentence = sent3.split()
parsed = list(parser.parse(sentence))
print(parsed[0])

(S
  (NP-SBJ
    (NP (NNP Simon))
    (S
      (NP-SBJ (NNP Clark))
      (Z ,)
      (S-PRD
        (NP-SBJ (NP (NN Presiden)) (NP (NNP Amerika)))
        (VP (VB melakukan)))))
  (NP-PRD
    (NP (NP (NN kejahatan)) (PP (IN di) (NP (NNP India))))
    (NP (NNP .))))


In [15]:
# Sentence 4
# Zinedine Zidane mengaku pernah menjadi pemain sepak bola di Spanyol .

parser = BottomUpChartParser(grammar)
sentence = sent4.split()
parsed = list(parser.parse(sentence))
print(parsed[0])

(S
  (NP-SBJ-1 (NP (NNP Zinedine)) (NP (NNP Zidane)))
  (VP
    (VB mengaku)
    (SINV
      (VP
        (VP (MD pernah))
        (VP
          (VB menjadi)
          (S
            (NP-SBJ (NP (NN pemain)) (NP (NN sepak bola)))
            (PP (IN di) (NP (NNP Spanyol))))))
      (NP-SBJ (NNP .)))))


In [18]:
# Sentence 5

parser = BottomUpChartParser(grammar)
sentence = sent5.split()
parsed = list(parser.parse(sentence))
print(parsed[0])

(S
  (S-SBJ
    (NP-SBJ (NNP Indonesia))
    (VP (VP (MD akan)) (VP (VB menjadi))))
  (NP-PRD
    (NP (NN rumah))
    (PP
      (IN bagi)
      (NP-LGS
        (NP
          (NP (NN investor) (ADJP (JJ asing)))
          (PP (IN dari) (NP (NNP Cina))))
        (NP (NNP .))))))


This program made by Naufal Hilmiaji to accomplish my task in Natural Language Processing.
You are allowed to clone or download. Please use wisely!