# Natural Language Processing - Context-Free Grammar (CFG)
##### Dataset from Indonesian Treebank Corpus

This program was made by Naufal Hilmiaji to complete an assignment in Natural Language Processing.

In [None]:
import nltk
from nltk import Tree
import re
from collections import Counter
from nltk.grammar import CFG, Nonterminal
from nltk.parse.chart import BottomUpChartParser

In [None]:
import urllib.request  # used to read the corpus from the raw GitHub URL

# Download the bracketed treebank file and keep every decoded line (UTF-8)
# in dirty_corpus, one list entry per line of the file.
# The response is opened in a `with` block so the HTTP connection is closed
# even if decoding fails part-way through.
with urllib.request.urlopen("https://raw.githubusercontent.com/famrashel/idn-treebank/master/Indonesian_Treebank.bracket") as response:
    dirty_corpus = [line.decode('utf-8') for line in response]

In [None]:
# Build cleaned_corpus from the first 200 lines of dirty_corpus.
# Slicing (instead of indexing over range(200)) avoids an IndexError when
# fewer than 200 lines were downloaded.
# The regex removes each innermost pair of parentheses -- a pair with no
# nested parentheses inside it -- while keeping the text it enclosed.
cleaned_corpus = [
    re.sub(r'\(([^()]*)\)', r'\1', line)
    for line in dirty_corpus[:200]
]
print(cleaned_corpus)

## Extract Production Rules

In [None]:
# Parse every cleaned bracketed line into an nltk Tree. The trees are kept in
# forLexicon for later cells, and the productions of all trees are gathered,
# in corpus order, into prod.
forLexicon = [Tree.fromstring(bracketed) for bracketed in cleaned_corpus]
prod = [rule for tree in forLexicon for rule in tree.productions()]
print(prod)

## Top-10 Most Common Rules

In [None]:
# Tally how often each production occurs and show the ten most frequent.
data_set = Counter()
data_set.update(prod)
print(data_set.most_common(10))

## Generate Grammar CFG

In [None]:
# Re-collect the productions of every cleaned tree, then build a CFG from
# them with the nonterminal S as the start symbol.
prod = []
for bracketed in cleaned_corpus:
    prod += Tree.fromstring(bracketed).productions()

grammar = CFG(Nonterminal('S'), prod)
print(grammar)

## Parsing Example

In [None]:
# Parse one example sentence bottom-up with the grammar built above and
# print the first tree found.
parser = BottomUpChartParser(grammar)
sentence = 'Binatang ini tidak bisa dibunuh karena masyarakat India menganggap mereka suci .'.split()
parsed = list(parser.parse(sentence))

# The parser may yield no tree for a sentence; guard the lookup so the cell
# reports that instead of raising IndexError on parsed[0].
if parsed:
    print(parsed[0])
else:
    print('No parse found for:', ' '.join(sentence))

## Top-50 Most Common Lexicons (10 points)

In [None]:
# Partition the productions into those for which is_lexical() is true and
# all the others, then print the 50 most frequent lexical productions.
is_lexicon = [rule for rule in prod if rule.is_lexical()]
isnt_lexicon = [rule for rule in prod if not rule.is_lexical()]

data_set = Counter(is_lexicon)
print(data_set.most_common(50))

## Sentences and Parsing

In [None]:
# List of all lexicons: the first child of every height-2 subtree, taken
# across all parsed trees in corpus order.
lexicons = [
    subtree[0]
    for tree in forLexicon
    for subtree in tree.subtrees(lambda node: node.height() == 2)
]

print(lexicons)

In [None]:
# Five test sentences, already tokenised with single spaces (punctuation is
# its own token) so that str.split() yields the parser input.
sent1, sent2, sent3, sent4, sent5 = (
    'Pemkot Delhi mengganggu warga Delhi dengan monyet .',
    'Bill Gates meraup kekayaan dari industri bangunan baru .',
    'Simon Clark , Presiden Amerika melakukan kejahatan di India .',
    'Zinedine Zidane mengaku pernah menjadi pemain sepak bola di Spanyol .',
    'Indonesia akan menjadi rumah bagi investor asing dari Cina .',
)

In [None]:
# Sentence 1
# Pemkot Delhi mengganggu warga Delhi dengan monyet .

parser = BottomUpChartParser(grammar)
sentence = sent1.split()
parsed = list(parser.parse(sentence))

# The parser may yield no tree; guard the lookup so the cell reports that
# instead of raising IndexError on parsed[0].
if parsed:
    print(parsed[0])
else:
    print('No parse found for:', sent1)

In [None]:
# Sentence 2
# Bill Gates meraup kekayaan dari industri bangunan baru .

parser = BottomUpChartParser(grammar)
sentence = sent2.split()
parsed = list(parser.parse(sentence))

# The parser may yield no tree; guard the lookup so the cell reports that
# instead of raising IndexError on parsed[0].
if parsed:
    print(parsed[0])
else:
    print('No parse found for:', sent2)

In [None]:
# Sentence 3
# Simon Clark , Presiden Amerika melakukan kejahatan di India .

parser = BottomUpChartParser(grammar)
sentence = sent3.split()
parsed = list(parser.parse(sentence))

# The parser may yield no tree; guard the lookup so the cell reports that
# instead of raising IndexError on parsed[0].
if parsed:
    print(parsed[0])
else:
    print('No parse found for:', sent3)

In [None]:
# Sentence 4
# Zinedine Zidane mengaku pernah menjadi pemain sepak bola di Spanyol .

parser = BottomUpChartParser(grammar)
sentence = sent4.split()
parsed = list(parser.parse(sentence))

# The parser may yield no tree; guard the lookup so the cell reports that
# instead of raising IndexError on parsed[0].
if parsed:
    print(parsed[0])
else:
    print('No parse found for:', sent4)

In [None]:
# Sentence 5
# Indonesia akan menjadi rumah bagi investor asing dari Cina .

parser = BottomUpChartParser(grammar)
sentence = sent5.split()
parsed = list(parser.parse(sentence))

# The parser may yield no tree; guard the lookup so the cell reports that
# instead of raising IndexError on parsed[0].
if parsed:
    print(parsed[0])
else:
    print('No parse found for:', sent5)

This program was made by Naufal Hilmiaji to complete an assignment in Natural Language Processing.
You are free to clone or download it. Please use it wisely!