In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import treebank
from nltk.grammar import induce_pcfg
from nltk import Nonterminal, ProbabilisticProduction
from nltk.parse import ViterbiParser
import time

from devoir4 import *

In [2]:
column_names = ["Source_Code", "Acceptability_Label", "Acceptability_Judgment", "Sentence"]

In [54]:
df = pd.read_csv(r"C:\Users\barka\Desktop\NLP\ift6285-devoirs\Devoir 4\data\cola_public\tokenized\in_domain_dev.tsv", delimiter='\t', header=None, names=column_names)
df.head()

Unnamed: 0,Source_Code,Acceptability_Label,Acceptability_Judgment,Sentence
0,gj04,1,,the sailors rode the breeze clear of the rocks .
1,gj04,1,,the weights made the rope stretch over the pul...
2,gj04,1,,the mechanical doll wriggled itself loose .
3,cj99,1,,"if you had eaten more , you would want less ."
4,cj99,0,*,"as you eat the most , you want the least ."


In [55]:
problematic_sentences_df = df[df['Acceptability_Label'] == 0]

problematic_sentences_df.head()

Unnamed: 0,Source_Code,Acceptability_Label,Acceptability_Judgment,Sentence
4,cj99,0,*,"as you eat the most , you want the least ."
5,cj99,0,*,"the more you would want , the less you would e..."
6,cj99,0,*,"i demand that the more john eat , the more he ..."
13,cj99,0,*,"the more does bill smoke , the more susan hate..."
16,bc01,0,*,who does john visit sally because he likes ?


In [57]:
problematic_sentences = problematic_sentences_df["Sentence"].values
len(problematic_sentences)

162

In [58]:
tic = time.perf_counter()
productions = get_PTB_produtions()
print(f"Took {time.perf_counter() - tic}s.")

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\barka\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Took 11.54024629999185s.


In [59]:
tic = time.perf_counter()
full_grammar = induce_pcfg(Nonterminal("S"), productions)
print(f"Took {time.perf_counter() - tic}s.")

Took 1.3252873999881558s.


In [60]:
parser = ViterbiParser(full_grammar)

In [61]:
ptb_sentences = treebank.sents()

known_words = set(word.lower() for sentence in ptb_sentences for word in sentence)


In [63]:
unk_token = 'UNK'
modified_sentences = []

for sentence in problematic_sentences:
    words = sentence.split()
    modified_words = [word if word.lower() in known_words else unk_token for word in words]
    modified_sentence = ' '.join(modified_words)
    modified_sentences.append(modified_sentence)
    

In [64]:
modified_sentences

['as you eat the most , you want the least .',
 'the more you would want , the less you would eat .',
 'i demand that the more john eat , the more he UNK .',
 'the more does bill smoke , the more susan UNK him .',
 'who does john visit UNK because he UNK ?',
 'mickey looked up it .',
 'the box contained the ball from the tree .',
 'the tube was escaped by gas .',
 'what the water did to the bottle was fill it .',
 'what the water did to the whole bottle was fill it .',
 'mary UNK plays the UNK .',
 'sue gave to bill a book .',
 'they represented seriously to the dean mary as a genuine UNK .',
 'us UNK they .',
 'mary intended john to go abroad .',
 "we wanted to invite someone , but we could n't decide who to .",
 'mary came to be introduced by the UNK and i also came to be .',
 "i know which book UNK did n't read for class , and which book lilly did it for him .",
 "this is the book which bob reviewed , and this is the one which fred wo n't do it .",
 "i know which book UNK read , and

In [65]:
non_terminals = set([Nonterminal(nt) for sent in modified_sentences for nt in sent.split() if isinstance(Nonterminal(nt), Nonterminal)])

In [66]:
non_terminals

{!,
 'd,
 's,
 ,,
 .,
 150,
 ?,
 UNK,
 a,
 about,
 abroad,
 absurd,
 achieved,
 advice,
 after,
 against,
 alienated,
 also,
 am,
 an,
 and,
 andy,
 answer,
 any,
 apart,
 apples,
 are,
 article,
 as,
 ask,
 at,
 ball,
 bars,
 basket,
 battle,
 be,
 became,
 because,
 been,
 before,
 being,
 believed,
 best,
 bill,
 black,
 blood,
 blue,
 bob,
 book,
 books,
 bottle,
 box,
 boy,
 brian,
 broke,
 build,
 building,
 but,
 buy,
 buying,
 by,
 ca,
 came,
 can,
 carla,
 child,
 children,
 city,
 claim,
 class,
 come,
 comment,
 conditions,
 contained,
 contract,
 could,
 country,
 criticized,
 cut,
 dark,
 dean,
 decide,
 definitely,
 demand,
 depends,
 diamond,
 did,
 differ,
 difficult,
 dinner,
 do,
 does,
 door,
 drive,
 drives,
 dumped,
 easily,
 easy,
 eat,
 employer,
 end,
 english,
 escaped,
 every,
 everyone,
 expect,
 failed,
 father,
 fed,
 fell,
 few,
 fierce,
 file,
 fill,
 for,
 fought,
 fountain,
 france,
 fred,
 friend,
 from,
 gas,
 gave,
 genuine,
 get,
 girl,
 give,
 give

In [67]:
unknown_word = "UNK"

for non_terminal in non_terminals:
    productions.append(ProbabilisticProduction(non_terminal, [unknown_word]))

productions.append(ProbabilisticProduction(Nonterminal("UNK"), [unknown_word]))

In [68]:
for sent in modified_sentences:
    tokens = nltk.word_tokenize(sent)
    for token in tokens:
        productions.append(ProbabilisticProduction(Nonterminal(token), [f"{token}"]))

In [69]:
tic = time.perf_counter()
full_grammar = induce_pcfg(Nonterminal("S"), productions)
print(f"Took {time.perf_counter() - tic}s.")

Took 1.4109680999536067s.


In [70]:
modified_sentences[3]

'the more does bill smoke , the more susan UNK him .'

In [71]:
parser = ViterbiParser(full_grammar)

In [None]:
from tqdm import tqdm

all_parses = []

for sentence in tqdm(modified_sentences, desc="Parsing Sentences"):
    tokens = nltk.word_tokenize(sentence)
    parses = parser.parse_all(tokens)
    
    for parse in parses:
        all_parses.append(parse)

Parsing Sentences:   2%|█▏                                                             | 3/162 [00:32<27:18, 10.30s/it]