In [12]:
import nltk
nltk.download('brown')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ThienLaptop\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ThienLaptop\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\ThienLaptop\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


True

In [10]:
text = nltk.word_tokenize("We are going to the party")
print(nltk.pos_tag(text))

[('We', 'PRP'), ('are', 'VBP'), ('going', 'VBG'), ('to', 'TO'), ('the', 'DT'), ('party', 'NN')]


In [5]:
from nltk.parse.generate import generate, demo_grammar
from nltk import CFG

In [6]:
print(demo_grammar)


  S -> NP VP
  NP -> Det N
  PP -> P NP
  VP -> 'slept' | 'saw' NP | 'walked' PP
  Det -> 'the' | 'a'
  N -> 'man' | 'park' | 'dog'
  P -> 'in' | 'with'



In [7]:
from __future__ import print_function

import itertools
import sys
from nltk.grammar import Nonterminal


def generate(grammar, start=None, depth=None, n=None):
    """
    Generates an iterator of all sentences from a CFG.

    :param grammar: The Grammar used to generate sentences.
    :param start: The Nonterminal from which to start generate sentences.
    :param depth: The maximal depth of the generated tree.
    :param n: The maximum number of sentences to return.
    :return: An iterator of lists of terminal tokens.
    """
    if not start:
        start = grammar.start()
    if depth is None:
        depth = sys.maxsize

    iter = _generate_all(grammar, [start], depth)

    if n:
        iter = itertools.islice(iter, n)

    return iter



def _generate_all(grammar, items, depth):
    if items:
        try:
            for frag1 in _generate_one(grammar, items[0], depth):
                for frag2 in _generate_all(grammar, items[1:], depth):
                    yield frag1 + frag2
        except RuntimeError as _error:
            if _error.message == "maximum recursion depth exceeded":
                # Helpful error message while still showing the recursion stack.
                raise RuntimeError("The grammar has rule(s) that yield infinite recursion!!")
            else:
                raise
    else:
        yield []


def _generate_one(grammar, item, depth):
    if depth > 0:
        if isinstance(item, Nonterminal):
            for prod in grammar.productions(lhs=item):
                for frag in _generate_all(grammar, prod.rhs(), depth-1):
                    yield frag
        else:
            yield [item]

demo_grammar = """
  S -> NP VP
  NP -> Det N
  PP -> P NP
  VP -> 'slept' | 'saw' NP | 'walked' PP
  Det -> 'the' | 'a'
  N -> 'man' | 'park' | 'dog'
  P -> 'in' | 'with'
"""


def demo(N=23):
    from nltk.grammar import CFG

    print('Generating the first %d sentences for demo grammar:' % (N,))
    print(demo_grammar)
    grammar = CFG.fromstring(demo_grammar)
    for n, sent in enumerate(generate(grammar, n=N), 1):
        print('%3d. %s' % (n, ' '.join(sent)))



if __name__ == '__main__':
    demo()

Generating the first 23 sentences for demo grammar:

  S -> NP VP
  NP -> Det N
  PP -> P NP
  VP -> 'slept' | 'saw' NP | 'walked' PP
  Det -> 'the' | 'a'
  N -> 'man' | 'park' | 'dog'
  P -> 'in' | 'with'

  1. the man slept
  2. the man saw the man
  3. the man saw the park
  4. the man saw the dog
  5. the man saw a man
  6. the man saw a park
  7. the man saw a dog
  8. the man walked in the man
  9. the man walked in the park
 10. the man walked in the dog
 11. the man walked in a man
 12. the man walked in a park
 13. the man walked in a dog
 14. the man walked with the man
 15. the man walked with the park
 16. the man walked with the dog
 17. the man walked with a man
 18. the man walked with a park
 19. the man walked with a dog
 20. the park slept
 21. the park saw the man
 22. the park saw the park
 23. the park saw the dog


In [13]:
from nltk.corpus import brown
prepchoices = nltk.ConditionalFreqDist((v[0], p[0]) 
    for (v, p) in nltk.bigrams(brown.tagged_words(tagset="universal")) 
        if v[1] == "VERB" and p[1] == "ADP") 

In [14]:
prepchoices["writing"]

FreqDist({'in': 5, 'from': 3, 'at': 3, 'to': 2, 'about': 1, 'under': 1, 'with': 1, 'on': 1, 'for': 1, 'since': 1})

In [15]:
grammar = {}
grammar["sitting"] = {}
grammar["sitting"]["table"] = "on"
grammar["sitting"]["van"] = "in"

In [16]:
print(grammar)

{'sitting': {'table': 'on', 'van': 'in'}}


In [20]:
import spacy
nlp = spacy.load('en_core_web_sm')
sent = "when the bell rang, saurav went out"
doc=nlp(sent)

sub_toks = [tok for tok in doc if (tok.dep_ == "nsubj") ]

print(sub_toks) 

[bell, saurav]


In [23]:
from __future__ import unicode_literals, print_function

raw_text = 'Hello, world. Here are two sentences.'
nlp = spacy.load('en_core_web_sm')
doc = nlp(raw_text)
sentences = [sent.string.strip() for sent in doc.sents]
print(sentences)

AttributeError: 'spacy.tokens.span.Span' object has no attribute 'string'

In [None]:
import spacy
nlp = spacy.load('en')

doc = nlp(u"the shop is closed.")

for token in doc:
    print(token, token.lemma, token.lemma_)

In [None]:
def noun_chunks(doc, drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of noun chunks from a spacy-parsed doc, optionally
    filtering by frequency and dropping leading determiners.
    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        drop_determiners (bool): remove leading determiners (e.g. "the")
            from phrases (e.g. "the quick brown fox" => "quick brown fox")
        min_freq (int): remove chunks that occur in ``doc`` fewer than
            ``min_freq`` times
    Yields:
        ``spacy.Span``: the next noun chunk from ``doc`` in order of appearance
        in the document
    """
    if hasattr(doc, 'spacy_doc'):
        ncs = doc.spacy_doc.noun_chunks
    else:
        ncs = doc.noun_chunks
    if drop_determiners is True:
        ncs = (nc if nc[0].pos != DET else nc[1:]
               for nc in ncs)
    if min_freq > 1:
        ncs = list(ncs)
        freqs = itertoolz.frequencies(nc.lower_ for nc in ncs)
        ncs = (nc for nc in ncs
               if freqs[nc.lower_] >= min_freq)

    for nc in ncs:
        yield nc

In [None]:
noun_chunks("the boy")

In [None]:
import spacy
from nltk import Tree


en_nlp = spacy.load('en')

doc = en_nlp("The downside is that, because statistical programs are easy to use, it is equally easy to do the wrong analysis.")

def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_


[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

In [None]:
import spacy
from nltk import Tree


en_nlp = spacy.load('en')

doc = en_nlp("children plays in the garden")

def tok_format(tok):
    return "_".join([tok.orth_, tok.dep_, tok.tag_])


def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(tok_format(node), [to_nltk_tree(child) for child in node.children])
    else:
        return tok_format(node)


[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

In [None]:
doc2 = en_nlp("He was swimming in the river")
[to_nltk_tree(sent.root).pretty_print() for sent in doc2.sents]

In [None]:
doc3 = en_nlp("When he got the email, he came to my small office house and started shouting.")
[to_nltk_tree(sent.root).pretty_print() for sent in doc3.sents]
for sent in doc3.sents:
    print(sent.root)
    for ch in sent.root.children:
        if(ch.tag_ == "IN"):
            print(ch)
            for sec in ch.children:
                print(sec)

In [None]:
doc3 = en_nlp("I am walking on the road.")
[to_nltk_tree(sent.root).pretty_print() for sent in doc3.sents]
for sent in doc3.sents:
    print(sent.root)
    for ch in sent.root.children:
        if(ch.tag_ == "IN"):
            print(ch)
            for sec in ch.children:
                print(sec)

In [None]:
doc3 = en_nlp("The little boys were playing in the garden")
[to_nltk_tree(sent.root).pretty_print() for sent in doc3.sents]
for sent in doc3.sents:
    print(sent.root)
    for ch in sent.root.children:
        if(ch.tag_ == "IN"):
            print(ch)
            for sec in ch.children:
                print(sec)

In [None]:
doc3 = en_nlp("Admist all confusion, Salman was found guilty in the case.")
[to_nltk_tree(sent.root).pretty_print() for sent in doc3.sents]
for sent in doc3.sents:
    print(sent.root)
    for ch in sent.root.children:
        if(ch.tag_ == "IN"):
            print(ch)
            for sec in ch.children:
                print(sec)

In [None]:
doc3 = en_nlp("The mother was cooking dinner in the home kitchen and the boys were playing in the garden.")
[to_nltk_tree(sent.root).pretty_print() for sent in doc3.sents]

grammar = {}

def VB_IN_NN(payload):
    if(payload.tag_[:2] != 'VB'):
        return
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'):
            VB_IN_NN(ch)
    temp = [payload]
    for ch in payload.children:
        if(ch.tag_ == "IN"):
            temp.append(ch)
            for sec in ch.children:
                temp.append(sec)
                if(len(temp) == 3):
                    grammar[payload.text.lower()] = {}
                    grammar[payload.text.lower()][sec.text.lower()] = ch.text.lower()
                return
    

for sent in doc3.sents:
    VB_IN_NN(sent.root)
print(grammar)

In [None]:
import nltk
from nltk.corpus import brown
from nltk.tokenize.moses import MosesDetokenizer
mdetok = MosesDetokenizer()

In [None]:
for sent in brown.sents('cb01')[:20]:
    munged_sentence = ' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'")
    print(mdetok.detokenize(munged_sentence.split(), return_str=True))
    print()

In [None]:
count = 0

for cps in brown.fileids()[:10]:
    
    for sent in brown.sents(cps):
        count += 1
        munged_sentence = ' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'")
        doc4 = en_nlp(mdetok.detokenize(munged_sentence.split(), return_str=True))
        #[to_nltk_tree(sent.root).pretty_print() for sent in doc4.sents]
        for sent in doc4.sents:
            VB_IN_NN(sent.root)

print(grammar)
#print(str(len(combos)) + " " + str(count))

In [None]:
grammar["joined"]["1925"]

In [None]:
import numpy as np
np.save('correctly.npy', grammar)

In [None]:
temp_grammar = np.load('correctly.npy').item()

In [None]:
def VB_IN_NN_correction(payload, raw_text, master_dictionary):
	if(payload.tag_[:2] != 'VB'):
		return
	for ch in payload.children:
		if(ch.tag_[:2] == 'VB'):
			VB_IN_NN(ch)
	temp = [payload]
	for ch in payload.children:
		if(ch.tag_ == "IN"):
			temp.append(ch)
			for sec in ch.children:
				temp.append(sec)
				if(len(temp) == 3):
					try:
						correct_prep = master_dictionary[payload.text.lower()][sec.text.lower()]
						if(correct_prep != ch.text.lower()):
							raw_text = raw_text[:ch.idx] + raw_text[ch.idx:].replace(temp[1].text, correct_prep, 1)
							return raw_text
					except KeyError:
						return raw_text
				return

In [None]:
text = "i was dancing with the park."
doc = en_nlp(text)
for sent in doc.sents:
    text = VB_IN_NN_correction(sent.root, text, grammar)
    print(text)

In [None]:
from pattern.en import conjugate, lemma, lexeme, INFINITIVE, PRESENT, PAST, PARTICIPLE, FUTURE, SG, PL, INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE, PROGRESSIVE 

In [None]:
print(conjugate(verb='downloading', tense=PRESENT, mood=INDICATIVE, aspect=PROGRESSIVE, person=1, number=PL)) # add aspect=PROGRESSIVE to indicate continuous tense

In [None]:
doc2 = en_nlp("has ram taken the ball?")
[to_nltk_tree(sent.root).pretty_print() for sent in doc2.sents]

In [None]:
doc2 = en_nlp("ram has been watching tv.")
[to_nltk_tree(sent.root).pretty_print() for sent in doc2.sents]

In [None]:
for sent in doc2.sents:
    for comp in sent.root.children:
        if(comp.tag_ == 'VBD'):
            print(comp)

In [None]:
def VB_VB_VB(payload):
    if(payload.tag_[:2] != 'VB'):
        return
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'):
            VB_VB_VB(ch)
    temp = []
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'):
            temp.append(ch.lower_ + '_' + ch.tag_)
        if(len(temp) == 2):
            temp.append(payload.lower_+ '_' + ch.tag_)
            combos.append(temp)
            temp = []

In [None]:
import spacy
from nltk import Tree

en_nlp = spacy.load('en')
rtext = "Ramu has been travel since early this year."
doc2 = en_nlp(rtext)
combos = []
for sent in doc2.sents:
    rtext = VB_VB_VB_correction(sent.root, rtext)
print(rtext)

In [None]:
def VB_VB_VB_correction(payload, raw_text):
    if(payload.tag_[:2] != 'VB' and payload.tag_[:2] != 'NN'  and payload.tag_[:2] != 'JJ'):
        return
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'): # this might need to be removed
            VB_VB_VB_correction(ch, raw_text)
    temp = []
    nounBeforeVerb = False
    nounAfterVerb = False
    verbFound = False
    since = False
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'):
            verbFound = True
        if((not verbFound) and (ch.tag_[:2] == 'NN' or ch.tag_[:2] == 'PR')):
            nounBeforeVerb = True
        if(verbFound and (ch.tag_[:2] == 'NN' or ch.tag_[:2] == 'PR')):
            nounAfterVerb = True
        if(ch.lower_ == 'since'):
            since = True
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'):
            # print(ch.idx)
            temp.append(ch.lower_ + '_' + ch.tag_)
        if(len(temp) == 2):
            temp.append(payload.lower_+ '_' + ch.tag_)
            #print(temp)
            if (temp[0][-3:] == 'VBZ' or temp[0][-3:] == 'VBP') and temp[1][-3:] == 'VBN':
                if nounAfterVerb or since:
                    x = conjugate(verb=lemma(temp[2][:-4]), tense=PRESENT, mood=INDICATIVE, aspect=PROGRESSIVE, person=1, number=PL)
                elif nounBeforeVerb:
                    x = conjugate(verb=lemma(temp[2][:-4]), tense=PAST+PARTICIPLE, mood=INDICATIVE, person=1, number=PL)
                # print(temp[2][:-4] + ' -> ' + x)
            combos.append(temp)
            # print(nounBeforeVerb)
            raw_text = raw_text[:payload.idx] + raw_text[payload.idx:].replace(temp[2][:-4], x, 1)
            #print(raw_text)
            temp = []
            return raw_text
    return raw_text

In [None]:
def VB_IN_NN(payload):
	if(payload.tag_[:2] != 'VB'):
		return
	for ch in payload.children:
		if(ch.tag_[:2] == 'VB'):
			VB_IN_NN(ch)
	temp = [payload]
	for ch in payload.children:
		if(ch.tag_ == "IN"):
			temp.append(ch)
			for sec in ch.children:
				temp.append(sec)
				if(len(temp) == 3):
					grammar[payload.text.lower()] = {}
					grammar[payload.text.lower()][sec.text.lower()] = ch.text.lower()
				return
            
def VB_IN_NN_correction(payload, raw_text, master_dictionary):
	if(payload.tag_[:2] != 'VB'):
		return
	for ch in payload.children:
		if(ch.tag_[:2] == 'VB'):
			VB_IN_NN_correction(ch, raw_text, master_dictionary)
	temp = [payload]
	for ch in payload.children:
		if(ch.tag_ == "IN"):
			temp.append(ch)
			for sec in ch.children:
				temp.append(sec)
				if(len(temp) == 3):
					try:
						correct_prep = master_dictionary[payload.text.lower()][sec.text.lower()]
						if(correct_prep != ch.text.lower()):
							raw_text = raw_text[:ch.idx] + raw_text[ch.idx:].replace(temp[1].text, correct_prep, 1)
							return raw_text
					except KeyError:
						return raw_text
	return raw_text

In [None]:
def VB_VB_correction(payload, raw_text):
    if(payload.tag_[:2] != 'VB'):
        return
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'): # this might need to be removed
            VB_VB_VB_correction(ch, raw_text)
            
            if(ch.lower_ == 'has') or (ch.lower_ == 'have') or (ch.lower_ == 'had'):
                x = conjugate(verb=lemma(payload.text), tense=PAST+PARTICIPLE, mood=INDICATIVE, person=1, number=PL)
            else:
                x = conjugate(verb=lemma(payload.text), tense=PRESENT, mood=INDICATIVE, aspect=PROGRESSIVE, person=1, number=PL)
        
            raw_text = raw_text[:payload.idx] + raw_text[payload.idx:].replace(payload.text, x, 1)
            return raw_text
    return raw_text

In [None]:
doc2 = en_nlp("he has doing his homework")
[to_nltk_tree(sent.root).pretty_print() for sent in doc2.sents]

In [None]:
doc2 = en_nlp("he is walking on the road")
[to_nltk_tree(sent.root).pretty_print() for sent in doc2.sents]

In [None]:
import spacy
from nltk import Tree

en_nlp = spacy.load('en')
rtext = "He has done his homework."
doc2 = en_nlp(rtext)
combos = []
for sent in doc2.sents:
    rtext = VB_VB_correction(sent.root, rtext)
print(rtext)