In [43]:
from collections import Counter

import nltk
from nltk import pos_tag, RegexpParser, Tree
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages named below (presumably the models behind the
# tokenizers and pos_tag imported above - confirm against the NLTK version in use).
nltk.download('punkt')
# Left as the cell's final expression, so its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

**Create Chunk Counter**

In [44]:
# shared implementation behind np_chunk_counter and vp_chunk_counter
def _chunk_counter(chunked_sentences, label, top_n=30):
    """Count the chunks labelled `label` and return the most frequent ones.

    Args:
        chunked_sentences: iterable of chunked sentences; each must provide
            ``subtrees(filter=...)`` yielding subtrees that expose ``label()``
            and can be iterated over their children.
        label: chunk label to select, e.g. 'NP' or 'VP'.
        top_n: how many of the most frequent chunks to return; ``None``
            returns every distinct chunk.

    Returns:
        A list of ``(chunk, count)`` pairs ordered from most to least frequent,
        where ``chunk`` is the tuple of the subtree's children.
    """
    # Each subtree is frozen into a tuple so it can be used as a Counter key;
    # this assumes the chunk's children are hashable (e.g. (word, tag) pairs).
    chunk_counter = Counter(
        tuple(subtree)
        for chunked_sentence in chunked_sentences
        for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == label)
    )
    return chunk_counter.most_common(top_n)


# function that pulls noun-phrase chunks out of chunked sentences and finds the most common ones
def np_chunk_counter(chunked_sentences, top_n=30):
    """Return the `top_n` (default 30) most frequent 'NP' chunks as (chunk, count) pairs."""
    return _chunk_counter(chunked_sentences, 'NP', top_n)


# function that pulls verb-phrase chunks out of chunked sentences and finds the most common ones
def vp_chunk_counter(chunked_sentences, top_n=30):
    """Return the `top_n` (default 30) most frequent 'VP' chunks as (chunk, count) pairs."""
    return _chunk_counter(chunked_sentences, 'VP', top_n)


**Import old text**

In [53]:
# import text
# Raw string: "\L" and "\d" are not valid escape sequences, so a normal string
# literal only works by accident (and warns on newer Python versions). The raw
# literal yields exactly the same path.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
OLD_SCRIPT_PATH = r"E:\Language-Chatbot\dorian_gray.txt"

# Context manager closes the file handle as soon as the text has been read.
with open(OLD_SCRIPT_PATH, encoding='utf-8') as script_file:
    old_script = script_file.read().lower()

**Tokenize the text into sentences, then tokenize each sentence into words**

In [46]:
#word tokenized by sentences
# PunktSentenceTokenizer must be imported at the top of the notebook; it is
# built from old_script itself and then used to split that same text.
sentence_tokenizer = PunktSentenceTokenizer(old_script)
sentence_tokenized = sentence_tokenizer.tokenize(old_script)

# one list of word tokens per sentence
word_tokenized = [word_tokenize(sentence) for sentence in sentence_tokenized]
print(word_tokenized[1])

['to', 'reveal', 'art', 'and', 'conceal', 'the', 'artist', 'is', 'art', "'s", 'aim', '.']


**Find the most common noun phrases**

In [47]:
#part-of-speech tagged sentences
# pos_tag_sents tags the whole batch with a single tagger instance, instead of
# setting the tagger up again for every sentence as a per-sentence pos_tag call does.
pos_tagged_text = nltk.pos_tag_sents(word_tokenized)
print(pos_tagged_text[1])

#noun phrase chunk grammar: optional determiner, any number of adjectives, then a noun
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
np_chunk_parser = RegexpParser(np_chunk_grammar)

#verb phrase chunk grammar: a noun phrase followed by a verb and an optional adverb
vp_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"
vp_chunk_parser = RegexpParser(vp_chunk_grammar)

#noun phrase chunked sentences, one parse tree per tagged sentence
np_chunker = [np_chunk_parser.parse(sentence) for sentence in pos_tagged_text]
print(np_chunker[1])
# draw the second chunked sentence as an ASCII tree
Tree.fromstring(str(np_chunker[1])).pretty_print()

#view the most frequently occurring noun phrases
most_occured_np_chunks = np_chunk_counter(np_chunker)
print(most_occured_np_chunks)

[('to', 'TO'), ('reveal', 'VB'), ('art', 'NN'), ('and', 'CC'), ('conceal', 'VB'), ('the', 'DT'), ('artist', 'NN'), ('is', 'VBZ'), ('art', 'NN'), ("'s", 'POS'), ('aim', 'NN'), ('.', '.')]
(S
  to/TO
  reveal/VB
  (NP art/NN)
  and/CC
  conceal/VB
  (NP the/DT artist/NN)
  is/VBZ
  (NP art/NN)
  's/POS
  (NP aim/NN)
  ./.)
                                           S                                                 
   ________________________________________|_____________________________________________     
  |       |       |        |        |      |     |    NP           NP             NP     NP  
  |       |       |        |        |      |     |    |       _____|______        |      |    
to/TO reveal/VB and/CC conceal/VB is/VBZ 's/POS ./. art/NN the/DT     artist/NN art/NN aim/NN

[((('i', 'NN'),), 963), ((('henry', 'NN'),), 200), ((('lord', 'NN'),), 197), ((('life', 'NN'),), 170), ((('harry', 'NN'),), 136), ((('dorian', 'JJ'), ('gray', 'NN')), 127), ((('something', 'NN'),), 126), 