In [55]:
import nltk
import re    

from sklearn.externals import joblib
from  nltk.corpus import brown
from nltk import load

# ----------------------------------------------------------
# POS TAGGER
# Using a pre-trained Conditional Random Field POS tag model
# ----------------------------------------------------------

def pos_features(sentence, i):
    current = sentence[i]

    # previous word
    if (i>0):
        prev_w = sentence[i-1]
    else:
        prev_w = "<START>"

    # next word
    if (i<len(sentence)-1):
        next_w = sentence[i+1]
    else:
        next_w = "<END>"

    # generate feature sets
    features = {
        "word": current,
        "next_word": next_w,
        "prev_word": prev_w,
        "suffix(1)": current[-1:],
        "suffix(2)": current[-2:],
        "suffix(3)": current[-3:],
        "prefix(1)": current[0],
        "prefix(2)": current[:2],
        "prefix(3)": current[:3],
        "prev_suffix(1)": prev_w[-1:],
        "prev_suffix(2)": prev_w[-2:],
        "prev_suffix(3)": prev_w[-3:],
        "prev_prefix(1)": prev_w[0],
        "prev_prefix(2)": prev_w[:2],
        "prev_prefix(3)": prev_w[:3],
        "next_suffix(1)": next_w[-1:],
        "next_suffix(2)": next_w[-2:],
        "next_suffix(3)": next_w[-3:],
        "next_prefix(1)": next_w[0],
        "next_prefix(2)": next_w[:2],
        "next_prefix(3)": next_w[:3],
        'is_first': i == 0,
        'is_last': i == len(sentence) - 1,
        'curr_is_title': current.istitle(),
        'prev_is_title': prev_w.istitle(),
        'next_is_title': next_w.istitle(),
        'curr_is_lower': current.islower(),
        'prev_is_lower': prev_w.islower(),
        'next_is_lower': next_w.islower(),
        'curr_is_upper': current.isupper(),
        'prev_is_upper': prev_w.isupper(),
        'next_is_upper': next_w.isupper(),
        'curr_is_digit': current.isdigit(),
        'prev_is_digit': prev_w.isdigit(),
        'next_is_digit': next_w.isdigit()
    }
    return features

# example:
sents = ["He want to eat ice cream", "a birds is laying eggs", "I feel interesting", 
         "when I will play soccer", "I will played chess tomorrow",
         "before played, I need to warm up", "how much steps is necessary", "The dog, who is chewing on my jeans, is usually very good.",
         "There is a problem with the balance sheet", "The colors and the rainbow are beautiful"]

# prepocessing
sents = [nltk.word_tokenize(sent) for sent in sents]  # Tokenization
sent2feature = [[pos_features(sent,i) for i in range(len(sent))] for sent in sents] 
# load crf model
crf = joblib.load('POStagger.joblib')

# tagging 
tagged_sents = []
labels = crf.predict(sent2feature) 
for i in range(len(sents)):
    tagged_sent = list(zip(sents[i], labels[i]))
    tagged_sents.append(tagged_sent)


print(tagged_sents)
formatted_sents = [['/'.join([a,b]) for (a,b) in sent] for sent in tagged_sents]
formatted_sents = [' '.join(sent) for sent in formatted_sents]
for sent in formatted_sents:
    print(sent)


[[('He', 'PPS'), ('want', 'VB'), ('to', 'TO'), ('eat', 'VB'), ('ice', 'JJ-HL'), ('cream', 'NN-HL')], [('a', 'AT'), ('birds', 'NNS'), ('is', 'BEZ'), ('laying', 'VBG'), ('eggs', 'NNS-HL')], [('I', 'PPSS'), ('feel', 'VB'), ('interesting', 'VBG-HL')], [('when', 'WRB'), ('I', 'PPSS'), ('will', 'MD'), ('play', 'VB'), ('soccer', 'NN-HL')], [('I', 'PPSS'), ('will', 'MD'), ('played', 'VBN'), ('chess', 'NN'), ('tomorrow', 'NR')], [('before', 'IN'), ('played', 'VBN'), (',', ','), ('I', 'PPSS'), ('need', 'VB'), ('to', 'TO'), ('warm', 'VB'), ('up', 'RP')], [('how', 'WRB'), ('much', 'AP'), ('steps', 'NNS'), ('is', 'BEZ'), ('necessary', 'JJ')], [('The', 'AT'), ('dog', 'NN'), (',', ','), ('who', 'WPS'), ('is', 'BEZ'), ('chewing', 'VBG'), ('on', 'IN'), ('my', 'PP$'), ('jeans', 'NNS'), (',', ','), ('is', 'BEZ'), ('usually', 'RB'), ('very', 'QL'), ('good', 'JJ'), ('.', '.')], [('There', 'EX'), ('is', 'BEZ'), ('a', 'AT'), ('problem', 'NN'), ('with', 'IN'), ('the', 'AT'), ('balance', 'NN'), ('sheet', 'NN-H

In [11]:
import pickle
# ---------------------------------
# POS CHUNKER
# Use NLTK_train chunker based on corpus conll2000 for now
# ---------------------------------
chunker = pickle.load(open("conll2000_ub.pickle", 'rb'))
chunked_sents = [chunker.parse(tagged_list) for tagged_list in tagged_lists] 
for chunked_sent in chunked_sents:
    print(chunked_sent) 
# NOTE: chunked_sent is a NLTK tree structure

(S (NP He/PRP) (VP want/VBP to/TO eat/VB) (NP ice/NN cream/NN))
(S (NP a/DT birds/NNS) (VP is/VBZ laying/VBG) (NP eggs/NNS))
(S (NP I/PRP) (VP feel/VBP interesting/VBG))
(S when/WRB (NP I/PRP) (VP will/MD play/VB) (NP soccer/NN))
(S (NP I/PRP) (VP will/MD played/VB) (NP chess/NN tomorrow/NN))
(S
  (PP before/IN)
  (NP played/VBN)
  ,/,
  (NP I/PRP)
  (VP need/VBP to/TO warm/VB)
  up/RP)
(S how/WRB (NP much/JJ steps/NNS) (VP is/VBZ) (NP necessary/JJ))
(S
  (NP The/DT dog/NN)
  ,/,
  (NP who/WP)
  (VP is/VBZ chewing/VBG)
  (PP on/IN)
  (NP my/PRP$ jeans/NNS)
  ,/,
  (VP is/VBZ usually/RB)
  very/RB
  (NP good/JJ)
  ./.)
(S
  (NP There/EX)
  (VP is/VBZ)
  (NP a/DT problem/NN)
  (PP with/IN)
  (NP the/DT balance/NN sheet/NN))
(S
  (NP The/DT colors/NNS)
  and/CC
  (NP the/DT rainbow/NN)
  (VP are/VBP)
  (NP beautiful/JJ))


In [56]:
# ---------------------------------
# GRAMMAR CHECKER
# ---------------------------------

# Writting grammar rules 
grammar_rules = [ r"(?P<E1>(((a|an)/DT)|(one/CD))(\s\w+/(JJ))*\s\w+/(NNS))",   # plural noun detected after 'a', 'an' or 'one'
                  r"(?P<E2>many/JJ(\s\w+/(JJ))*\s\w+/NN(?!S))",               # singular noun detected after 'many'
                  r"(?P<E3>(?<!the/DT)\s\w+/JJS)",                            # missing 'the' before superlative
                  r"(?P<E4>(feels*|felt)/VB\w\s(\w+/(RB|JJ)\s)*\w+ing/VBG)", # "feel/felt" should not be followed by 'adj-ing'
                  r"(?P<E5>\w+/MD\s\w+(((s|ed)/VB\w*)|/VB\w))",             # modal verb should be followed by a base-form verb
                  r"(?P<E6>\w+/(NN|NNP|PRP|PPS)\s\w+/VB(?!Z))",            # sigular noun requires verb with '-s' or '-es'
                  r"(?P<E7>(is|are|were|was|been)/VB\w\s\w+/(VB(\s|$)|VBZ|VBP))",  # passive required verb with '-ed'
                  r"(?P<E8>when/WRB\s\w+/\w+\s(will|shall)/MD)",   # "when" clause should be in present tense                ]   
                  r"(?P<E9>(before|after)/\w+\s\w+/VB(?!G))",       # expecting a verb-ing after preposition 
                  r"(?P<E10>how/\w+\smuch/\w+\s\w+/(NNS|NNPS))"    # use "many" for plural nouns instead of "much"
                  r"(?P<E11>how/\w+\smany/\w+\s\w+/(NN\s|NNP\s))"    # use "many" for plural nouns instead of "much"

                ]

messages = {"E1":  (lambda x: "{} -> plural noun detected after singular determinant".format(x)),
            "E2":  (lambda x: "{} -> singular noun detected after 'many'".format(x)),
            "E3":  (lambda x: "{} -> missing 'the' before superlative".format(x)),
            "E4":  (lambda x: "{} -> 'feel/felt' should not be followed by 'adj-ing'".format(x)),
            "E5":  (lambda x: "{} -> modal verb should be followed by a base-form verb".format(x)),
            "E6":  (lambda x: "{} -> sigular noun requires verb with '-s' or '-es'".format(x)),
            "E7":  (lambda x: "{} -> passive required verb with '-ed'".format(x)),
            "E8":  (lambda x: "{} -> 'when' clause should be in present tense".format(x)),
            "E9":  (lambda x: "{} -> expecting a verb-ing after preposition".format(x)),
            "E10": (lambda x: "{} -> use 'many' for plural nouns instead of 'much'".format(x)),
            "E11": (lambda x: "{} -> use 'much' for non-plural nouns instead of 'many'".format(x))

           }


In [57]:
for i in range(0,len(grammar_rules)):
    pattern = re.compile(grammar_rules[i])
    results = [pattern.search(sent) for sent in formatted_sents]
    for result in results:
        if result is not None:
            key = ''.join(result.groupdict().keys())
            print(messages[key](result[0]))

will/MD played/VBN -> modal verb should be followed by a base-form verb
He/PPS want/VB -> sigular noun requires verb with '-s' or '-es'
when/WRB I/PPSS will/MD -> 'when' clause should be in present tense
before/IN played/VB -> expecting a verb-ing after preposition
