In [158]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import *

In [160]:
import nltk

In [162]:
# Lexical rules for direct word-to-POS mapping
LEXICAL_RULES = {
    "the": "DT",
    "a": "DT",
    "an": "DT",
    "and": "CC",
    "but": "CC",
    "he": "PRP",
    "she": "PRP",
    "it": "PRP",
    "they": "PRP",
    "is": "VBZ",
    "are": "VBP",
    "was": "VBD",
    "were": "VBD",
    "run": "VB",
    "runs": "VBZ",
    "running": "VBG",
    "ran": "VBD",
    "walk": "VB",
    "walking": "VBG",
    "walked": "VBD"
}

# Suffix rules for pattern-based tagging
SUFFIX_RULES = {
    "ing": "VBG",  # Gerund or Present Participle
    "ed": "VBD",   # Past Tense
    "ly": "RB",    # Adverb
    "ion": "NN",   # Noun
    "s": "NNS"     # Plural Noun
}

# Punctuation rules
PUNCTUATION_RULES = {
    ".": "PUNCT",
    "?": "PUNCT-Q",
    "!": "PUNCT-E",
    ",": "PUNCT-C",
    ":": "PUNCT-S",
    ";": "PUNCT-S",
    "\"": "PUNCT-QT",
    "'": "PUNCT-QT",
    "(": "PUNCT-P",
    ")": "PUNCT-P"
}


In [164]:
def pos_tagging(text):
    tags=[]
    tokens = text.split()
    print(tokens)
    for i,word in enumerate(tokens):
        print(word)
        lower_word = word.lower()

        if word in PUNCTUATION_RULES:
            tags.append((word, PUNCTUATION_RULES[word]))
            continue
        
        #Rule1: Lexical lookup
        if lower_word in LEXICAL_RULES:
            tags.append((word, LEXICAL_RULES[lower_word]))
            continue

        #Rule2: Suffix based 
        mached_suffix=False
        for suffix, tag in SUFFIX_RULES.items():
            if lower_word.endswith(suffix):
                tags.append((word, tag))
                mached_suffix=True
                break
        if mached_suffix:
            continue

        #Rule3: Propernoun
        if word[0].isupper() and i>0:
            tags.append((word, "NNP"))
            continue
            
        #Rule4: Default to noun    
        tags.append((word, "NN"))
        
        #Rule5: Context awareness tagging
        if i>0:
            
            prev_word, prev_tag = tags[i-1]
            #adjective and determiner followed by noun
            if prev_tag in ["DET","JJ"] and len(lower_word)>3:
                tags[-1]=(word,"NN")
            

            
    return tags
            
        
        

In [166]:
tagged_text=pos_tagging("The boy was running somewhere with the stick in hand but Nitin did not do it. He went aside .'")

['The', 'boy', 'was', 'running', 'somewhere', 'with', 'the', 'stick', 'in', 'hand', 'but', 'Nitin', 'did', 'not', 'do', 'it.', 'He', 'went', 'aside', ".'"]
The
boy
was
running
somewhere
with
the
stick
in
hand
but
Nitin
did
not
do
it.
He
went
aside
.'


In [168]:
def check_grammar(tagged_text):
    """
    Perform detailed grammar checking based on POS tagging rules, including punctuation.
    """
    errors = []
    words = [word for word, tag in tagged_text]

    # Rule 1: Sentence must end with valid punctuation
    if words[-1] not in [".", "!", "?"]:
        errors.append("Sentence does not end with valid punctuation.")

    # Rule 2: Unbalanced punctuation (parentheses, quotes, etc.)
    stack = []
    matching_pairs = {"(": ")", "[": "]", "{": "}", "\"": "\"", "'": "'"}
    for word in words:
        if word in matching_pairs.keys():  # Opening punctuation
            stack.append(word)
        elif word in matching_pairs.values():  # Closing punctuation
            if not stack or matching_pairs[stack.pop()] != word:
                errors.append(f"Unbalanced punctuation detected: '{word}'")

    # Rule 3: Commas in lists or clauses
    for i, (word, tag) in enumerate(tagged_text):
        if word == "," and (i == 0 or tagged_text[i - 1][1] == "PUNCT"):
            errors.append("Comma misused or incorrectly placed.")

    return errors


In [170]:
check_grammar(tagged_text)

['Sentence does not end with valid punctuation.']

In [172]:
#HMM Tagging


In [174]:
import nltk
from nltk.corpus import treebank
from nltk.probability import ConditionalFreqDist, ConditionalProbDist, LidstoneProbDist

In [122]:
treebank.tagged_sents()

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]

In [240]:
def hmm_train_tagger():
    tagged_sentences = treebank.tagged_sents()
    start = "<s>"
    end = "</s>"
    sentence_with_boundaries = [
        [(start, start)] + sentence + [(end, end)] for sentence in tagged_sentences
    ]
    #calculating transitional probs (P(tag2|tag1)
    transitional_freqs = ConditionalFreqDist()
    for sentence in sentence_with_boundaries:
        for i in range(len(sentence)-1):
            _, current_tag = sentence[i]
            _, next_tag = sentence[i+1]
            transitional_freqs[current_tag][next_tag]+=1
    transitional_probs = ConditionalProbDist(transitional_freqs,LidstoneProbDist, 0.1)

    #calculating emission pros (P(word|tag))
    emission_freqs = ConditionalFreqDist()
    for sentence in sentence_with_boundaries:
        for word, tag in sentence:
            emission_freqs[tag][word]+=1

    emission_probs = ConditionalProbDist(emission_freqs, LidstoneProbDist, 0.1)

    #Calculating initial prob (P(tag|start))
    initial_freq = ConditionalFreqDist()
    for sentence in sentence_with_boundaries:
        _, first_tag = sentence[1]
        initial_freq[start][first_tag]+=1
    print("Initial Frequencies:")
    
    initial_probs = ConditionalProbDist(initial_freq, LidstoneProbDist, 0.1)

    return initial_probs, transitional_probs, emission_probs
        
        
    

    

In [460]:
#Viterbi algorithm for tagging
def Viterbi(sentence, transitional_probs, emission_probs, initial_probs):
    
    states = list(transitional_probs.keys())
    V = [{}]
    path={}

    for state in states:
        if state in emission_probs:
            V[0][state] = initial_probs["<s>"].prob(state) * emission_probs[state].prob(sentence[0])
            path[state]= [state]
            
    # Handle unseen words in initialization
    if not any(state in emission_probs and sentence[0] in emission_probs[state].samples() for state in states):
        for state in states:
            V[0][state] *= 0.01  # Assign a low probability to unseen words

    for t in range(1, len(sentence)):
        V.append({})
        new_path={}
        for state in states:
            if state not in emission_probs:
                continue
            prob, prev_state = max(
                (V[t-1][prev_state] * emission_probs[state].prob(sentence[t]) * transitional_probs[prev_state].prob(state), prev_state)
                for prev_state in states if prev_state in V[t-1]
            )
            V[t][state] = prob
            new_path[state] = path[prev_state] + [state]
        path = new_path

    final_state = max(V[-1], key=V[-1].get)
    return [(word, tag) for word, tag in zip(sentence, path[final_state])]
    
    

In [462]:
initial_probs, transition_probs, emission_probs = hmm_train_tagger()

Initial Frequencies:


In [468]:
def hmm_train_tagger():
    tagged_sentences = treebank.tagged_sents()
    start = "<s>"
    end = "</s>"
    sentence_with_boundaries = [
        [(start, start)] + sentence + [(end, end)] for sentence in tagged_sentences
    ]

    # Calculate transition probabilities (P(tag2 | tag1))
    transitional_freqs = ConditionalFreqDist()
    for sentence in sentence_with_boundaries:
        for i in range(len(sentence) - 1):
            _, current_tag = sentence[i]
            _, next_tag = sentence[i + 1]
            transitional_freqs[current_tag][next_tag] += 1
    transitional_probs = ConditionalProbDist(transitional_freqs, LidstoneProbDist, 0.1)

    # Calculate emission probabilities (P(word | tag))
    emission_freqs = ConditionalFreqDist()
    for sentence in sentence_with_boundaries:
        for word, tag in sentence:
            emission_freqs[tag][word] += 1
    emission_probs = ConditionalProbDist(emission_freqs, LidstoneProbDist, 0.1)

    # Calculate initial probabilities (P(tag | <s>))
    initial_freq = ConditionalFreqDist()
    for sentence in sentence_with_boundaries:
        _, first_tag = sentence[1]
        initial_freq[start][first_tag] += 1
    initial_probs = ConditionalProbDist(initial_freq, LidstoneProbDist, 0.1)

    return initial_probs, transitional_probs, emission_probs


In [470]:
def Viterbi(sentence, transitional_probs, emission_probs, initial_probs):
    states = list(transitional_probs.keys())
    V = [{}]
    path = {}

    # Initialization for the first word
    for state in states:
        if state in emission_probs:
            emission_prob = emission_probs[state].prob(sentence[0]) if sentence[0] in emission_probs[state].samples() else 0.01
            V[0][state] = initial_probs["<s>"].prob(state) * emission_prob
            path[state] = [state]

    # Handle unseen words in initialization
    if not any(sentence[0] in emission_probs[state].samples() for state in states):
        for state in states:
            V[0][state] *= 0.01  # Assign a low probability for unseen words

    # Recursion for subsequent words
    for t in range(1, len(sentence)):
        V.append({})
        new_path = {}
        for state in states:
            if state not in emission_probs:
                continue
            emission_prob = emission_probs[state].prob(sentence[t]) if sentence[t] in emission_probs[state].samples() else 0.01
            prob, prev_state = max(
                (V[t-1][prev_state] * transitional_probs[prev_state].prob(state) * emission_prob, prev_state)
                for prev_state in states if prev_state in V[t-1]
            )
            V[t][state] = prob
            new_path[state] = path[prev_state] + [state]
        path = new_path

    # Termination
    final_state = max(V[-1], key=V[-1].get)
    return [(word, tag) for word, tag in zip(sentence, path[final_state])]


In [476]:
import nltk
sentence ="not a thing"
tokens = nltk.word_tokenize(sentence)
Viterbi(tokens, transition_probs, emission_probs, initial_probs)

[('not', 'IN'), ('a', 'DT'), ('thing', 'JJ')]

In [478]:
sentence = ["the", "world", "is", "running", "I'm", "wanna", "die", "with", "you"]
initial_probs, transitional_probs, emission_probs = hmm_train_tagger()
result = Viterbi(sentence, transitional_probs, emission_probs, initial_probs)
print(result)

[('the', 'DT'), ('world', 'JJ'), ('is', 'NN'), ('running', 'IN'), ("I'm", 'DT'), ('wanna', 'JJ'), ('die', 'NN'), ('with', 'IN'), ('you', 'DT')]


In [466]:
print("Emission Probabilities for SYM:")
for word in emission_probs["SYM"].samples():
    print(f"P({word} | SYM) = {emission_probs['SYM'].prob(word):.4f}")


Emission Probabilities for SYM:
P(& | SYM) = 1.0000


In [411]:
tag_counts = nltk.FreqDist(tag for sent in treebank.tagged_sents() for _, tag in sent)
print("Tag Frequencies:")
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")


Tag Frequencies:
NNP: 9410
,: 4886
CD: 3546
NNS: 6047
JJ: 5834
MD: 927
VB: 2554
DT: 8165
NN: 13166
IN: 9857
.: 3874
VBZ: 2125
VBG: 1460
CC: 2265
VBD: 3043
VBN: 2134
-NONE-: 6592
RB: 2822
TO: 2179
PRP: 1716
RBR: 136
WDT: 445
VBP: 1321
RP: 216
PRP$: 766
JJS: 182
POS: 824
``: 712
EX: 88
'': 694
WP: 241
:: 563
JJR: 381
WRB: 178
$: 724
NNPS: 244
WP$: 14
-LRB-: 120
-RRB-: 126
PDT: 27
RBS: 35
FW: 4
UH: 3
SYM: 1
LS: 13
#: 16


In [458]:
print("Emission Probabilities for Each Tag:")
for tag in emission_probs.conditions():
    print(f"Tag: {tag}")
    for word in emission_probs[tag].samples():
        print(f"  P({word} | 'SYM') = {emission_probs[tag].prob(word):.4f}")


Emission Probabilities for Each Tag:
Tag: <s>
  P(<s> | 'SYM') = 1.0000
Tag: NNP
  P(Pierre | 'SYM') = 0.0001
  P(Vinken | 'SYM') = 0.0002
  P(Nov. | 'SYM') = 0.0024
  P(Mr. | 'SYM') = 0.0388
  P(Elsevier | 'SYM') = 0.0001
  P(N.V. | 'SYM') = 0.0003
  P(Dutch | 'SYM') = 0.0001
  P(Rudolph | 'SYM') = 0.0003
  P(Agnew | 'SYM') = 0.0001
  P(Consolidated | 'SYM') = 0.0002
  P(Gold | 'SYM') = 0.0002
  P(Fields | 'SYM') = 0.0002
  P(PLC | 'SYM') = 0.0014
  P(Kent | 'SYM') = 0.0008
  P(Lorillard | 'SYM') = 0.0004
  P(Inc. | 'SYM') = 0.0101
  P(Loews | 'SYM') = 0.0001
  P(Corp. | 'SYM') = 0.0121
  P(New | 'SYM') = 0.0165
  P(England | 'SYM') = 0.0024
  P(Journal | 'SYM') = 0.0010
  P(Medicine | 'SYM') = 0.0005
  P(James | 'SYM') = 0.0023
  P(A. | 'SYM') = 0.0017
  P(Talcott | 'SYM') = 0.0004
  P(Boston | 'SYM') = 0.0018
  P(Dana-Farber | 'SYM') = 0.0001
  P(Cancer | 'SYM') = 0.0002
  P(Institute | 'SYM') = 0.0016
  P(Dr. | 'SYM') = 0.0014
  P(National | 'SYM') = 0.0021
  P(Harvard | 'SYM') = 0

In [292]:
print("Initial Probabilities:")
for tag in initial_probs.conditions():
    for tag1 in initial_probs[tag].samples():
        print(f"P({tag1} | {tag}) = {initial_probs[tag].prob(tag1):.4f}")

Initial Probabilities:
P(NNP | <s>) = 0.1976
P(DT | <s>) = 0.2310
P(IN | <s>) = 0.1289
P(PRP | <s>) = 0.0626
P(EX | <s>) = 0.0044
P(`` | <s>) = 0.0756
P(CD | <s>) = 0.0084
P(RBR | <s>) = 0.0008
P(NNS | <s>) = 0.0467
P(NN | <s>) = 0.0444
P(JJ | <s>) = 0.0365
P(JJR | <s>) = 0.0031
P(RB | <s>) = 0.0447
P(WRB | <s>) = 0.0064
P(CC | <s>) = 0.0513
P(-NONE- | <s>) = 0.0210
P(VBG | <s>) = 0.0044
P(WDT | <s>) = 0.0005
P(-LRB- | <s>) = 0.0018
P(WP | <s>) = 0.0036
P(PRP$ | <s>) = 0.0074
P(JJS | <s>) = 0.0016
P(NNPS | <s>) = 0.0026
P(VBZ | <s>) = 0.0023
P(TO | <s>) = 0.0013
P(VBN | <s>) = 0.0018
P(LS | <s>) = 0.0018
P('' | <s>) = 0.0003
P(: | <s>) = 0.0028
P(PDT | <s>) = 0.0008
P(UH | <s>) = 0.0003
P(MD | <s>) = 0.0003
P($ | <s>) = 0.0013
P(VB | <s>) = 0.0008
P(RBS | <s>) = 0.0005
P(VBD | <s>) = 0.0003


In [295]:
print("\nTransition Probabilities:")
for prev_tag in transition_probs.conditions():
    for next_tag in transition_probs[prev_tag].samples():
        print(f"P({next_tag} | {prev_tag}) = {transition_probs[prev_tag].prob(next_tag):.4f}")



Transition Probabilities:
P(NNP | <s>) = 0.1976
P(DT | <s>) = 0.2310
P(IN | <s>) = 0.1289
P(PRP | <s>) = 0.0626
P(EX | <s>) = 0.0044
P(`` | <s>) = 0.0756
P(CD | <s>) = 0.0084
P(RBR | <s>) = 0.0008
P(NNS | <s>) = 0.0467
P(NN | <s>) = 0.0444
P(JJ | <s>) = 0.0365
P(JJR | <s>) = 0.0031
P(RB | <s>) = 0.0447
P(WRB | <s>) = 0.0064
P(CC | <s>) = 0.0513
P(-NONE- | <s>) = 0.0210
P(VBG | <s>) = 0.0044
P(WDT | <s>) = 0.0005
P(-LRB- | <s>) = 0.0018
P(WP | <s>) = 0.0036
P(PRP$ | <s>) = 0.0074
P(JJS | <s>) = 0.0016
P(NNPS | <s>) = 0.0026
P(VBZ | <s>) = 0.0023
P(TO | <s>) = 0.0013
P(VBN | <s>) = 0.0018
P(LS | <s>) = 0.0018
P('' | <s>) = 0.0003
P(: | <s>) = 0.0028
P(PDT | <s>) = 0.0008
P(UH | <s>) = 0.0003
P(MD | <s>) = 0.0003
P($ | <s>) = 0.0013
P(VB | <s>) = 0.0008
P(RBS | <s>) = 0.0005
P(VBD | <s>) = 0.0003
P(NNP | NNP) = 0.3821
P(, | NNP) = 0.1531
P(CD | NNP) = 0.0202
P(VBZ | NNP) = 0.0367
P(VBG | NNP) = 0.0008
P(NN | NNP) = 0.0554
P(WDT | NNP) = 0.0005
P(NNS | NNP) = 0.0226
P(IN | NNP) = 0.0436
P

In [325]:
print("\Emission Probabilities:")
for prev_tag in emission_probs.conditions():
    for next_tag in emission_probs[prev_tag].samples():
        print(f"P({next_tag} | {prev_tag}) = {emission_probs[prev_tag].prob(next_tag):.4f}")

\Emission Probabilities:
P(<s> | <s>) = 1.0000
P(Pierre | NNP) = 0.0001
P(Vinken | NNP) = 0.0002
P(Nov. | NNP) = 0.0024
P(Mr. | NNP) = 0.0388
P(Elsevier | NNP) = 0.0001
P(N.V. | NNP) = 0.0003
P(Dutch | NNP) = 0.0001
P(Rudolph | NNP) = 0.0003
P(Agnew | NNP) = 0.0001
P(Consolidated | NNP) = 0.0002
P(Gold | NNP) = 0.0002
P(Fields | NNP) = 0.0002
P(PLC | NNP) = 0.0014
P(Kent | NNP) = 0.0008
P(Lorillard | NNP) = 0.0004
P(Inc. | NNP) = 0.0101
P(Loews | NNP) = 0.0001
P(Corp. | NNP) = 0.0121
P(New | NNP) = 0.0165
P(England | NNP) = 0.0024
P(Journal | NNP) = 0.0010
P(Medicine | NNP) = 0.0005
P(James | NNP) = 0.0023
P(A. | NNP) = 0.0017
P(Talcott | NNP) = 0.0004
P(Boston | NNP) = 0.0018
P(Dana-Farber | NNP) = 0.0001
P(Cancer | NNP) = 0.0002
P(Institute | NNP) = 0.0016
P(Dr. | NNP) = 0.0014
P(National | NNP) = 0.0021
P(Harvard | NNP) = 0.0003
P(University | NNP) = 0.0023
P(West | NNP) = 0.0010
P(Groton | NNP) = 0.0001
P(Mass. | NNP) = 0.0008
P(Hollingsworth | NNP) = 0.0002
P(Vose | NNP) = 0.0002


  print("\Emission Probabilities:")
