# Experiment 4 — POS Tagging using HMM on Local Gujarati Corpus
**Objective:** Perform POS tagging using Hidden Markov Model (HMM) on Gujarati local corpus, compare with rule-based tagging, and evaluate accuracy.
---

In [1]:
!pip install -q nltk hmmlearn pandas

import nltk, os, pandas as pd, numpy as np
from hmmlearn import hmm
from collections import Counter, defaultdict

nltk.download('punkt')

corpus_dir = r"X:/DJ Sanghvi/sem 7/nlp/NLP_LAB_GYANGUJ/data/next"



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omtan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
texts = []
for file in os.listdir(corpus_dir):
    if file.endswith(".txt"):
        with open(os.path.join(corpus_dir, file), 'r', encoding='utf-8') as f:
            texts.append(f.read())

corpus = " ".join(texts)
print("Loaded characters:", len(corpus))


Loaded characters: 3461251


In [3]:
tokens = nltk.word_tokenize(corpus)
tokens = [t for t in tokens if t.strip()]
print("Token count:", len(tokens))
print(tokens[:20])


Token count: 550517
['જીવવિજ્ઞાન', 'ધોરણ', 'ઉઠે', 'પ્રતિજ્ઞાપત્ર', 'ભારત', 'મારો', 'દેશ', 'બધાં', 'ભારતીયો', 'મારા', 'ભાઈબહેન', 'મારા', 'દેશને', 'ચાહું', 'છું', 'તેના', 'સમૃદ્ધ', 'વૈવિધ્યપૂર્ણ', 'વારસાનો', 'મને']


In [4]:
vocab = list(set(tokens))
word_to_id = {w:i for i,w in enumerate(vocab)}
id_to_word = {i:w for w,i in word_to_id.items()}

encoded = np.array([word_to_id[w] for w in tokens]).reshape(-1,1)


In [5]:
n_states = 8  # POS-like clusters

model = hmm.MultinomialHMM(n_components=n_states, n_iter=20, verbose=False)
model.fit(encoded)

print("HMM Training Complete ✅")


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


HMM Training Complete ✅


In [6]:
logprob, states = model.decode(encoded[:50], algorithm="viterbi")

tagged_output = [(id_to_word[encoded[i][0]], f"TAG{states[i]}") for i in range(len(states))]
for w,t in tagged_output[:20]:
    print(w, t)


જીવવિજ્ઞાન TAG6
ધોરણ TAG7
ઉઠે TAG6
પ્રતિજ્ઞાપત્ર TAG7
ભારત TAG6
મારો TAG7
દેશ TAG6
બધાં TAG7
ભારતીયો TAG6
મારા TAG7
ભાઈબહેન TAG6
મારા TAG7
દેશને TAG6
ચાહું TAG7
છું TAG6
તેના TAG7
સમૃદ્ધ TAG6
વૈવિધ્યપૂર્ણ TAG7
વારસાનો TAG6
મને TAG7


In [7]:
guj_dict = {
    "હું":"PRON","તું":"PRON","એ":"PRON","તમે":"PRON",
    "છું":"AUX","છે":"AUX","હતું":"AUX","હતા":"AUX",
    "જાઉં":"VERB","કરું":"VERB","ખાઉં":"VERB","આવો":"VERB",
    "શાળા":"NOUN","માનવ":"NOUN","મિત્ર":"NOUN","ઘર":"NOUN",
}

def rule_tag_sentence(sentence):
    words = nltk.word_tokenize(sentence)
    return [(w, guj_dict.get(w,"UNK")) for w in words]

print(rule_tag_sentence("હું શાળા જાઉં છું"))


[('હું', 'PRON'), ('શાળા', 'NOUN'), ('જાઉં', 'VERB'), ('છું', 'AUX')]


In [24]:
# test_data = [
#     ("હું શાળા જાઉં છું", ["PRON","NOUN","VERB","AUX"]),
#     ("તું ઘર આવો", ["PRON","NOUN","VERB"]),
# ]

test_data = [
    ("હું શાળા જાઉં છું", ["PRON","NOUN","VERB","AUX"]),
    ("તું ઘર આવો", ["PRON","NOUN","VERB"]),
    ("આ મારો મિત્ર છે", ["DET","PRON","NOUN","AUX"]),
    ("બિલાડી ઝડપથી દોડે છે", ["NOUN","ADV","VERB","AUX"]),
    ("હું ભાત ખાઉં છું", ["PRON","NOUN","VERB","AUX"]),
]


def evaluate(tests):
    y_true, y_pred = [], []
    for sent, true_tags in tests:
        tagged = rule_tag_sentence(sent)
        pred = [t for _,t in tagged]
        y_true.extend(true_tags)
        y_pred.extend(pred)
    return y_true, y_pred

true_tags, pred_tags = evaluate(test_data)

print("True:", true_tags)
print("Pred:", pred_tags)

correct = sum(t1==t2 for t1,t2 in zip(true_tags,pred_tags))
acc = correct/len(true_tags)

print(f"Accuracy: {acc*100:.2f}%")


True: ['PRON', 'NOUN', 'VERB', 'AUX', 'PRON', 'NOUN', 'VERB', 'DET', 'PRON', 'NOUN', 'AUX', 'NOUN', 'ADV', 'VERB', 'AUX', 'PRON', 'NOUN', 'VERB', 'AUX']
Pred: ['PRON', 'NOUN', 'VERB', 'AUX', 'PRON', 'NOUN', 'VERB', 'UNK', 'UNK', 'NOUN', 'AUX', 'UNK', 'UNK', 'UNK', 'AUX', 'PRON', 'UNK', 'VERB', 'AUX']
Accuracy: 68.42%
