# EXP4 — HMM POS Tagging (with Transition & Emission Matrices)

**This notebook is an edited version of `EXP4_HMM_POS_LOCAL.ipynb` that adds cells to extract, label, and save the HMM transition and emission matrices.**

**How to use:** run all cells in order. The HMM is trained unsupervised on your local Gujarati corpus and the notebook prints and saves the following:
- start probabilities (startprob_)
- transition matrix (transmat_) with states labelled S0..S{n}
- emission matrix (emissionprob_) with columns labelled by top vocabulary tokens

Generated: 2025-10-30T09:06:40.432919

In [1]:

!pip install -q nltk hmmlearn pandas numpy

import os, sys
import numpy as np
import pandas as pd
import nltk
from hmmlearn import hmm

nltk.download('punkt', quiet=True)
print("Imports ready")


Imports ready


In [2]:

# Local corpus path - edit if needed
corpus_dir = r"X:/DJ Sanghvi/sem 7/nlp/NLP_LAB_GYANGUJ/data/next"

texts = []
for fn in os.listdir(corpus_dir):
    if fn.endswith('.txt'):
        with open(os.path.join(corpus_dir, fn), 'r', encoding='utf-8') as f:
            texts.append(f.read())

corpus = " ".join(texts)
print("Loaded characters:", len(corpus))


Loaded characters: 3461251


In [3]:

# Tokenize (simple whitespace splitting) - adapt tokenization if needed
tokens = []
for part in nltk.tokenize.sent_tokenize(corpus):
    for t in part.split():
        if t.strip():
            tokens.append(t.strip())

print("Total tokens:", len(tokens))
# build vocab
vocab = sorted(list(set(tokens)))
word_to_id = {w:i for i,w in enumerate(vocab)}
id_to_word = {i:w for w,i in word_to_id.items()}
print("Vocab size:", len(vocab))


Total tokens: 507737
Vocab size: 60194


In [10]:

# Encode tokens as integer symbol IDs
import numpy as np
X = np.array([word_to_id[w] for w in tokens]).reshape(-1,1)

# Train unsupervised Multinomial HMM
n_states = 4  # adjust if desired
model = hmm.MultinomialHMM(n_components=n_states, n_iter=30, verbose=True, random_state=42)
model.fit(X)

print("HMM trained: n_states =", n_states)


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340
         1       0.00000000             +nan


HMM trained: n_states = 4


         2       0.00000000      +0.00000000


In [11]:

# Decode a short prefix to show state sequence
logprob, states = model.decode(X[:100], algorithm='viterbi')
decoded = [(id_to_word[int(X[i])], int(states[i])) for i in range(min(100, len(states)))]
print("First 30 decoded tokens and states:")
for w,s in decoded[:30]:
    print(f"{w:20s} -> S{s}")


First 30 decoded tokens and states:
જીવવિજ્ઞાન           -> S1
ધોરણ                 -> S2
ઉઠે                  -> S2
પ્રતિજ્ઞાપત્ર        -> S2
ભારત                 -> S2
મારો                 -> S2
દેશ                  -> S2
બધાં                 -> S2
ભારતીયો              -> S2
મારા                 -> S2
ભાઈબહેન              -> S2
મારા                 -> S2
દેશને                -> S2
ચાહું                -> S2
છું                  -> S2
તેના                 -> S2
સમૃદ્ધ               -> S2
વૈવિધ્યપૂર્ણ         -> S2
વારસાનો              -> S2
મને                  -> S2
ગર્વ                 -> S2
સદાય                 -> S2
તેને                 -> S2
લાયક                 -> S2
બનવા                 -> S2
પ્રયત્ત              -> S2
કરીશ                 -> S2
મારાં                -> S2
માતાપિતા,            -> S2
શિક્ષકો              -> S2


  decoded = [(id_to_word[int(X[i])], int(states[i])) for i in range(min(100, len(states)))]


In [12]:

# Extract HMM matrices
start_prob = model.startprob_        # shape (n_states,)
trans_mat = model.transmat_          # shape (n_states, n_states)
emission_mat = model.emissionprob_   # shape (n_states, n_symbols)

print("startprob_ shape:", start_prob.shape)
print("transmat_ shape:", trans_mat.shape)
print("emissionprob_ shape:", emission_mat.shape)


startprob_ shape: (4,)
transmat_ shape: (4, 4)
emissionprob_ shape: (4, 1)


In [13]:

# Create DataFrame for transition matrix with labelled rows/cols
state_labels = [f"S{i}" for i in range(model.n_components)]
df_trans = pd.DataFrame(trans_mat, index=state_labels, columns=state_labels)
print("Transition matrix (rows: from-state, cols: to-state):")
display(df_trans.round(4))

# Save transition matrix
df_trans.to_csv('/exp4_transition_matrix.csv', index=True)
print("\nSaved transition matrix to /exp4_transition_matrix.csv")


Transition matrix (rows: from-state, cols: to-state):


Unnamed: 0,S0,S1,S2,S3
S0,0.1997,0.0,0.7986,0.0017
S1,0.0449,0.1824,0.7345,0.0382
S2,0.2389,0.0088,0.7522,0.0001
S3,0.9136,0.0033,0.0831,0.0001



Saved transition matrix to /exp4_transition_matrix.csv


In [14]:

# For emission matrix, showing full matrix is large (states x vocab). We'll show top-k emissions per state
top_k = 15
n_states, n_symbols = emission_mat.shape
top_words_per_state = {}

for s in range(n_states):
    probs = emission_mat[s]  # length n_symbols
    top_idx = np.argsort(-probs)[:top_k]
    top_words = [(id_to_word[i], float(probs[i])) for i in top_idx]
    top_words_per_state[f"S{s}"] = top_words

# Display top words per state
for s, lst in top_words_per_state.items():
    print(f"\nState {s} top {top_k} emissions:")
    for w,p in lst:
        print(f"  {w:20s} : {p:.5f}")

# Save a reduced emission CSV: rows = states, columns = top_k words (word:prob)
rows = []
for s, lst in top_words_per_state.items():
    row = {'state': s}
    for i, (w,p) in enumerate(lst):
        row[f"top{i+1}_word"] = w
        row[f"top{i+1}_prob"] = p
    rows.append(row)

df_emit_reduced = pd.DataFrame(rows)
df_emit_reduced.to_csv('/exp4_emission_topk.csv', index=False)
print("\nSaved reduced emission table to /exp4_emission_topk.csv")



State S0 top 15 emissions:
  !                    : 1.00000

State S1 top 15 emissions:
  !                    : 1.00000

State S2 top 15 emissions:
  !                    : 1.00000

State S3 top 15 emissions:
  !                    : 1.00000

Saved reduced emission table to /exp4_emission_topk.csv


In [None]:

test_data = [
    ("હું શાળા જાઉં છું",        ["PRON","NOUN","VERB","AUX"]),
    ("તું ઘર આવો",              ["PRON","NOUN","VERB"]),
    ("તે પુસ્તકો વાંચે છે",      ["PRON","NOUN","VERB","AUX"]),
    ("મિત્રો સાથે રમે છે",      ["NOUN","ADP","VERB","AUX"]),
    ("આજે વાતાવરણ સારો છે",    ["ADV","NOUN","ADJ","AUX"]),
]


gold_words = []
gold_tags  = []
for sent, tags in test_data:
    words = sent.split()
    gold_words += words
    gold_tags  += tags


test_ids = [word_to_id[w] for w in gold_words if w in word_to_id]
test_X   = np.array(test_ids).reshape(-1,1)

# ---- Predict HMM states ----
_, pred_states = model.decode(test_X, algorithm="viterbi")
pred_states = list(pred_states)

# ---- Map HMM states -> Gold POS tags using majority vote ----
from collections import defaultdict, Counter

state_tag_map = defaultdict(list)
idx = 0
for w, tag in zip(gold_words, gold_tags):
    if w in word_to_id:
        state_tag_map[pred_states[idx]].append(tag)
        idx += 1

final_map = {
    state: Counter(tags).most_common(1)[0][0] 
    for state, tags in state_tag_map.items()
}

print("State → POS tag mapping:")
for st, tg in final_map.items():
    print(f"S{st} → {tg}")

# ---- Convert predicted states to predicted tags ----
pred_tags = [final_map[s] for s in pred_states]

# ---- Compute accuracy ----
from sklearn.metrics import accuracy_score, classification_report

print("\n✅ Accuracy:", round(accuracy_score(gold_tags[:len(pred_tags)], pred_tags) * 100, 2), "%")
print("\nClassification Report:")
print(classification_report(gold_tags[:len(pred_tags)], pred_tags))


State → POS tag mapping:
S1 → NOUN
S2 → NOUN

✅ Accuracy: 33.33 %

Classification Report:
              precision    recall  f1-score   support

         AUX       0.00      0.00      0.00         2
        NOUN       0.33      1.00      0.50         4
        PRON       0.00      0.00      0.00         3
        VERB       0.00      0.00      0.00         3

    accuracy                           0.33        12
   macro avg       0.08      0.25      0.12        12
weighted avg       0.11      0.33      0.17        12



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
