<a href="https://colab.research.google.com/github/mokshi0824/Natural-Language-Processing-Tasks/blob/main/Task_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import treebank
# from nltk.sklearn.linear_model import LogisticRegression # This module does not exist
from sklearn.metrics import accuracy_score
import random

# Fetch the Penn Treebank sample and the mapping to the universal tagset.
nltk.download('treebank')
nltk.download('universal_tagset')

# Load every tagged sentence as a list of (word, tag) pairs.
sentences = list(treebank.tagged_sents(tagset='universal'))
# Shuffle with a dedicated, seeded generator so the train/test split (and
# therefore the reported accuracies) is reproducible from run to run without
# touching the global random state.
random.Random(42).shuffle(sentences)

# First 3000 shuffled sentences train the models; the remainder is held out.
train_data = sentences[:3000]
test_data = sentences[3000:]

# ==== HMM with Viterbi Decoding ====
from nltk.probability import LidstoneProbDist
from nltk.tag import hmm

trainer = hmm.HiddenMarkovModelTrainer()
# Without an explicit estimator the trainer uses an unsmoothed maximum
# likelihood estimate, so any word that never occurred in the training data
# gets zero emission probability under every tag and the tagger's accuracy
# on held-out text collapses. Lidstone smoothing reserves a small amount of
# probability mass for unseen events.
hmm_tagger = trainer.train_supervised(
    train_data,
    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins),
)

# accuracy() is the replacement for the deprecated evaluate().
hmm_accuracy = hmm_tagger.accuracy(test_data)
print("HMM (Viterbi) Accuracy:", hmm_accuracy)

# ==== Feature Extraction ====
def extract_features(sentence, i):
    """Build the feature dict for the token at position ``i`` of ``sentence``.

    Args:
        sentence: Indexable sequence of word strings.
        i: Index of the token to describe.

    Returns:
        A dict with the word itself, whether its first character is
        uppercase, whether it consists only of digits, its first character,
        its last one and last two characters, and the preceding word
        (``'<START>'`` when ``i`` is not greater than 0).
    """
    word = sentence[i]
    return {
        'word': word,
        # Slices instead of word[0] / word[-1]: an empty token then yields
        # empty/False features rather than raising IndexError.
        'is_capitalized': word[:1].isupper(),
        'is_digit': word.isdigit(),
        'prefix-1': word[:1],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'prev_word': sentence[i - 1] if i > 0 else '<START>',
    }

# ==== Prepare train and test sets ====
def prepare_dataset(tagged_sents):
    """Flatten tagged sentences into parallel feature and label lists.

    Args:
        tagged_sents: Iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds one feature dict per token (as
        produced by ``extract_features``) and ``y`` holds the matching tags,
        in corpus order.
    """
    X, y = [], []
    for sent in tagged_sents:
        # zip(*sent) yields nothing for an empty sentence, which would make
        # the two-name unpacking below raise ValueError; skip such sentences.
        if not sent:
            continue
        words, tags = zip(*sent)
        for i, tag in enumerate(tags):
            X.append(extract_features(words, i))
            y.append(tag)
    return X, y

# ---- Per-token feature dicts and gold tags for both splits ----
X_train, y_train = prepare_dataset(train_data)
X_test, y_test = prepare_dataset(test_data)

# ---- Vectorise the feature dicts and fit the log-linear tagger ----
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

vec = DictVectorizer(sparse=True)
# The vectoriser is fitted on the training features only; the test features
# are transformed with that already-fitted mapping.
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

clf = LogisticRegression(max_iter=200)
clf.fit(X_train_vec, y_train)

# ---- Score the classifier on the held-out tokens ----
y_pred = clf.predict(X_test_vec)
log_linear_accuracy = accuracy_score(y_test, y_pred)
print("Log-Linear Model Accuracy:", log_linear_accuracy)

# ---- Side-by-side summary of both taggers ----
print("\nPerformance Comparison")
print(f"HMM (Viterbi): {hmm_accuracy:.4f}")
print(f"Log-Linear Model: {log_linear_accuracy:.4f}")

# ---- Completion message ----
print("\nThus the above program is executed successfully")

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  hmm_accuracy = hmm_tagger.evaluate(test_data)
  O[i, k] = self._output_logprob(si, self._symbols[k])
  O[i, k] = self._output_logprob(si, self._symbols[k])


HMM (Viterbi) Accuracy: 0.45041462176341174
Log-Linear Model Accuracy: 0.9588339820612625

Performance Comparison
HMM (Viterbi): 0.4504
Log-Linear Model: 0.9588

Thus the above program is executed successfully
