# In [18]:
import numpy as np
import pandas as pd
from collections import defaultdict

# Load the training data: a headerless, tab-separated file. Column 0 is used
# below as the word and column 1 as its tag.
# The path uses a forward slash: the previous '\I' was an invalid escape
# sequence, and a backslash is not a path separator outside Windows.
train_data = pd.read_csv(
    'idn-tagged-corpus-master/Indonesian_Manually_Tagged_Corpus.tsv',
    header=None,
    sep='\t',
)

# Pull both columns out once; element-wise DataFrame indexing inside a loop
# is far slower than iterating plain lists.
corpus_words = train_data.iloc[:, 0].tolist()
corpus_tags = train_data.iloc[:, 1].tolist()

# Create a dictionary to store the word-tag pairs:
# word -> list of every tag observed for that word (duplicates kept).
word_tag_pairs = defaultdict(list)
for word, tag in zip(corpus_words, corpus_tags):
    word_tag_pairs[word].append(tag)

# Create a dictionary to store the tag transition probabilities:
# tag_transition_probs[tag1][tag2] = P(next tag is tag2 | current tag is tag1).
# NOTE(review): transitions are counted over every pair of consecutive rows
# in the file; no sentence-boundary handling is visible here - confirm that
# this is intended for the corpus layout.
tag_transition_probs = defaultdict(lambda: defaultdict(float))
for tag1, tag2 in zip(corpus_tags, corpus_tags[1:]):
    tag_transition_probs[tag1][tag2] += 1
# Normalise each row of counts so that it sums to 1.
for successors in tag_transition_probs.values():
    total = sum(successors.values())
    for tag2 in successors:
        successors[tag2] /= total

# Create a dictionary to store the word emission probabilities:
# word_emission_probs[word][tag] = fraction of the word's occurrences that
# carry that tag.
# NOTE(review): this is P(tag | word); a textbook HMM emission would be
# P(word | tag). Kept as-is so downstream results do not change - confirm
# which one is wanted.
word_emission_probs = defaultdict(lambda: defaultdict(float))
for word, tags in word_tag_pairs.items():
    tag_counts = word_emission_probs[word]
    for tag in tags:
        tag_counts[tag] += 1
    total = len(tags)
    for tag in tag_counts:
        tag_counts[tag] /= total

# Define the HMM class
class HMM:
    """Bigram tagger that decodes the best tag sequence with Viterbi.

    Parameters
    ----------
    tag_transition_probs : dict
        ``tag_transition_probs[prev_tag][tag]`` is the probability of
        ``tag`` directly following ``prev_tag``.
    word_emission_probs : dict
        ``word_emission_probs[word][tag]`` is the probability associated
        with ``word`` being labelled ``tag``.

    Entries missing from either table are treated as probability zero.
    Both tables are only read (via ``.get``), so plain dicts work and
    defaultdicts are not grown by lookups.
    """

    # Log-score charged for every zero-probability (unseen) event. It is
    # far below any realistic sentence log-probability, so a sequence with
    # fewer unseen events always wins, and sequences with the same number
    # of unseen events are ranked by their remaining probability mass.
    # This keeps decoding well defined for out-of-vocabulary words.
    _UNSEEN_LOG_SCORE = -1e9

    def __init__(self, tag_transition_probs, word_emission_probs):
        self.tag_transition_probs = tag_transition_probs
        self.word_emission_probs = word_emission_probs

    def _tag_set(self):
        """Return every tag in the transition table, in first-seen order."""
        tags = dict.fromkeys(self.tag_transition_probs)
        # Also pick up tags that only ever occur as a successor.
        for successors in self.tag_transition_probs.values():
            tags.update(dict.fromkeys(successors))
        return list(tags)

    def _log_scores(self, probs):
        """Convert probabilities to log-scores, penalising non-positive ones."""
        probs = np.asarray(probs, dtype=float)
        scores = np.full(probs.shape, self._UNSEEN_LOG_SCORE)
        seen = probs > 0
        scores[seen] = np.log(probs[seen])
        return scores

    def viterbi(self, sentence):
        """Return the most likely tag for each word of ``sentence``.

        Parameters
        ----------
        sentence : iterable of str
            The tokens to tag, in order.

        Returns
        -------
        list
            One tag per token; ``[]`` for an empty sentence.

        Raises
        ------
        ValueError
            If the transition table contains no tags at all.

        Notes
        -----
        Scores are accumulated in log space so that long sentences do not
        underflow to zero. The first word is scored by its emission only,
        since the model has no initial-tag distribution.
        """
        words = list(sentence)
        if not words:
            return []

        tags = self._tag_set()
        if not tags:
            raise ValueError('tag_transition_probs contains no tags')

        # transition[k, j]: log-score of moving from tags[k] to tags[j].
        transition = self._log_scores([
            [self.tag_transition_probs.get(prev_tag, {}).get(tag, 0.0)
             for tag in tags]
            for prev_tag in tags
        ])

        trellis = np.empty((len(words), len(tags)))
        # Integer dtype: the entries are indices into ``tags``.
        backtrace = np.zeros((len(words), len(tags)), dtype=int)

        for i, word in enumerate(words):
            word_probs = self.word_emission_probs.get(word, {})
            emission = self._log_scores(
                [word_probs.get(tag, 0.0) for tag in tags]
            )
            if i == 0:
                trellis[i] = emission
                continue
            # candidates[k, j]: best score ending in tags[k] at i-1, then
            # stepping to tags[j].
            candidates = trellis[i - 1][:, None] + transition
            backtrace[i] = candidates.argmax(axis=0)
            trellis[i] = candidates.max(axis=0) + emission

        # Follow the back-pointers from the best final state.
        best = int(trellis[-1].argmax())
        path = [best]
        for i in range(len(words) - 1, 0, -1):
            best = int(backtrace[i, best])
            path.append(best)
        path.reverse()

        return [tags[j] for j in path]

# Build the tagger from the two probability tables.
hmm = HMM(
    tag_transition_probs=tag_transition_probs,
    word_emission_probs=word_emission_probs,
)

# Run the decoder on one sample sentence, given as a list of tokens,
# and show the tags it predicts.
sentence = [
    'monyet',
    'di',
    'ibukota',
    'India',
    'mengganggu',
    'ketertiban',
]
tag_sequence = hmm.viterbi(sentence=sentence)
print('Predicted tags:', tag_sequence)

# Output:
# Predicted tags: ['NN', 'IN', 'NNP', 'NNP', 'VB', 'NN']
