In [169]:
import re
from datasets import load_dataset

In [170]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/lpossner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [171]:
ds = load_dataset("knowledgator/biomed_NER")
df = ds["train"].to_pandas()
dct = df.to_dict()

In [172]:
chars = r'[.,!@#$%&*(){}[\]:;\'"<>?/_+-=|\\~`]'

def clean_text(text):
    return re.sub(chars, '', text).strip()   

In [176]:
corpus = []
for ((_, text), (_, labels)) in zip(dct["text"].items(), dct["entities"].items()):
    indices = [(entity["start"], entity["end"]) for entity in labels.tolist()]
    entities = [entity["class"] for entity in labels.tolist()]
    other_indices = [(0, 0)] + indices
    other_indices = other_indices + [(len(text), len(text))]
    other_indices = [(char_index_1[1], char_index_2[0]) for char_index_1, char_index_2 in zip(other_indices[:-1], other_indices[1:])]
    other_entities = ["OTHER"] * len(other_indices)
    indices = indices + other_indices
    entities = entities + other_entities
    indices, entities = zip(*sorted(zip(indices, entities), key=lambda x: x[0]))
    words = [text[index[0]:index[1]] for index in indices]
    sentences = [(word, entity) for word, entity in zip(words, entities)]
    corpus.append(sentences)

corpus = [[(clean_text(words[0]), words[1]) for words in sentences] for sentences in corpus]

In [179]:
new_corpus = []
for sentences in corpus:
    new_sentences = []
    for words in sentences:
        if words[1] == "OTHER":
            tokenized_words = word_tokenize(words[0])
            if tokenized_words:
                new_words = [(word, "OTHER") for word in tokenized_words]
                new_sentences.extend(new_words)
        else:
            new_sentences.append(words)
    new_corpus.append(new_sentences)
        
corpus = new_corpus

In [14]:
import numpy as np
from collections import defaultdict, Counter


class HMM_NER:
    
    def __init__(self):
        self.states = []
        self.observations = []
        self.start_prob = defaultdict(float)
        self.trans_prob = defaultdict(lambda: defaultdict(float))
        self.emit_prob = defaultdict(lambda: defaultdict(float))

    def train(self, data):
        # Count occurrences to calculate probabilities
        start_counts = Counter()
        trans_counts = defaultdict(Counter)
        emit_counts = defaultdict(Counter)
        state_counts = Counter()

        for sentence in data:
            prev_state = None
            for word, state in sentence:
                state_counts[state] += 1
                emit_counts[state][word] += 1
                if prev_state is None:
                    start_counts[state] += 1
                else:
                    trans_counts[prev_state][state] += 1
                prev_state = state

        # Calculate initial probabilities
        total_starts = sum(start_counts.values())
        for state in start_counts:
            self.start_prob[state] = start_counts[state] / total_starts

        # Calculate transition probabilities
        for state in trans_counts:
            total_transitions = sum(trans_counts[state].values())
            for next_state in trans_counts[state]:
                self.trans_prob[state][next_state] = (
                    trans_counts[state][next_state] / total_transitions
                )

        # Calculate emission probabilities
        for state in emit_counts:
            total_emissions = sum(emit_counts[state].values())
            for word in emit_counts[state]:
                self.emit_prob[state][word] = emit_counts[state][word] / total_emissions

        self.states = list(state_counts.keys())
        self.observations = list({word for sentence in data for word, _ in sentence})

    def viterbi(self, sentence):
        V = [{}]
        path = {}

        # Initialize the base cases (t == 0)
        for state in self.states:
            V[0][state] = self.start_prob[state] * self.emit_prob[state].get(
                sentence[0], 1e-6
            )
            path[state] = [state]

        # Run Viterbi for t > 0
        for t in range(1, len(sentence)):
            V.append({})
            new_path = {}

            for curr_state in self.states:
                (prob, prev_state) = max(
                    (
                        V[t - 1][prev_state]
                        * self.trans_prob[prev_state].get(curr_state, 1e-6)
                        * self.emit_prob[curr_state].get(sentence[t], 1e-6),
                        prev_state,
                    )
                    for prev_state in self.states
                )

                V[t][curr_state] = prob
                new_path[curr_state] = path[prev_state] + [curr_state]

            path = new_path

        # Find the most probable state sequence
        (prob, final_state) = max(
            (V[len(sentence) - 1][state], state) for state in self.states
        )
        return path[final_state]

In [181]:
hmm_ner = HMM_NER()
hmm_ner.train(corpus)


In [189]:
sentence = """
Weed seed inactivation in soil mesocosms via biosolarization with mature compost and tomato processing waste amendments Biosolarization is a fumigation alternative that combines passive solar heating with amendment-driven soil microbial activity to temporarily create antagonistic soil conditions, such as elevated temperature and acidity, that can inactivate weed seeds and other pest propagules.
"""

sentence = sentence.strip().split(" ")
predicted_tags = hmm_ner.viterbi(sentence)
print(predicted_tags)


['ORGANISM', 'ORGANISM', 'FUNCTION', 'OTHER', 'CHEMICALS', 'OTHER', 'OTHER', 'ACTIVITY', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'ORGANISM', 'OTHER', 'OTHER', 'OTHER', 'ACTIVITY', 'OTHER', 'OTHER', 'ACTIVITY', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'CHEMICALS', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'CHEMICALS', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'ORGANISM', 'ORGANISM', 'OTHER', 'OTHER', 'ORGANISM', 'OTHER']


In [191]:
for word, tag in zip(sentence, predicted_tags):
    print(word, tag)

Weed ORGANISM
seed ORGANISM
inactivation FUNCTION
in OTHER
soil CHEMICALS
mesocosms OTHER
via OTHER
biosolarization ACTIVITY
with OTHER
mature OTHER
compost OTHER
and OTHER
tomato ORGANISM
processing OTHER
waste OTHER
amendments OTHER
Biosolarization ACTIVITY
is OTHER
a OTHER
fumigation ACTIVITY
alternative OTHER
that OTHER
combines OTHER
passive OTHER
solar OTHER
heating OTHER
with OTHER
amendment-driven OTHER
soil CHEMICALS
microbial OTHER
activity OTHER
to OTHER
temporarily OTHER
create OTHER
antagonistic OTHER
soil CHEMICALS
conditions, OTHER
such OTHER
as OTHER
elevated OTHER
temperature OTHER
and OTHER
acidity, OTHER
that OTHER
can OTHER
inactivate OTHER
weed ORGANISM
seeds ORGANISM
and OTHER
other OTHER
pest ORGANISM
propagules. OTHER
