In [1]:
import numpy as np
from collections import defaultdict

In [2]:
class BigramLM:
    """Unsmoothed maximum-likelihood bigram language model.

    Sentences are tokenised with ``str.split()``. After training:

    * ``vocab`` is a sorted list of every word that occurred in a bigram, so
      matrix indices are reproducible from one run to the next;
    * ``bigram_probs[i][j]`` estimates ``P(vocab[j] | vocab[i])`` as
      ``count(vocab[i], vocab[j]) / count(vocab[i], *)``;
    * rows of words that were never followed by another word are all zeros.
    """

    def __init__(self):
        """Create an empty, untrained model."""
        self.vocab = set()
        # bigram_counts[w1][w2]: how often w2 immediately followed w1.
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        # unigram_counts[w1]: how often w1 was followed by any word, i.e. the
        # denominator of the conditional probability estimate.
        self.unigram_counts = defaultdict(int)
        self.bigram_probs = None
        # Word -> row/column index in bigram_probs; rebuilt whenever the
        # probability matrix is recomputed.
        self._word_to_index = {}

    def learn_from_dataset(self, dataset):
        """Accumulate bigram counts from ``dataset`` and refresh the matrix.

        May be called more than once; counts from earlier calls are kept and
        the vocabulary grows accordingly.

        Args:
            dataset: iterable of sentences (strings).
        """
        # ``vocab`` is a list after a previous training call, so work on a
        # set copy instead of calling ``.add`` on the attribute directly.
        vocab = set(self.vocab)
        for sentence in dataset:
            tokens = sentence.split()
            for word1, word2 in zip(tokens, tokens[1:]):
                vocab.add(word1)
                vocab.add(word2)
                self.bigram_counts[word1][word2] += 1
                self.unigram_counts[word1] += 1

        # Set iteration order for strings changes between interpreter runs;
        # sorting pins the index of every word.
        self.vocab = sorted(vocab)
        self.calculate_bigram_probs()

    def calculate_bigram_probs(self):
        """Rebuild ``bigram_probs`` from the accumulated counts."""
        self.vocab = list(self.vocab)
        self._word_to_index = {word: i for i, word in enumerate(self.vocab)}
        num_words = len(self.vocab)
        self.bigram_probs = np.zeros((num_words, num_words))

        # Visit only the bigrams that were observed. Indexing the nested
        # defaultdicts for every vocabulary pair would insert a zero entry
        # for each unseen pair and bloat the count tables to V*V items.
        for word1, followers in self.bigram_counts.items():
            row = self._word_to_index.get(word1)
            total = self.unigram_counts.get(word1, 0)
            if row is None or total <= 0:
                continue
            for word2, count in followers.items():
                col = self._word_to_index.get(word2)
                if col is not None:
                    self.bigram_probs[row, col] = count / total

    # TODO: Kneser-Ney smoothing is not implemented yet; the probabilities
    # above are plain relative frequencies.

    def generate_next_word(self, current_word, sample=False):
        """Return a word to follow ``current_word``.

        Args:
            current_word: word whose successor is wanted.
            sample: when False (default) return the most probable successor,
                ties going to the word that sorts first; when True draw the
                successor at random in proportion to its probability using
                numpy's global random state.

        Returns:
            A word from ``vocab``.

        Raises:
            ValueError: if ``current_word`` is not in the vocabulary, or if
                it was never followed by another word in the training data.
        """
        word_index = self._word_to_index.get(current_word)
        if word_index is None:
            raise ValueError(f"{current_word} not found in the vocabulary.")

        next_word_probs = self.bigram_probs[word_index]
        # An all-zero row means the word only ever ended a sentence. Argmax
        # would then return vocab[0] and sampling would fail inside numpy,
        # so report the dead end explicitly.
        if not next_word_probs.any():
            raise ValueError(f"{current_word} has no observed next word.")

        if sample:
            # Renormalise so floating-point rounding in the row cannot trip
            # numpy's check that the probabilities sum to one.
            weights = next_word_probs / next_word_probs.sum()
            next_word_index = int(np.random.choice(len(weights), p=weights))
        else:
            next_word_index = int(np.argmax(next_word_probs))

        return self.vocab[next_word_index]

In [4]:
# Read the corpus one sentence per line. The context manager closes the file
# handle as soon as the lines are in memory; each line keeps its trailing
# newline, which the model's whitespace tokenisation ignores.
with open('../Dataset/corpus.txt') as corpus:
    dataset = corpus.readlines()

In [5]:
# Train a bigram model on the corpus lines loaded into `dataset`.
model = BigramLM()
model.learn_from_dataset(dataset)
# numpy abbreviates large arrays when printing, so only the corner entries of
# the vocabulary-by-vocabulary probability matrix appear in the output.
print(model.bigram_probs)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [6]:
# Start from the seed word and extend the sentence five times, each time
# asking the model for the successor of the most recently added word.
generated_sentence = ["im"]
current_word = generated_sentence[0]

for _ in range(5):
    generated_sentence.append(model.generate_next_word(generated_sentence[-1]))
    current_word = generated_sentence[-1]

generated_sentence

['im', 'feeling', 'a', 'little', 'bit', 'of']