In [72]:
import numpy as np
from collections import defaultdict
import random
# from utils import emotion_scores

In [98]:
class BigramLM:
    """Word-level bigram language model.

    Sentences are tokenised on whitespace. After training, ``bigram_probs`` is
    a dense ``(V, V)`` NumPy array where entry ``[i, j]`` holds
    ``P(vocab[j] | vocab[i])`` and ``vocab`` is an alphabetically sorted list,
    so row/column indices are reproducible across kernel restarts.

    Attributes:
        vocab: Empty set before training; sorted list of words afterwards.
        bigram_counts: ``bigram_counts[w1][w2]`` = times ``w2`` followed ``w1``.
        unigram_counts: ``unigram_counts[w1]`` = times ``w1`` started a bigram
            (a word seen only at the end of sentences has no entry).
        bigram_probs: Probability matrix, or ``None`` before training.
    """

    def __init__(self):
        self.vocab = set()
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_counts = defaultdict(int)
        self.bigram_probs = None
        # word -> row/column index in bigram_probs; rebuilt with the matrix.
        self._word_index = {}

    def learn_from_dataset(self, dataset):
        """Accumulate bigram counts from ``dataset`` and rebuild the matrix.

        May be called repeatedly; counts and vocabulary accumulate across
        calls. Sentences with fewer than two tokens contribute nothing.

        Args:
            dataset: Iterable of sentence strings.
        """
        # Start from a set copy so a second call works even though
        # self.vocab is stored as a list after the first call.
        vocab = set(self.vocab)
        for sentence in dataset:
            tokens = sentence.split()
            for word1, word2 in zip(tokens, tokens[1:]):
                vocab.add(word1)
                vocab.add(word2)
                self.bigram_counts[word1][word2] += 1
                self.unigram_counts[word1] += 1

        # Sorting makes the index <-> word mapping deterministic.
        self.vocab = sorted(vocab)
        self.calculate_bigram_probs_laplace()

    def _reindex(self):
        """Rebuild the word -> index map from ``self.vocab``; return its size."""
        self._word_index = {word: i for i, word in enumerate(self.vocab)}
        return len(self.vocab)

    def calculate_bigram_probs(self):
        """Fill ``bigram_probs`` with unsmoothed maximum-likelihood estimates.

        Rows for words that never start a bigram stay all-zero.
        """
        num_words = self._reindex()
        probs = np.zeros((num_words, num_words))

        # Visit only observed bigrams and read with .get(): indexing the
        # defaultdicts for every word pair would insert V*V zero entries.
        for word1, followers in self.bigram_counts.items():
            row = self._word_index.get(word1)
            total = self.unigram_counts.get(word1, 0)
            if row is None or total <= 0:
                continue
            for word2, count in followers.items():
                col = self._word_index.get(word2)
                if col is not None:
                    probs[row, col] = count / total

        self.bigram_probs = probs

    def calculate_bigram_probs_laplace(self):
        """Fill ``bigram_probs`` with add-one (Laplace) smoothed estimates.

        ``P(w2 | w1) = (count(w1, w2) + 1) / (count(w1) + V)``.
        """
        num_words = self._reindex()
        probs = np.zeros((num_words, num_words))

        for row, word1 in enumerate(self.vocab):
            denominator = self.unigram_counts.get(word1, 0) + num_words
            # Unseen pairs all share the same smoothed mass.
            probs[row, :] = 1 / denominator
            for word2, count in self.bigram_counts.get(word1, {}).items():
                col = self._word_index.get(word2)
                if col is not None:
                    probs[row, col] = (count + 1) / denominator

        self.bigram_probs = probs

    # TODO: Kneser-Ney smoothing is not implemented yet.

    def generate_next_word(self, current_word, sample=False):
        """Return a successor of ``current_word`` according to ``bigram_probs``.

        Args:
            current_word: A word from the vocabulary.
            sample: If False (default), return the most probable successor;
                ties resolve to the lowest index. Greedy decoding is
                deterministic and can fall into short loops. If True, draw
                the successor at random in proportion to its probability.

        Returns:
            The chosen next word.

        Raises:
            ValueError: If ``current_word`` is not in the vocabulary, or if
                ``sample`` is True and the word's row has no probability mass.
        """
        row = self._word_index.get(current_word)
        if row is None:
            raise ValueError(f"{current_word} not found in the vocabulary.")

        next_word_probs = self.bigram_probs[row]

        if sample:
            total = next_word_probs.sum()
            if not total > 0:
                raise ValueError(f"No successor probabilities available for {current_word}.")
            # Renormalise so np.random.choice accepts unsmoothed rows too.
            next_word_index = np.random.choice(len(next_word_probs), p=next_word_probs / total)
        else:
            next_word_index = int(np.argmax(next_word_probs))

        return self.vocab[next_word_index]

In [94]:
# Read the corpus as a list of raw lines (trailing newlines kept). The context
# manager closes the file handle, and the explicit encoding keeps the read
# independent of the platform's default locale.
with open('../Dataset/corpus.txt', encoding='utf-8') as corpus:
    dataset = corpus.readlines()

In [99]:
# Fit a bigram model on the corpus lines loaded into `dataset`, then print the
# resulting probability matrix.
model = BigramLM()
model.learn_from_dataset(dataset)
print(model.bigram_probs)

[[0.00018416 0.00018416 0.00018416 ... 0.00018416 0.00018416 0.00018416]
 [0.00018416 0.00018416 0.00018416 ... 0.00018416 0.00018416 0.00018416]
 [0.00018416 0.00018416 0.00018416 ... 0.00018416 0.00018416 0.00018416]
 ...
 [0.00018406 0.00018406 0.00018406 ... 0.00018406 0.00018406 0.00018406]
 [0.00018416 0.00018416 0.00018416 ... 0.00018416 0.00018416 0.00018416]
 [0.00018416 0.00018416 0.00018416 ... 0.00018416 0.00018416 0.00018416]]


In [107]:
# Seed the sentence with a random vocabulary word, then extend it with five
# model-predicted successors (six words in total).
generated_sentence = [random.choice(model.vocab)]
while len(generated_sentence) < 6:
    generated_sentence.append(model.generate_next_word(generated_sentence[-1]))
current_word = generated_sentence[-1]

generated_sentence

['familiarity', 'that', 'i', 'feel', 'like', 'i']

In [None]:
# NOTE(review): `emotion_scores` is not defined anywhere in the visible notebook;
# the `from utils import emotion_scores` line in the first cell is commented
# out, so this call presumably fails on a fresh kernel until that import is
# restored. Confirm before running.
emotion_scores("i am happy today")

In [None]:
import numpy as np
from collections import defaultdict

class BigramLM:
    """Bigram language model with an optional additive emotion bias.

    Sentences are tokenised on whitespace. ``bigram_probs`` is a dense
    ``(V, V)`` NumPy array of unsmoothed maximum-likelihood estimates,
    optionally shifted by ``beta * emotion_score``. ``vocab`` is an
    alphabetically sorted list after training, so indices are reproducible.

    Attributes:
        vocab: Empty set before training; sorted list of words afterwards.
        bigram_counts: ``bigram_counts[w1][w2]`` = times ``w2`` followed ``w1``.
        unigram_counts: ``unigram_counts[w1]`` = times ``w1`` started a bigram.
        bigram_probs: Score matrix, or ``None`` before training.
        emotion_scores: ``{label: score}`` dict set by ``set_emotion_scores``,
            or ``None`` if it has not been called.
    """

    def __init__(self):
        self.vocab = set()
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_counts = defaultdict(int)
        self.bigram_probs = None
        self.emotion_scores = None  # Placeholder for emotion scores
        # word -> row/column index in bigram_probs; rebuilt with the matrix.
        self._word_index = {}

    def learn_from_dataset(self, dataset):
        """Accumulate bigram counts from ``dataset`` and rebuild the matrix.

        May be called repeatedly; counts and vocabulary accumulate across
        calls. Sentences with fewer than two tokens contribute nothing.

        Args:
            dataset: Iterable of sentence strings.
        """
        # Start from a set copy so a second call works even though
        # self.vocab is stored as a list after the first call.
        vocab = set(self.vocab)
        for sentence in dataset:
            tokens = sentence.split()
            for word1, word2 in zip(tokens, tokens[1:]):
                vocab.add(word1)
                vocab.add(word2)
                self.bigram_counts[word1][word2] += 1
                self.unigram_counts[word1] += 1

        # Sorting makes the index <-> word mapping deterministic.
        self.vocab = sorted(vocab)
        self.calculate_bigram_probs()

    def calculate_bigram_probs(self, emotion=None, beta=0.0):
        """Recompute ``bigram_probs``, optionally adding an emotion bias.

        For every word that starts at least one bigram, each entry of its row
        is ``count(w1, w2) / count(w1)``; when ``emotion`` is truthy,
        ``beta * emotion_scores.get(emotion, 0)`` is added to every entry of
        that row. Rows of words that never start a bigram stay all-zero.
        Rows are not renormalised here.

        Args:
            emotion: Label to look up in ``self.emotion_scores``; falsy
                values disable the bias.
            beta: Weight applied to the emotion score.
        """
        self._word_index = {word: i for i, word in enumerate(self.vocab)}
        num_words = len(self.vocab)
        probs = np.zeros((num_words, num_words))

        bonus = 0.0
        if emotion:
            # Treat "scores never set" as "no scores" instead of failing
            # on None.get(...).
            bonus = beta * (self.emotion_scores or {}).get(emotion, 0)

        # Visit only observed counts and read with .get(): indexing the
        # defaultdicts for every word pair would insert V*V zero entries.
        for word1, total in self.unigram_counts.items():
            row = self._word_index.get(word1)
            if row is None or total <= 0:
                continue
            if emotion:
                probs[row, :] = bonus
            for word2, count in self.bigram_counts.get(word1, {}).items():
                col = self._word_index.get(word2)
                if col is not None:
                    probs[row, col] += count / total

        self.bigram_probs = probs

    def set_emotion_scores(self, sentence):
        """Store the emotion scores of ``sentence`` as a ``{label: score}`` dict.

        Relies on an ``emotion_scores`` function that is defined elsewhere and
        must be importable in this notebook; it is expected to return an
        iterable of mappings with ``'label'`` and ``'score'`` keys.
        """
        scores = emotion_scores(sentence)
        self.emotion_scores = {score['label']: score['score'] for score in scores}

    def generate_emotion_oriented_sample(self, start_word, emotion, beta=0.1, length=10):
        """Sample a sentence starting at ``start_word`` with an emotion bias.

        Args:
            start_word: First word; must be in the vocabulary.
            emotion: Emotion label used to bias the transition scores.
            beta: Weight of the emotion bias.
            length: Maximum number of words in the result. Generation stops
                early when the current word has no outgoing probability mass
                (for example a word seen only at the end of sentences).

        Returns:
            The generated words joined by single spaces.

        Raises:
            ValueError: If ``start_word`` is not in the vocabulary.
        """
        if start_word not in self.vocab:
            raise ValueError(f"The word '{start_word}' is not in the vocabulary.")

        # NOTE(review): the scores come from the start word alone, not from a
        # full sentence - confirm this is the intended input.
        self.set_emotion_scores(start_word)
        self.calculate_bigram_probs(emotion, beta)  # Rebuild with the emotion component

        current_word = start_word
        sentence = [current_word]
        for _ in range(length - 1):
            probabilities = self.bigram_probs[self._word_index[current_word]]
            total = probabilities.sum()
            if not total > 0:
                # Dead end: normalising an all-zero row would produce NaNs
                # and make np.random.choice raise.
                break

            next_word_idx = np.random.choice(len(self.vocab), p=probabilities / total)
            current_word = self.vocab[next_word_idx]
            sentence.append(current_word)

        return ' '.join(sentence)
