In [231]:
import numpy as np
import spacy
from datasets import load_dataset

PROB = 0
WORD = 1
UNIGRAM = 1
BIGRAM = 2

In [232]:
import json


class Corpus:
    def __init__(self):
        self.nlp = None
        self.unigram_data = {}  # {w:count}
        self.bigram_data = {}  # {w_prev: {w:count}}
        self.M = 0
        self.m = 0
        self.unigram_data_size = 0
        self.bigram_data_size = {}

    def add_to_unigram(self, w):
        self.unigram_data_size += 1
        if w in self.unigram_data:
            self.unigram_data[w] += 1
        else:
            self.unigram_data[w] = 1

    def add_to_bigram(self, w, w_prev):
        if w_prev not in self.bigram_data:
            self.bigram_data[w_prev] = {}
            self.bigram_data_size[w_prev] = 0
        self.bigram_data_size[w_prev] += 1
        w_prev_dict = self.bigram_data[w_prev]
        if w in w_prev_dict:
            w_prev_dict[w] += 1
        else:
            w_prev_dict[w] = 1

    def load_data(self):
        self.nlp = spacy.load("en_core_web_sm")
        dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split="train")
        for text in dataset['text']:
            self.m +=1
            doc = self.nlp(text)
            w_prev = 'START'
            for w in doc:
                if w.is_alpha:
                    self.M +=1
                    self.add_to_unigram(w.lemma_)
                    self.add_to_bigram(w.lemma_, w_prev)
                    w_prev = w.lemma_  # todo: check first word in extreme cases is indeed START

        with open('unigram_data.txt', 'w') as f:
            json.dump(self.unigram_data, f, ensure_ascii=False)
        with open('bigram_data.txt', 'w') as f:
            json.dump(self.bigram_data, f, ensure_ascii=False)
        with open('metadata.txt', 'w') as f:
            json.dump({'M': self.M , 'm':self.m , 'unigram_data_size': self.unigram_data_size  ,
                       'bigram_data_size' :self.bigram_data_size}, f, ensure_ascii=False)



In [233]:
class Unigram:

    def __init__(self, n):
        with open('unigram_data.txt') as json_file:
            self.corpus = json.load(json_file)
        with open('metadata.txt') as json_file:
            meta = json.load(json_file)
        self.M = meta['M'] #total num of tokens
        self.m = meta['m'] #total num of sentences
        self.max_prob = - np.inf
        self.max_prob_word = None
        self.corpus_size = meta['unigram_data_size']


    def train(self):
        for w in self.corpus.keys():
            count = self.corpus[w]
            prob_w = np.log(count / self.corpus_size)
            self.corpus[w] = prob_w
            if prob_w > self.max_prob:
                self.max_prob = prob_w
                self.max_prob_word = w


    def predict(self, sentence):
        return self.max_prob_word


    def probability(self, sentence):
        prob = 0
        for w in sentence:
            if w.lemma_ not in self.corpus:
                return - np.inf
            prob += self.corpus[w.lemma_]
        return prob


    def perplexity(self ,test_set):
        M = 0
        prob_sum = 0
        for sentence in test_set:
            M += len(sentence)
            prob_sum += self.probability(sentence)
        l = prob_sum / M
        return np.exp(-l)






In [234]:
class Bigram:

    def __init__(self, n):
        with open('bigram_data.txt') as json_file:
            self.corpus = json.load(json_file)
        with open('metadata.txt') as json_file:
            meta = json.load(json_file)
        self.M = meta['M'] #total num of tokens
        self.m = meta['m'] #total num of sentences
        self.max_probs = {}
        self.pair_counts = meta['bigram_data_size']


    def train(self):
        for first_w  in self.corpus.keys():
            next_w = self.corpus[first_w]
            self.max_probs[first_w] = [- np.inf, None]
            for w in next_w.keys():
                count = next_w[w]
                pair_prob = np.log(count / self.pair_counts[first_w])
                next_w[w] = pair_prob
                if pair_prob > self.max_probs[first_w][0]:
                    self.max_probs[first_w][PROB] = pair_prob
                    self.max_probs[first_w][WORD] = w


    def predict(self, sentence):
        last_word = sentence[-1]
        last_word = last_word.lemma_
        if last_word not in self.corpus:
            return "STOP"
        return self.max_probs[last_word][WORD]


    def probability(self, sentence):
        if sentence[0].lemma_ not in self.corpus['START']:
            return - np.inf
        prob = self.corpus['START'][sentence[0].lemma_]
        for i in range(2, len(sentence)):
            w_prev = sentence[i - 1].lemma_
            w_curr = sentence[i].lemma_
            if w_prev not in self.corpus:
                return - np.inf
            if w_curr not in self.corpus[w_prev]:
                return - np.inf
            prob += self.corpus[w_prev][w_curr]
        return prob


    def perplexity(self , test_set):
        M = 0
        prob_sum = 0
        for sentence in test_set:
            M += len(sentence)
            prob_sum += self.probability(sentence)
        l = prob_sum / M
        return np.exp(-l)

In [235]:
class LinearInterpolation:
    def __init__(self, unigram, bigram , l1 , l2):
        self.unigram_model = unigram
        self.bigram_model = bigram
        self.l1 = l1
        self.l2 = l2

    def probability(self, sentence):
        unigram_prob = self.unigram_model.probability(sentence)
        if unigram_prob == -np.inf:
            unigram_prob = 0
        bigram_prob = self.bigram_model.probability(sentence)
        if bigram_prob == -np.inf:
            bigram_prob = 0
        return (self.l1 * unigram_prob) + (self.l2 * bigram_prob)

    def perplexity(self , test_set):
        M = 0
        prob_sum = 0
        for sentence in test_set:
            M += len(sentence)
            prob_sum += self.probability(sentence)
        l = prob_sum / M
        return np.exp(-l)

In [236]:
# corpus = Corpus()
# corpus.load_data()

In [237]:
"""1. Train maximum-likelihood unigram and bigram language models based on the above training data."""

unigram_model = Unigram(UNIGRAM)


In [238]:
bigram_model = Bigram(BIGRAM)

In [239]:
unigram_model.train()

In [240]:
bigram_model.train()


In [241]:
"""2. Using the bigram model, continue the following sentence with the most probable word predicted by the model: “ I
have a house in ... """
nlp = spacy.load("en_core_web_sm")

sentence = nlp('I have a house in')
predicted_word = bigram_model.predict(sentence)
print(predicted_word)

the


In [242]:
""" 3. Using the bigram model:
(a) compute the probability of the following two sentences (for each sentence separately).
(b) compute the perplexity of both the following two sentences (treating them as a single test set with 2 sentences).

Brad Pitt was born in Oklahoma
The actor was born in USA
"""
sentence1 = nlp('Brad Pitt was born in Oklahoma')
sentence2 = nlp('The actor was born in USA')

sentence1_prob = bigram_model.probability(sentence1)
sentence2_prob = bigram_model.probability(sentence2)
print(f'sentence 1 probability: {sentence1_prob}')
print(f'sentence 2 probability: {sentence2_prob}')

sentence 1 probability: -inf
sentence 2 probability: -21.516756502901373


In [243]:
perplexity = bigram_model.perplexity([sentence1 , sentence2])
print(f'sentence 1 ans 2 perplexity: {perplexity}')

sentence 1 ans 2 perplexity: inf


In [244]:
"""4. Now we use linear interpolation smoothing between the bigram model and unigram model with λbigram = 2/3 and
λunigram = 1/3, using the same training data. Given this new model, compute the probability and the perplexity of the
same sentences such as in the previous question. """
interpolation = LinearInterpolation(unigram_model , bigram_model , 1/3 , 2/3)
sentence1_inter_prob = interpolation.probability(sentence1)
sentence2_inter_prob = interpolation.probability(sentence2)
print(f'sentence 1 interpolated probability: {sentence1_inter_prob}')
print(f'sentence 2 interpolated probability: {sentence2_inter_prob}')

sentence 1 interpolated probability: -16.742848003834535
sentence 2 interpolated probability: -26.45504299020583


In [245]:
perplexity = interpolation.perplexity([sentence1 , sentence2])
print(f'sentence 1 ans 2 interpolated perplexity: {perplexity}')

sentence 1 ans 2 interpolated perplexity: 36.591802850992856
