Import libraries

In [104]:
import math
import os
from tqdm import tqdm

cwd = os.getcwd()

In [105]:
# Read the data and build vocabulary
class DatasetLoader:
    def __init__(self, filename):
        self.raw_data = self.read_data(filename)
        self.vocab = {}
        self.tokenized_data = []
    
    def tokenize(self, data):
        tokenized_data = []
        for sentence in data:
            tokenized_data.append(['<START>'] + sentence.split() + ['<STOP>'])
        return tokenized_data
        
    def read_data(self, filename):
        with open(cwd + '/data/' + filename, "r", encoding="UTF-8") as f:
            lines = [line.rstrip() for line in f]
        return lines
    
    def process(self):
        # tokenize data
        self.data = self.tokenize(self.raw_data)

        # create a dictionary of tokens with their counts
        for sentence in self.data:
            for word in sentence:
                self.vocab[word] = self.vocab.get(word, 0) + 1

        # remove tokens with count less than 3
        for t in self.vocab:
            if self.vocab[t] < 3:
                self.vocab[t] = self.vocab.get(t, 0) + 1
                self.vocab[t] = 0
        self.vocab = {k: v for (k, v) in self.vocab.items() if v > 0}

        # replace tokens with count less than 3 with <UNK>
        for sentence in self.data:
            for i, word in enumerate(sentence):
                if self.vocab.get(word, 0) < 3:
                    sentence[i] = '<UNK>'
                    self.vocab['<UNK>'] = self.vocab.get('<UNK>', 0) + 1
                    

In [106]:
# Load the training data, tokenize it, and create a vocabulary
dataset = DatasetLoader("1b_benchmark.train.tokens")
dataset.process()

print("Vocabulary size: ", len(dataset.vocab)-1)  # -1 for <START> token

Vocabulary size:  26602


##### Language Model

In [107]:
# language model
class LanguageModel:
    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab
        self.M = 0  # total tokens
        self.vocab_size = len(vocab)
        # unigrams, bigrams, and trigrams
        self.unigrams = {}
        self.bigrams = {}
        self.trigrams = {}
        # probabilities for unigrams, bigrams, and trigrams
        self.unigram_probs = {}
        self.bigram_probs = {}
        self.trigram_probs = {}
        

# we need the language model to be a class-based to get object from it
# and use it for train, test, and dev data

# create a language model
train_lm = LanguageModel(dataset.data, dataset.vocab)

##### Create N-grams

In [108]:
  # find total number of tokens 
    # present in the train data
def find_total_tokens(lm):
    counter = 0
    for sentence in lm.data:
        counter += len(sentence)-1
    return counter

train_lm.M = find_total_tokens(train_lm)
print("Total number of tokens in the training data: ", train_lm.M)


def create_ngrams(sentences):
    unigrams, bigrams, trigrams = {}, {}, {}

    for sentence in tqdm(sentences):
        for i in range(len(sentence)):
            # unigrams
            unigrams[sentence[i]] = unigrams.get(sentence[i], 0) + 1

            # bigrams
            if i < len(sentence) - 1:
                bigram = (sentence[i], sentence[i+1])
                bigrams[bigram] = bigrams.get(bigram, 0) + 1

            # trigrams
            if i < len(sentence) - 2:
                trigram = (sentence[i], sentence[i+1], sentence[i+2])
                trigrams[trigram] = trigrams.get(trigram, 0) + 1

    return unigrams, bigrams, trigrams


# create unigrams, bigrams, and trigrams
unigrams, bigrams, trigrams = create_ngrams(train_lm.data)
train_lm.unigrams = unigrams
train_lm.bigrams = bigrams
train_lm.trigrams = trigrams


print("# of unigrams: ", len(unigrams))
print("# of bigrams: ", len(bigrams))
print("# of trigrams: ", len(trigrams))

Total number of tokens in the training data:  1622905


100%|██████████| 61530/61530 [00:03<00:00, 19559.03it/s]


# of unigrams:  26603
# of bigrams:  510391
# of trigrams:  1111319


##### Getting probabilities of unigrams, bigrams and trigrams

In [109]:

# get unigram probabilities
def get_unigram_probs(lm):
    probs = {}
    for u in tqdm(lm.unigrams):
        probs[u] = unigrams[u] / lm.M
    return probs


# get bigram probabilities
def find_probs_bigram(lm):
    probs = {}
    # find the probability for each unique bigram
    for b in tqdm(lm.bigrams):
        if b[0] not in lm.unigrams:
            probs[b] = 0
        else:
            probs[b] = lm.bigrams[b] / lm.unigrams[b[0]]
    return probs


# get trigram probabilities
def find_probs_trigram(lm):
    probs = {}
    # find the probability for each unique trigram
    for t in tqdm(lm.trigrams):
        if t[0:2] not in lm.bigrams:
            probs[t] = 0
        else:
            probs[t] = lm.trigrams[t] / lm.bigrams[t[0:2]]
    return probs


# get unigram, bigram, and trigram probabilities
print("Calc-ing unigram probs ...")
train_lm.unigram_probs = get_unigram_probs(train_lm)
print("Calc-ing bigram probs ...")
train_lm.bigram_probs = find_probs_bigram(train_lm)
print("Calc-ing trigram probs ...")
train_lm.trigram_probs = find_probs_trigram(train_lm)


Calc-ing unigram probs ...


100%|██████████| 26603/26603 [00:00<00:00, 1474126.66it/s]


Calc-ing bigram probs ...


100%|██████████| 510391/510391 [00:00<00:00, 590503.28it/s]


Calc-ing trigram probs ...


100%|██████████| 1111319/1111319 [00:02<00:00, 371077.96it/s]


In [110]:
st = 400

p = list(train_lm.bigram_probs.items())[st:st+5]
print("Bigram probabilities: \n", p)

print()

t = list(train_lm.trigram_probs.items())[st:st+5]
print("Trigram probabilities: \n", t)


Bigram probabilities: 
 [(('last', 'year'), 0.23150816522574447), (('year', '.'), 0.23281393217231897), (('<START>', 'Bush'), 0.000520071509832602), (('Bush', 'is'), 0.04722222222222222), (('is', 'remembered'), 0.0002680246582685607)]

Trigram probabilities: 
 [(("'", 'Coming', 'Home'), 1.0), (('Coming', 'Home', "'"), 0.5), (('Home', "'", 'was'), 1.0), (("'", 'was', 'released'), 0.4), (('was', 'released', 'in'), 0.11428571428571428)]


##### Perplexity

In [111]:
# Now we can calculate the perplexity

def get_log_prob(prob):
    if prob == 0:  # if prob == 0, log(prob) = -inf
        return 0
    else:
        return math.log2(prob)


def get_M(data):
    M = 0
    for sentence in data:
        M += len(sentence) - 1
    return M


# calculate perplexity of unigram model
def unigram_perplexity(lm, data):
    log_prob = 0
    for sentence in data:
        for word in sentence:
            if word == '<START>':
                continue
            log_prob += get_log_prob(lm.unigram_probs.get(word, 0))
    return math.pow(2, -log_prob / get_M(data))


# calculate perplexity of bigram model
def bigram_perplexity(lm, data):
    log_prob = 0
    for sentence in data:
        for i in range(len(sentence)):            
            bigram = (sentence[i-1], sentence[i])
            log_prob += get_log_prob(lm.bigram_probs.get(bigram, 0))
    return math.pow(2, -log_prob / get_M(data))



# calculate perplexity of trigram model
def trigram_perplexity(lm, data):
    log_prob = 0
    for sentence in data:
        for i in range(len(sentence)):
            # if the word is <START>, use the bigram of 2nd and 3rd word
            if "<START>" in sentence[i-2:i]:
                b = (sentence[i-1], sentence[i])
                prob = lm.bigram_probs.get(b, 0)
            else:
                t = (sentence[i-2], sentence[i-1], sentence[i])
                prob = lm.trigram_probs.get(t, 0)
            log_prob += get_log_prob(prob)

    return math.pow(2, -log_prob / get_M(data))


# calculate perplexity of unigram, bigram, and trigram models
print("Perplexity of unigram model: ", unigram_perplexity(train_lm, train_lm.data))
print("Perplexity of bigram model: ", bigram_perplexity(train_lm, train_lm.data))
print("Perplexity of trigram model: ", trigram_perplexity(train_lm, train_lm.data))



Perplexity of unigram model:  976.5437422251438
Perplexity of bigram model:  77.07346595596329
Perplexity of trigram model:  6.55994749293112


##### Linear Interpolation

In [112]:

def linear_interpolation_perplexity(lm, l1, l2, l3, data):
    log_prob = 0
    for sentence in data:
        for i in range(len(sentence)):
            # trigram
            # if the word is <START>, use the bigram of 2nd and 3rd word
            if "<START>" in sentence[i-2:i]:
                b = (sentence[i-1], sentence[i])
                prob = l3 * lm.bigram_probs.get(b, 0)
            else:
                t = (sentence[i-2], sentence[i-1], sentence[i])
                prob = l3 * lm.trigram_probs.get(t, 0)
            
            # bigram
            b = (sentence[i-1], sentence[i])
            prob += l2 * lm.bigram_probs.get(b, 0)
            
            # unigram
            if sentence[i] == "<START>":
                prob += l1 * 0
            else:
                prob += l1 * lm.unigram_probs.get(sentence[i], 0)

            log_prob += get_log_prob(prob)    

    return math.pow(2, -log_prob / get_M(data))


# calculate perplexity of unigram, bigram, and trigram models
l_list = [
    (0.5, 0.2, 0.3),
    (0.1, 0.2, 0.7),
    (0.1, 0.3, 0.6),
    (0.2, 0.3, 0.5),
    (0.3, 0.4, 0.3),
]

print("Linear interpolation perplexity: ")
linear_interpolation_perplexity(train_lm, 0.5, 0.2, 0.3, train_lm.data)

Linear interpolation perplexity: 


20.80836378514529

##### Debug

In [113]:
# debug code

def preprocess(sentences):
    # preprocess the sentences
    preprocessed = []
    for s in sentences:
        words = ['<START>'] + s.split() + ['<STOP>']
        for word in words:
            if word not in train_lm.vocab.keys():
                words[words.index(word)] = '<UNK>'
        preprocessed.append(words)
    return preprocessed


debug_data = ['HDTV .']
p = preprocess(debug_data)
print(p)
print("Debug mode: ")
print("Perplexity of unigram model: ", unigram_perplexity(train_lm, p))
print("Perplexity of bigram model: ", bigram_perplexity(train_lm, p))
print("Perplexity of trigram model: ", trigram_perplexity(train_lm, p))
print("Linear interpolation perplexity: ", linear_interpolation_perplexity(train_lm, 0.1, 0.3, 0.6, p))

[['<START>', 'HDTV', '.', '<STOP>']]
Debug mode: 
Perplexity of unigram model:  658.0445066285465
Perplexity of bigram model:  63.70757362051903
Perplexity of trigram model:  1.5874010519681994
Linear interpolation perplexity:  98.63583306527288


##### Dev and Test

In [114]:

# dev dataset
dev_ds = DatasetLoader("1b_benchmark.dev.tokens")
dev_ds.process()

dev_lm = LanguageModel(dev_ds.data, train_lm.vocab)

print("Dev dataset perplexity: \n")
print("Perplexity of unigram model: ", unigram_perplexity(train_lm, dev_lm.data))
print("Perplexity of bigram model: ", bigram_perplexity(train_lm, dev_lm.data))
print("Perplexity of trigram model: ", trigram_perplexity(train_lm, dev_lm.data))
print("Linear interpolation perplexity: ", linear_interpolation_perplexity(train_lm, 0.1, 0.3, 0.6, dev_lm.data))


print("\n")

# test dataset
test_ds = DatasetLoader("1b_benchmark.test.tokens")
test_ds.process()

test_lm = LanguageModel(test_ds.data, train_lm.vocab)

print("Test dataset perplexity: \n")
print("Perplexity of unigram model: ", unigram_perplexity(train_lm, test_lm.data))
print("Perplexity of bigram model: ", bigram_perplexity(train_lm, test_lm.data))
print("Perplexity of trigram model: ", trigram_perplexity(train_lm, test_lm.data))
print("Linear interpolation perplexity: ", linear_interpolation_perplexity(train_lm, 0.1, 0.3, 0.6, test_lm.data))


Dev dataset perplexity: 

Perplexity of unigram model:  625.7605102630304
Perplexity of bigram model:  32.77505030825911
Perplexity of trigram model:  2.977545749514274
Linear interpolation perplexity:  251.4977414572197


Test dataset perplexity: 

Perplexity of unigram model:  625.7229264239381
Perplexity of bigram model:  32.69482904011273
Perplexity of trigram model:  2.9797598678973816
Linear interpolation perplexity:  250.24339103648967
