In [1]:
import re
from pathlib import Path
import string
from functools import reduce
from math import log
import itertools
import nltk
from nltk.tokenize import word_tokenize

In [2]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

In [3]:
# Enter smoothing or no smoothing.
smoothing = 1
filename = "textfile.txt"

In [4]:
# Loads file
# input - filename.txt
# returns a list of sentences seperated by newline in the textfile.
def load_file(filename):
    ### Write your Code here
    with open(filename , 'r') as file:
       lines = file.readlines()
    return lines

In [5]:
# Tokenizes the sentences meaning split the sentences into words seperated by the "white sapce".
# input - List of sentences
# returns a list of lists of each sentence being tokenized.
def tokenize_sentence(lines):
    ### Write your Code here
    lines = [i.strip("''").split(" ") for i in lines]
    print("No of sentences in Corpus: "+str(len(lines)))
    return lines

In [6]:
def prep_data(lines):
    cleaned_data = []

    for line in lines:
        if line:  # Check if the line is not empty
            # Remove punctuation and convert to lowercase
            cleaned_line = [word.lower() for word in line if word not in string.punctuation]

            # Add <s> at the beginning and </s> at the end of the sentence
            cleaned_line.insert(0, '<s>')
            cleaned_line.append('</s>')

            cleaned_data.append(cleaned_line)

    print("Number of sentences in Corpus: " + str(len(cleaned_data)) + "\n")
    return cleaned_data


In [7]:
dataset = load_file(filename)
dataset = tokenize_sentence(dataset)
dataset = prep_data(dataset)

No of sentences in Corpus: 10059
Number of sentences in Corpus: 10059



In [8]:
print(dataset)



In [9]:
# Creates the vocabulary file of the dataset.
def vocabulary(dataset):
    dataset_vocab = set(itertools.chain.from_iterable(dataset))
    # remove <s> and </s> from the vocabulary of the dataset
    dataset_vocab.remove('<s>')
    dataset_vocab.remove('</s>')
    dataset_vocab = list(dataset_vocab)
    dataset_vocab.append('<s>')
    dataset_vocab.append('</s>')
    return dataset_vocab

dataset_vocab = vocabulary(dataset)

In [10]:
len(dataset_vocab)

17246

In [32]:
from nltk.internals import Counter
# Counts the no. of times a word repeats (frequency of each word) in the corpus.
# input - list of lists of words obtained from "prep_data"
# returns - a dictionary defined as {word:frequency} for words of the corpus including <s> and </s>.
def freq_of_unique_words(lines):
    count = {}
    words = list(itertools.chain.from_iterable(lines))
    for line in words:
          if line in count:
              count[line] += 1
          else:
               count[line] = 1

    unique_word_count = len(count)
    print("No of unique words in corpus: " , str(unique_word_count))

    return count

In [33]:
unique_word_frequency = freq_of_unique_words(dataset)
#len(unique_word_frequency)

No of unique words in corpus:  17139


QUESTION 1A (5)

In [44]:
# Computes the bigram frequncies
# Bigram frequncies means the number of times a word appears after a given word in the corpus.
# inputs:
# lines - list of lists obtained from "prep_data".
# count - dictionary obtained from "freq_of_unique_words".
# returns - dictionary of bigram frequencies {(word|given word): count(word|given word)} --- count(word|given word)~int.
def compute_bigram_frequencies(lines):
   ### Write your Code here
    bigram_frequencies = {}

    for line in lines:
     for i in range(0,len(line)-1):
            word = line[i]
            next = line[i + 1]
            if word != '<s>' and word != '</s' and next != '</s>':
                bigram = next + '|' + word
                if bigram in bigram_frequencies:
                    bigram_frequencies[bigram] += 1
                else:
                    bigram_frequencies[bigram] = 1
    #The number of bigram_frquencies in the corpus
    print("Number of Bigram Frequencies in Corpus : " , len(bigram_frequencies))
    return bigram_frequencies

In [45]:
bigram_frequencies = compute_bigram_frequencies(dataset)
print("Bigram Frequencies : " , bigram_frequencies)
bigram_unique_word_count = len(unique_word_frequency)
print("\n"+"No of words in bigram: "+str(bigram_unique_word_count))

Number of Bigram Frequencies in Corpus :  115552

No of words in bigram: 17139


QUESTION 1B (5)

In [49]:
# Calculating bigram probability
# bigram probability means P(word|given word) = count(word|given word)/ count(given word).
# if count(word|given word) or count(given word) is 0 then probability is 0.
# input bigram_frquencies and count obtained from "freq_of_unique_words".
# returns dictionary of bigram probabilities {(word|given word): probabilty} --- probability is a float value.
def compute_bigram_probabilities(bigram_frequencies,count):
  ### Write your Code here
    bigram_probabilities = {}
    for bigram, probability in bigram_probabilities.items():
        print(f"'{bigram}': {probability}")
        word, word_ = bigram.split('|')
        count_ = count[word_]
        if count_ == 0:
           bigram_probabilities[bigram] = 0
        else:
            bigram_probabilities[bigram] = frequency / count_

    return bigram_probabilities

In [51]:
bigram_probabilities = compute_bigram_probabilities(bigram_frequencies,unique_word_frequency)
bigram_probabilities

{}

In [52]:
# Bigram frequncies of the test sentence computed using the bigram frequencies of the training data.
# add-one smoothing if 1, no smoothing if 0 ----- smoothing
def compute_bigram_count_test_sentence(given_word,word,smoothing):
    if smoothing==0:
        return 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))
    elif smoothing == 1:
        return 1 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))+1

In [63]:
# A table showing the bigram counts for test sentence.
def print_bigram_freq_test_sentence(test_sentence_vocab,smoothing):
    print("A table showing the bigram counts for the test sentence.\nSmoothing = " + str(smoothing))
    print("\t\t", end="")
    for word in test_sentence_vocab:
        if word != '<s>':
            print(word, end="\t\t")
    print("")
    for given_word in test_sentence_vocab:
        if given_word != '</s>':
            if smoothing == 1:
                print(unique_word_frequency + bigram_unique_word_count, end="\t")
            elif smoothing == 0:
                print(unique_word_frequency, end="\t")
            print(given_word, end="\t\t")
            for word in test_sentence_vocab:
                if word != '<s>':
                    print("{0:}".format(compute_bigram_count_test_sentence(given_word, word, smoothing)), end="\t\t")
            print("")

In [54]:
# Bigram probabilities of the test sentence computed using the bigram probabilities of the training data.
# add-one smoothing if 1, no smoothing if 0 ---- smoothing

def compute_bigram_prob_test_sentence(given_word,word,smoothing):
    bigram_freq = 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))
    uni_freq = 0 if unique_word_frequency.get((given_word))==None else unique_word_frequency.get((given_word))
    if smoothing==0:
        return 0 if bigram_probabilities.get((given_word,word))==None else bigram_probabilities.get((given_word,word))
    elif smoothing == 1:
        numerator = bigram_freq+1
        denominator = uni_freq+bigram_unique_word_count
        return 0.0 if numerator == 0 or denominator == 0 else float(numerator) / float(denominator)

QUESTION 1C (5)

In [55]:
# A table showing the bigram probabilities for test sentence.
def print_bigram_probabilities_test_sentence(test_sentence_vocab,smoothing):
   ### Write your Code here
   print("Bigram Probabilities :\n")
   print(f"{'Bigram':<20}{'Probability':<10}")

   for i in range(len(test_sentence_vocab) - 1):
        word, next = test_sentence_vocab[i], test_sentence_vocab[i + 1]
        bigram = f"{next}|{word}"
        probability = bigram_probabilities.get(bigram, smoothing)
        print(f"{bigram:<20}{probability:.6f}")

In [57]:
# Print the probability of the test sentence
# for add-one smoothing if 1, no smoothing if 0
def compute_prob_test_sentence(sentence,smoothing):
   ### Write your Code here
   prob = 1.0
   count = {}
   for i in range(0,len(sentence) - 1):
        word, next= sentence[i], sentence[i + 1]
        bigram = f"{next}|{word}"
        if bigram in bigram_probabilities:
            prob *= bigram_probabilities[bigram]
        else:
            if smoothing == 1:
                V = len(bigram_probabilities)
                probability *= smoothing / (V + count[word])
            elif smoothing == 0:
                probability = 0
                break

   return prob

In [58]:
# Test sentence here
test_sentences = [['upon this the captain started , and eagerly desired to know more .'],['thus , because no man can follow another into these halls .']]

In [64]:
for i in range (len(test_sentences)):
    test_sentence = test_sentences[i]
    print("!!!!!!!!!!The test Sentence is!!!!!!!!!!")
    print(test_sentence)
    test_sentence = tokenize_sentence(test_sentence)
    test_sentence = prep_data(test_sentence)

    # Vocabulary of test sentence
    test_sentence_vocab = vocabulary(test_sentence)

    test_sentence = list(itertools.chain.from_iterable(test_sentence))
    #test_sentence

    # A table showing the bigram counts for test sentence.
    print_bigram_freq_test_sentence(test_sentence_vocab,smoothing)

    # A table showing the bigram probabilities for test sentence.
    print_bigram_probabilities_test_sentence(test_sentence_vocab,smoothing)

    # The probability of the sentence under the trained model
    print("The probability of the sentence under the trained model"+"\nsmoothing ="+str(smoothing))
    print(compute_prob_test_sentence(test_sentence,0))

!!!!!!!!!!The test Sentence is!!!!!!!!!!
['upon this the captain started , and eagerly desired to know more .']
No of sentences in Corpus: 1
Number of sentences in Corpus: 1

A table showing the bigram counts for the test sentence.
Smoothing = 1
		the		more		know		eagerly		and		this		captain		started		to		desired		upon		</s>		
34278	the		1		1		1		1		1		1		1		1		1		1		1		1		
34278	more		1		1		1		1		1		1		1		1		1		1		1		1		
34278	know		1		1		1		1		1		1		1		1		1		1		1		1		
34278	eagerly		1		1		1		1		1		1		1		1		1		1		1		1		
34278	and		1		1		1		1		1		1		1		1		1		1		1		1		
34278	this		1		1		1		1		1		1		1		1		1		1		1		1		
34278	captain		1		1		1		1		1		1		1		1		1		1		1		1		
34278	started		1		1		1		1		1		1		1		1		1		1		1		1		
34278	to		1		1		1		1		1		1		1		1		1		1		1		1		
34278	desired		1		1		1		1		1		1		1		1		1		1		1		1		
34278	upon		1		1		1		1		1		1		1		1		1		1		1		1		
34278	<s>		1		1		1		1		1		1		1		1		1		1		1		1		
Bigram Probabilities :

Bigram              Probability
more|the            