In [1]:
##
import os.path
import sys
import random
from operator import itemgetter
from collections import defaultdict

In [2]:
#----------------------------------------
#  Data input
#----------------------------------------

In [3]:
# Read a text file into a corpus (list of sentences (which in turn are lists of words))
# (taken from nested section of HW0)
def readFileToCorpus(f):
    """ Reads in the text file f which contains one sentence per line.
    """
    if os.path.isfile(f):
        file = open(f, "r") # open the input file in read-only mode
        i = 0 # this is just a counter to keep track of the sentence numbers
        corpus = [] # this will become a list of sentences
        print("Reading file ", f)
        for line in file:
            i += 1
            sentence = line.split() # split the line into a list of words
            #append this list as an element to the list of sentences
            corpus.append(sentence)
            if i % 1000 == 0:
    	#print a status message: str(i) turns int i into a string
    	#so we can concatenate it
                sys.stderr.write("Reading sentence " + str(i) + "\n")
        #endif
    #endfor
        return corpus
    else:
    #ideally we would throw an exception here, but this will suffice
        print("Error: corpus file ", f, " does not exist")
        sys.exit() # exit the script
    #endif
#enddef

In [4]:
# Constants
UNK = "UNK"     # Unknown word token
start = "<s>"   # Start-of-sentence token
end = "</s>"    # End-of-sentence-token

In [5]:
# Preprocess the corpus
def preprocess(corpus):
    #find all the rare words
    freqDict = defaultdict(int)
    for sen in corpus:
	    for word in sen:
	       freqDict[word] += 1
	#endfor
    #endfor

    #replace rare words with unk
    for sen in corpus:
        for i in range(0, len(sen)):
            word = sen[i]
            if freqDict[word] < 2:

                sen[i] = UNK
	    #endif
	#endfor
    #endfor

    #bookend the sentences with start and end tokens
    for sen in corpus:
        sen.insert(0, start)
        sen.append(end)
    #endfor

    return corpus
#enddef

In [6]:
def preprocessTest(vocab, corpus):
    #replace test words that were unseen in the training with unk
    for sen in corpus:
        for i in range(0, len(sen)):
            word = sen[i]
            if word not in vocab:
                sen[i] = UNK
	    #endif
	#endfor
    #endfor

    #bookend the sentences with start and end tokens
    for sen in corpus:
        sen.insert(0, start)
        sen.append(end)
    #endfor

    return corpus
#enddef

In [7]:
#--------------------------------------------------------------
# Language models and data structures
#--------------------------------------------------------------

In [8]:
class LanguageModel:
    def __init__(self, corpus):
        pass
    #enddef

    def generateSentence(self):
        pass
    #emddef

    def getSentenceProbability(self, sen):
        pass
    #enddef

    def getCorpusPerplexity(self, corpus):
        pass
    #enddef

    def generateSentencesToFile(self, numberOfSentences, filename):
        filePointer = open(filename, 'w+')
        for i in range(0,numberOfSentences):
            sen = self.generateSentence()
            prob = self.getSentenceProbability(sen)

            stringGenerated = str(prob) + " " + " ".join(sen)
            print(stringGenerated, end="\n", file=filePointer)

	#endfor
    #enddef
#endclass

In [9]:
import math

In [10]:
# Unigram language model
class UnigramModel(LanguageModel):
    def __init__(self, corpus):
        self.counts = defaultdict(float)
        self.total = 0.0
        self.train(corpus)
    #endddef

    # Add observed counts from corpus to the distribution
    def train(self, corpus):
        for sen in corpus:
            for word in sen:
                if word == '<s>':
                    continue
                self.counts[word] += 1.0
                self.total += 1.0
            #endfor
        #endfor
    #enddef

    # Returns the probability of word in the distribution
    def prob(self, word):
        return self.counts[word]/self.total
    #enddef

    # Generate a single random word according to the distribution
    def draw(self):
        rand = random.random()
        for word in self.counts.keys():
            rand -= self.prob(word)
            if rand <= 0.0:
                return word
	    #endif
	#endfor
    #enddef

    def generateSentence(self):
        sentence = []
        sentence.append('<s>')
        while True:
            word = self.draw()
            if word == '</s>':
                sentence.append(word)
                break
            sentence.append(word)
        return sentence

    def getSentenceProbability(self, sen):
        senProbability = 1.0
        for word in sen:
            senProbability *= self.prob(word)
        return senProbability

    def getCorpusPerplexity(self, corpus):
        corpusProbability = 1.0
        word_count = 0.0
        for sen in corpus:
            for word in sen:
                corpusProbability *= self.prob(word)
                word_count += 1
        normalized_corpusProbability = corpusProbability / word_count
        corpusPerplexity = -normalized_corpusProbability
        return corpusPerplexity
    #enddef
#endclass

In [11]:
#Smoothed unigram language model
class SmoothedUnigramModel(LanguageModel):
    def __init__(self, corpus):
        self.counts = defaultdict(float)
        self.total = 0.0
        self.train(corpus)
        self.vocab_size = 0.0
    #endddef

    # Add observed counts from corpus to the distribution
    def train(self, corpus):
        for sen in corpus:
            for word in sen:
                if word == '<s>':
                    continue
                self.counts[word] += 1.0
                self.total += 1.0
            #endfor
        #endfor
        self.vocab_size = len(self.counts)
    #enddef

    # Returns the probability of word in the distribution
    def prob(self, word):
        return (self.counts[word]+1.0)/(self.total+self.vocab_size)
    #enddef

    # Generate a single random word according to the distribution
    def draw(self):
        rand = random.random()
        for word in self.counts.keys():
            rand -= self.prob(word)
            if rand <= 0.0:
                return word
	    #endif
	#endfor
    #enddef

    def generateSentence(self):
        sentence = []
        sentence.append('<s>')
        while True:
            word = self.draw()
            if word == '</s>':
                sentence.append(word)
                break
            sentence.append(word)
        return sentence

    def getSentenceProbability(self, sen):
        senProbability = 0.0
        for word in sen:
            senProbability += math.log(self.prob(word))
        senProbability = math.exp(senProbability)
        return senProbability

    def getCorpusPerplexity(self, corpus):
        corpusProbability = 0.0
        word_count = 0.0
        for sen in corpus:
            for word in sen:
                corpusProbability += math.log(self.prob(word))
                word_count += 1
        normalized_corpusProbability = corpusProbability / word_count
        corpusPerplexity = math.exp(-normalized_corpusProbability)
        return corpusPerplexity
    #enddef
#endclass

In [12]:
# Unsmoothed bigram language model
class BigramModel(LanguageModel):
    def __init__(self, corpus):
        self.unigram_counts = defaultdict(float)
        self.bigram_counts = defaultdict(lambda: defaultdict(float))
        self.total = 0.0
        self.train(corpus)
    #endddef

    # Add observed counts from corpus to the distribution
    def train(self, corpus):
        for sen in corpus:
            prev_word = '<s>'
            for word in sen:
                if word == '<s>':
                    continue
                self.unigram_counts[prev_word] += 1.0
                self.bigram_counts[prev_word][word] += 1.0
                self.total += 1.0
                prev_word = word
            #endfor
            self.unigram_counts[prev_word] += 1.0
            self.total += 1.0
        #endfor
        self.unigram_counts['<s>'] += len(corpus)
        self.total += len(corpus)
    #enddef

    # Returns the probability of word in the distribution
    def prob(self, prev_word, word):
        return self.bigram_counts[prev_word][word]/self.unigram_counts[prev_word]
    #enddef

    # Generate a single random word according to the distribution
    def draw(self, prev_word):
        rand = random.random()
        for word in self.bigram_counts[prev_word].keys():
            rand -= self.prob(prev_word, word)
            if rand <= 0.0:
                return word
	    #endif
	#endfor
        return word
    #enddef

    def generateSentence(self):
        sentence = []
        prev_word = '<s>'
        sentence.append(prev_word)
        while True:
            word = self.draw(prev_word)
            if word == '</s>':
                sentence.append(word)
                break
            sentence.append(word)
            prev_word = word
        return sentence

    def getSentenceProbability(self, sen):
        senProbability = 1.0
        prev_word = '<s>'
        for word in sen:
            senProbability *= self.prob(prev_word, word)
            prev_word = word
        return senProbability

    def getCorpusPerplexity(self, corpus):
        corpusProbability = 1.0
        word_count = 0.0
        for sen in corpus:
            prev_word = '<s>'
            for word in sen:
                corpusProbability *= self.prob(prev_word, word)
                word_count += 1
                prev_word = word
        normalized_corpusProbability = corpusProbability / word_count
        corpusPerplexity = -normalized_corpusProbability
        return corpusPerplexity
    #enddef
#endclass

In [13]:
# Smoothed bigram language model
class SmoothedBigramModelKN(LanguageModel):
    def __init__(self, corpus):
        self.unigram_counts = defaultdict(float)
        self.bigram_counts = defaultdict(lambda: defaultdict(float))
        self.total = 0.0
        self.train(corpus)
    #endddef

    # Add observed counts from corpus to the distribution
    def train(self, corpus):
        for sen in corpus:
            prev_word = '<s>'
            for word in sen:
                if word == '<s>':
                    continue
                self.unigram_counts[prev_word] += 1.0
                self.bigram_counts[prev_word][word] += 1.0
                self.total += 1.0
                prev_word = word
            #endfor
            self.unigram_counts[prev_word] += 1.0
            self.total += 1.0
        #endfor
        self.unigram_counts['<s>'] += len(corpus)
        self.total += len(corpus)
    #enddef

    # Returns the probability of word in the distribution
    def prob(self, prev_word, word):
        lambda1 = 0.5
        lambda2 = 0.5
        unigram_probability = self.unigram_counts[word]/self.total
        bigram_probability = self.bigram_counts[prev_word][word]/self.unigram_counts[prev_word]
        return ((lambda1 * unigram_probability) + (lambda2 * bigram_probability))
    #enddef

    # Generate a single random word according to the distribution
    def draw(self, prev_word):
        rand = random.random()
        for word in self.bigram_counts[prev_word].keys():
            rand -= self.prob(prev_word, word)
            if rand <= 0.0:
                return word
	    #endif
	#endfor
        return word
    #enddef

    def generateSentence(self):
        sentence = []
        prev_word = '<s>'
        sentence.append(prev_word)
        while True:
            word = self.draw(prev_word)
            if word == '</s>':
                sentence.append(word)
                break
            sentence.append(word)
            prev_word = word
        return sentence

    def getSentenceProbability(self, sen):
        senProbability = 1.0
        prev_word = '<s>'
        for word in sen:
            senProbability *= self.prob(prev_word, word)
            prev_word = word
        return senProbability

    def getCorpusPerplexity(self, corpus):
        corpusProbability = 0.0
        word_count = 0.0
        for sen in corpus:
            prev_word = '<s>'
            for word in sen:
                corpusProbability += math.log(self.prob(prev_word, word))
                word_count += 1
                prev_word = word
        normalized_corpusProbability = corpusProbability / word_count
        corpusPerplexity = math.exp(-normalized_corpusProbability)
        return corpusPerplexity
    #enddef
#endclass

In [14]:
#-------------------------------------------
# The main routine
#-------------------------------------------
if __name__ == "__main__":
    #read your corpora
    trainCorpus = readFileToCorpus('train.txt')
    trainCorpus = preprocess(trainCorpus)

    posTestCorpus = readFileToCorpus('pos_test.txt')
    negTestCorpus = readFileToCorpus('neg_test.txt')

    vocab = set(word for sent in trainCorpus for word in sent)


    posTestCorpus = preprocessTest(vocab, posTestCorpus)
    negTestCorpus = preprocessTest(vocab, negTestCorpus)

    unigramModel = UnigramModel(trainCorpus)
    smoothed_unigramModel = SmoothedUnigramModel(trainCorpus)
    bigramModel = BigramModel(trainCorpus)
    smoothed_bigramModel = SmoothedBigramModelKN(trainCorpus)

    unigramModel.generateSentencesToFile(20, 'unigram_output.txt')
    smoothed_unigramModel.generateSentencesToFile(20, 'smooth_unigram_output.txt')
    bigramModel.generateSentencesToFile(20, 'bigram_output.txt')
    smoothed_bigramModel.generateSentencesToFile(20, 'smooth_bigram_kn_output.txt')

    unigram_perplexity_4negTest = unigramModel.getCorpusPerplexity(negTestCorpus)
    unigram_perplexity_4posTest = unigramModel.getCorpusPerplexity(posTestCorpus)
    smoothed_unigram_perplexity_4negTest = smoothed_unigramModel.getCorpusPerplexity(negTestCorpus)
    smoothed_unigram_perplexity_4posTest = smoothed_unigramModel.getCorpusPerplexity(posTestCorpus)

    bigram_perplexity_4negTest = bigramModel.getCorpusPerplexity(negTestCorpus)
    bigram_perplexity_4posTest = bigramModel.getCorpusPerplexity(posTestCorpus)
    smoothed_bigram_perplexity_4negTest = smoothed_bigramModel.getCorpusPerplexity(negTestCorpus)
    smoothed_bigram_perplexity_4posTest = smoothed_bigramModel.getCorpusPerplexity(posTestCorpus)

    print('Unigram Perplexity (Negative Test Corpus):', unigram_perplexity_4negTest)
    print('Unigram Perplexity (Positive Test Corpus):', unigram_perplexity_4posTest)
    print('Smoothed Unigram Perplexity (Negative Test Corpus):', smoothed_unigram_perplexity_4negTest)
    print('Smoothed Unigram Perplexity (Positive Test Corpus):', smoothed_unigram_perplexity_4posTest)

    print('Bigram Perplexity (Negative Test Corpus):', bigram_perplexity_4negTest)
    print('Bigram Perplexity (Positive Test Corpus):', bigram_perplexity_4posTest)
    print('Smoothed Bigram Perplexity (Negative Test Corpus):', smoothed_bigram_perplexity_4negTest)
    print('Smoothed Bigram Perplexity (Positive Test Corpus):', smoothed_bigram_perplexity_4posTest)

Reading file  train.txt


Reading sentence 1000
Reading sentence 2000
Reading sentence 3000
Reading sentence 4000
Reading sentence 5000
Reading sentence 6000
Reading sentence 7000
Reading sentence 8000
Reading sentence 9000
Reading sentence 10000
Reading sentence 11000
Reading sentence 12000
Reading sentence 13000
Reading sentence 14000
Reading sentence 15000
Reading sentence 16000
Reading sentence 17000
Reading sentence 18000
Reading sentence 19000
Reading sentence 20000
Reading sentence 21000
Reading sentence 22000
Reading sentence 23000
Reading sentence 24000
Reading sentence 25000
Reading sentence 26000
Reading sentence 27000
Reading sentence 28000
Reading sentence 29000
Reading sentence 30000


Reading file  pos_test.txt
Reading file  neg_test.txt


Reading sentence 1000
Reading sentence 1000


Unigram Perplexity (Negative Test Corpus): -0.0
Unigram Perplexity (Positive Test Corpus): -0.0
Smoothed Unigram Perplexity (Negative Test Corpus): 793.4530597408765
Smoothed Unigram Perplexity (Positive Test Corpus): 804.4353683904808
Bigram Perplexity (Negative Test Corpus): -0.0
Bigram Perplexity (Positive Test Corpus): -0.0
Smoothed Bigram Perplexity (Negative Test Corpus): 240.89746020937125
Smoothed Bigram Perplexity (Positive Test Corpus): 234.33789802250433


In [15]:
#(Question#01)
#In Unigram model, the length of the generated sentences is controlled by the probability of occurrence of individual words in a
#training corpus. Rather in Bigram model, generation is controlled by trasition probabilities between consecutive words. This
#means that the next word in the sentence is chosen based on the probability of it following the previous word, which can result
#in more structured sentences as compared to unigram model.

#(Question#02)
#Yes, the models do assign drastically different probabilities to the different sets of sentences. This is because each model 
#captures different aspects of language structure. Unigram model considers each word independently, which may result in less
#realistic sentences. On the other hand, Bigram model take into account the relationships between adjacent words, resulting in 
#more realistic sentences.

In [16]:
#(Question#03)

In [17]:
bigramModel.generateSentencesToFile(5, 'bigram_output_(2).txt')
smoothed_bigramModel.generateSentencesToFile(5, 'smooth_bigram_kn_output_(2).txt')

In [18]:
#In my opinion, the Smoothed Bigram model produces better sentences because, it incorporates Linear 
#Interpolation smoothing to handle unseen word pairs, resulting in more realistic sentences as compared to the basic bigram 
#model.

In [19]:
#(Question#04)
#Perplexity values for each model:-
#Unigram Perplexity (Negative Test Corpus): -0.0
#Unigram Perplexity (Positive Test Corpus): -0.0
#Smoothed Unigram Perplexity (Negative Test Corpus): 793.4530597408765
#Smoothed Unigram Perplexity (Positive Test Corpus): 804.4353683904808
#Bigram Perplexity (Negative Test Corpus): -0.0
#Bigram Perplexity (Positive Test Corpus): -0.0
#Smoothed Bigram Perplexity (Negative Test Corpus): 240.89746020937125
#Smoothed Bigram Perplexity (Positive Test Corpus): 234.33789802250433

#The Smoothed Unigram model has the highest perplexity among the four models. This is because, the model fails to capture the
#underlying language patterns in the test corpus, leading to higher perplexity as compared to the other models.