# In [None]:  (Jupyter notebook cell marker left over from the export)


import math
import os.path
import random
import sys
from collections import defaultdict
from operator import itemgetter
#----------------------------------------
#  Data input
#----------------------------------------

# Read a text file into a corpus (list of sentences (which in turn are lists of words))
# (taken from nested section of HW0)

def readFileToCorpus(file_path):
    """Load a one-sentence-per-line text file as a list of token lists.

    Every line is split on whitespace, so a blank line yields an empty
    sentence.  A progress note is written to stderr after every 1000 lines.

    Args:
        file_path: path of the UTF-8 text file to read.

    Returns:
        A list of sentences, each sentence being a list of word strings.

    Raises:
        SystemExit: if file_path is not an existing regular file (an error
            message is printed first).
    """
    if not os.path.isfile(file_path):
        # Ideally, we would throw an exception here, but this will suffice
        print("Error: corpus file ", file_path, " does not exist")
        sys.exit()  # Exit the script
    # end if

    sentences = []  # one list of words per input line
    with open(file_path, "r", encoding="utf-8") as handle:
        print("Reading file ", file_path)
        for line_number, raw_line in enumerate(handle, start=1):
            sentences.append(raw_line.split())
            if line_number % 1000 == 0:
                # Status message so long reads show signs of life
                sys.stderr.write("Reading sentence " + str(line_number) + "\n")
        # end for
    return sentences
# end def



# Preprocess the corpus
from collections import defaultdict

# Constants
# These module-level tokens are the single source of truth: preprocessing,
# the language models and UnigramDist all compare against them.
UNK = "UNK"     # Unknown word token
start = "<s>"   # Start-of-sentence token
end = "</s>"    # End-of-sentence-token


def _bookend(corpus):
    """Wrap every sentence of corpus in the start/end tokens, in place."""
    for sen in corpus:
        sen.insert(0, start)
        sen.append(end)
    # endfor
# enddef


def preprocess(corpus):
    """Prepare a training corpus: mask rare words and add sentence bookends.

    Words occurring fewer than two times in the whole corpus are replaced by
    UNK, then every sentence gets `start` prepended and `end` appended.  The
    module-level UNK/start/end constants are used so that the result agrees
    with preprocessTest and with the tokens the models look for.

    Args:
        corpus: list of sentences (lists of word strings); modified in place.

    Returns:
        The same corpus object, after modification.
    """
    # Find all the rare words
    freqDict = defaultdict(int)
    for sen in corpus:
        for word in sen:
            freqDict[word] += 1
    # endfor

    # Replace rare words with UNK
    for sen in corpus:
        for i, word in enumerate(sen):
            if freqDict[word] < 2:
                sen[i] = UNK
            # endif
        # endfor
    # endfor

    # Bookend the sentences with start and end tokens
    _bookend(corpus)

    return corpus
# enddef


def preprocessTest(vocab, corpus):
    """Prepare a test corpus against a training vocabulary.

    Words that are not in vocab are replaced by UNK, then every sentence is
    bookended with the start and end tokens.

    Args:
        vocab: collection of known word types.  Anything that is not already
            a set, frozenset or dict is copied into a set once, so that the
            per-word membership test does not scan a list.
        corpus: list of sentences (lists of word strings); modified in place.

    Returns:
        The same corpus object, after modification.
    """
    if isinstance(vocab, (set, frozenset, dict)):
        known = vocab
    else:
        known = set(vocab)
    # endif

    # replace test words that were unseen in the training with unk
    for sen in corpus:
        for i, word in enumerate(sen):
            if word not in known:
                sen[i] = UNK
            # endif
        # endfor
    # endfor

    # bookend the sentences with start and end tokens
    _bookend(corpus)

    return corpus
# enddef


#--------------------------------------------------------------
# Language models and data structures
#--------------------------------------------------------------

# Parent class for the three language models you need to implement
class LanguageModel:
    """Parent class of the n-gram language models.

    The methods here are placeholders that print a reminder and return a
    fixed value; subclasses are expected to override them.
    """

    # Initialize and train the model (ie, estimate the model's underlying probability
    # distribution from the training corpus)
    def __init__(self, corpus):
        print("""Your task is to implement four kinds of n-gram language models:
      a) an (unsmoothed) unigram model (UnigramModel)
      b) a unigram model smoothed using Laplace smoothing (SmoothedUnigramModel)
      c) an unsmoothed bigram model (BigramModel)
      d) a bigram model smoothed using linear interpolation smoothing (SmoothedBigramModelInt)
      """)
    #enddef

    # Generate a sentence by drawing words according to the
    # model's probability distribution
    # Note: think about how to set the length of the sentence
    #in a principled way
    def generateSentence(self):
        """Placeholder: print a reminder and return a fixed sentence string."""
        print("Implement the generateSentence method in each subclass")
        return "mary had a little lamb ."
    #enddef

    # Given a sentence (sen), return the probability of
    # that sentence under the model
    def getSentenceProbability(self, sen):
        """Placeholder: print a reminder and return 0.0."""
        print("Implement the getSentenceProbability method in each subclass")
        return 0.0
    #enddef

    # Given a corpus, calculate and return its perplexity
    #(normalized inverse log probability)
    def getCorpusPerplexity(self, corpus):
        """Placeholder: print a reminder and return 0.0."""
        print("Implement the getCorpusPerplexity method")
        return 0.0
    #enddef

    # Given a file (filename) and the number of sentences, generate a list
    # of sentences and write each to file along with its model probability.
    def generateSentencesToFile(self, numberOfSentences, filename):
        """Write numberOfSentences generated sentences to filename.

        Each output line is "<probability> <word> <word> ...".

        Args:
            numberOfSentences: how many sentences to generate.
            filename: path of the output file; it is created or truncated.
        """
        # The context manager closes (and flushes) the file even when
        # generation raises.  UTF-8 matches how readFileToCorpus reads text.
        with open(filename, 'w+', encoding="utf-8") as filePointer:
            for _ in range(numberOfSentences):
                sen = self.generateSentence()
                prob = self.getSentenceProbability(sen)

                # A sentence given as one string is split into words first;
                # joining a bare string would put a space between every
                # character.
                words = sen.split() if isinstance(sen, str) else sen
                stringGenerated = str(prob) + " " + " ".join(words)
                print(stringGenerated, end="\n", file=filePointer)
            #endfor
    #enddef
#endclass

# Unigram language model

# prompt: class UnigramModel(LanguageModel):     def __init__(self, corpus):         print("Subtask: implement the unsmoothed unigram language model")     #endddef #endclass

class UnigramModel(LanguageModel):
    """Unsmoothed (maximum-likelihood) unigram language model.

    P(word) = count(word) / total, estimated from the training corpus.  The
    start-of-sentence token is left out of the counts and skipped when
    scoring, because it is never generated (UnigramDist.train does the same).
    Words that were never seen in training have probability 0.

    Attributes:
        unigramCounts: word -> number of counted training occurrences.
        total: total number of counted training tokens.
    """

    def __init__(self, corpus):
        """Count the words of corpus (a list of lists of word strings)."""
        print("Subtask: implement the unsmoothed unigram language model")
        self.unigramCounts = defaultdict(int)
        self.total = 0
        for sentence in corpus:
            for word in sentence:
                if word == start:
                    continue
                self.unigramCounts[word] += 1
                self.total += 1
        #end for
    #enddef

    def _wordProbability(self, word):
        """Return count(word) / total; 0.0 for unseen words or an empty model."""
        if self.total == 0:
            return 0.0
        # .get() instead of indexing: indexing a defaultdict would insert the
        # unseen word into the counts as a side effect of scoring.
        return self.unigramCounts.get(word, 0) / self.total
    #enddef

    def generateSentence(self):
        """Draw words from the unigram distribution until `end` is drawn.

        Returns:
            The drawn words as a list, without the terminating end token.

        Raises:
            ValueError: if the end token was never seen in training, since
                sampling could then never terminate.
        """
        if self.unigramCounts.get(end, 0) <= 0:
            raise ValueError("cannot generate a sentence: the end token "
                             + repr(end) + " was not seen in training")
        # Build the population once instead of on every draw.
        words = list(self.unigramCounts)
        weights = list(self.unigramCounts.values())
        sentence = []
        while True:
            word = random.choices(words, weights=weights)[0]
            if word == end:
                break
            sentence.append(word)
        #end while
        return sentence
    #enddef

    def getSentenceProbability(self, sen):
        """Return the product of the word probabilities of sen.

        Occurrences of the start token are skipped.  The result is 0.0 as
        soon as one word is unseen.
        """
        prob = 1.0
        for word in sen:
            if word == start:
                continue
            prob *= self._wordProbability(word)
        #end for
        return prob
    #enddef

    def getCorpusPerplexity(self, corpus):
        """Return the per-token perplexity exp(-(1/N) * sum(log P(word))).

        N is the number of scored tokens (start tokens are not scored).  The
        sum is accumulated in log space so that long corpora do not underflow.

        Returns:
            The perplexity, or math.inf if some token has probability 0.

        Raises:
            ValueError: if corpus contains no token to score.
        """
        logProb = 0.0
        tokenCount = 0
        for sentence in corpus:
            for word in sentence:
                if word == start:
                    continue
                wordProb = self._wordProbability(word)
                if wordProb <= 0.0:
                    # log(0) is undefined; a zero-probability token makes the
                    # perplexity infinite.
                    return math.inf
                logProb += math.log(wordProb)
                tokenCount += 1
        #end for
        if tokenCount == 0:
            raise ValueError("cannot compute the perplexity of an empty corpus")
        return math.exp(-logProb / tokenCount)
    #enddef

    def generateSentencesToFile(self, numberOfSentences, filename):
        """Write numberOfSentences lines "<probability> <words...>" to filename.

        The probability is that of the generated words as returned by
        generateSentence (which does not include the end token).
        """
        # The context manager closes the file even if generation raises.
        with open(filename, 'w+', encoding="utf-8") as filePointer:
            for _ in range(numberOfSentences):
                sen = self.generateSentence()
                prob = self.getSentenceProbability(sen)

                stringGenerated = str(prob) + " " + " ".join(sen)
                print(stringGenerated, end="\n", file=filePointer)
            #end for
    #enddef
#endclass

# prompt: complete the following code and function     def __init__(self, corpus):         print("Subtask: implement the smoothed bigram language model with kneser-ney smoothing")     #endddef #endclass

class SmoothedBigramModelKneserNey(LanguageModel):
    """Bigram language model with interpolated Kneser-Ney smoothing.

    For a context w1 seen in training:

        P(w2 | w1) = max(c(w1, w2) - d, 0) / c(w1) + lambda(w1) * Pcont(w2)
        lambda(w1) = d * |{w : c(w1, w) > 0}| / c(w1)
        Pcont(w2)  = |{w : c(w, w2) > 0}| / (number of distinct bigrams)

    For a context never seen in training the model falls back to Pcont(w2).

    Attributes:
        discount: the absolute discount d.
        bigramCounts: w1 -> {w2 -> c(w1, w2)}.
        unigramCounts: w1 -> c(w1), the number of bigrams starting with w1.
        total: total number of training bigrams.
        continuationCounts: w2 -> number of distinct words preceding w2.
        bigramTypes: number of distinct bigrams.
    """

    def __init__(self, corpus, discount=0.75):
        """Collect bigram statistics from corpus.

        Args:
            corpus: list of sentences (lists of word strings).
            discount: absolute discount d, strictly between 0 and 1.

        Raises:
            ValueError: if discount is outside (0, 1).
        """
        print("Subtask: implement the smoothed bigram language model with kneser-ney smoothing")
        if not 0.0 < discount < 1.0:
            raise ValueError("discount must be strictly between 0 and 1")
        self.discount = discount
        self.bigramCounts = defaultdict(lambda: defaultdict(int))
        self.unigramCounts = defaultdict(int)
        self.total = 0
        for sentence in corpus:
            for word1, word2 in zip(sentence, sentence[1:]):
                self.bigramCounts[word1][word2] += 1
                self.unigramCounts[word1] += 1
                self.total += 1
            #end for
        #end for

        # Continuation statistics: in how many distinct contexts each word
        # appears, and how many distinct bigrams exist overall.
        self.continuationCounts = defaultdict(int)
        self.bigramTypes = 0
        for followers in self.bigramCounts.values():
            for word2 in followers:
                self.continuationCounts[word2] += 1
                self.bigramTypes += 1
        #end for
    #enddef

    def _bigramProbability(self, word1, word2):
        """Return the Kneser-Ney estimate of P(word2 | word1).

        Lookups use .get() so that scoring never inserts keys into the
        defaultdict tables.
        """
        if self.bigramTypes == 0:
            return 0.0  # untrained model
        continuation = self.continuationCounts.get(word2, 0) / self.bigramTypes
        contextCount = self.unigramCounts.get(word1, 0)
        if contextCount == 0:
            # Unseen context: only the continuation distribution is available.
            return continuation
        followers = self.bigramCounts.get(word1, {})
        discounted = max(followers.get(word2, 0) - self.discount, 0.0) / contextCount
        # Probability mass removed by discounting, redistributed via Pcont.
        backoffWeight = self.discount * len(followers) / contextCount
        return discounted + backoffWeight * continuation
    #enddef

    def generateSentence(self):
        """Generate a sentence as a chain of bigram draws.

        Generation begins in the `start` context; every next word is drawn
        from P(. | previous word) until `end` is drawn.

        Returns:
            The generated words as a list, without start and end tokens.

        Raises:
            ValueError: if the end token never followed any word in training,
                since sampling could then never terminate.
        """
        if self.continuationCounts.get(end, 0) <= 0:
            raise ValueError("cannot generate a sentence: the end token "
                             + repr(end) + " was not seen in training")
        candidates = list(self.continuationCounts)
        sentence = []
        previous = start
        while True:
            weights = [self._bigramProbability(previous, word) for word in candidates]
            word = random.choices(candidates, weights=weights)[0]
            if word == end:
                break
            sentence.append(word)
            previous = word
        #end while
        return sentence
    #enddef

    def getSentenceProbability(self, sen):
        """Return the product of P(w[i+1] | w[i]) over adjacent words of sen.

        A sentence with fewer than two words has no transition and yields 1.0.
        """
        prob = 1.0
        for word1, word2 in zip(sen, sen[1:]):
            prob *= self._bigramProbability(word1, word2)
        #end for
        return prob
    #enddef

    def getCorpusPerplexity(self, corpus):
        """Return the per-transition perplexity exp(-(1/N) * sum(log P)).

        N is the number of scored bigram transitions.  The sum is accumulated
        in log space so that long corpora do not underflow.

        Returns:
            The perplexity, or math.inf if some transition has probability 0.

        Raises:
            ValueError: if corpus contains no transition to score.
        """
        logProb = 0.0
        transitionCount = 0
        for sentence in corpus:
            for word1, word2 in zip(sentence, sentence[1:]):
                stepProb = self._bigramProbability(word1, word2)
                if stepProb <= 0.0:
                    return math.inf  # log(0) is undefined
                logProb += math.log(stepProb)
                transitionCount += 1
            #end for
        #end for
        if transitionCount == 0:
            raise ValueError("cannot compute the perplexity of an empty corpus")
        return math.exp(-logProb / transitionCount)
    #enddef

    def generateSentencesToFile(self, numberOfSentences, filename):
        """Write numberOfSentences lines "<probability> <words...>" to filename.

        The probability is that of the generated words as returned by
        generateSentence (start and end tokens are not part of it).
        """
        # The context manager closes the file even if generation raises.
        with open(filename, 'w+', encoding="utf-8") as filePointer:
            for _ in range(numberOfSentences):
                sen = self.generateSentence()
                prob = self.getSentenceProbability(sen)

                stringGenerated = str(prob) + " " + " ".join(sen)
                print(stringGenerated, end="\n", file=filePointer)
            #end for
    #enddef
#endclass


# Notebook demo of SmoothedBigramModelKneserNey.  It relies on a `corpus`
# variable that this file does not define (presumably created by an earlier
# notebook cell -- TODO confirm), so it only runs when that variable exists
# instead of stopping the whole module with a NameError.
if "corpus" in globals():
    smoothedBigramModelKneserNey = SmoothedBigramModelKneserNey(corpus)

    generatedSentence = smoothedBigramModelKneserNey.generateSentence()
    print("Generated sentence:", " ".join(generatedSentence))

    perplexity = smoothedBigramModelKneserNey.getCorpusPerplexity(corpus)
    print("Corpus perplexity:", perplexity)

    smoothedBigramModelKneserNey.generateSentencesToFile(10, "smoothed_bigram_sentences_kneser_ney.txt")
else:
    sys.stderr.write("Skipping SmoothedBigramModelKneserNey demo: no `corpus` variable is defined\n")





#Smoothed unigram language model (use laplace for smoothing)
# prompt: class SmoothedUnigramModel(LanguageModel):     def __init__(self, corpus):         print("Subtask: implement the smoothed unigram language model")     #endddef #endclass

class SmoothedUnigramModel(LanguageModel):
    """Unigram language model with Laplace (add-one) smoothing.

    P(word) = (count(word) + 1) / (total + V), where V is the number of
    distinct counted word types.  The start-of-sentence token is left out of
    the counts and skipped when scoring, because it is never generated.

    Attributes:
        unigramCounts: word -> number of counted training occurrences.
        total: total number of counted training tokens.
        V: number of distinct counted word types.
    """

    def __init__(self, corpus):
        """Count the words of corpus (a list of lists of word strings)."""
        print("Subtask: implement the smoothed unigram language model")
        self.unigramCounts = defaultdict(int)
        self.total = 0
        for sentence in corpus:
            for word in sentence:
                if word == start:
                    continue
                self.unigramCounts[word] += 1
                self.total += 1
        #end for
        self.V = len(self.unigramCounts)
    #enddef

    def _wordProbability(self, word):
        """Return the add-one estimate of P(word); 0.0 for an empty model."""
        denominator = self.total + self.V
        if denominator == 0:
            return 0.0
        # .get() instead of indexing: indexing a defaultdict would insert the
        # unseen word into the counts as a side effect of scoring.
        return (self.unigramCounts.get(word, 0) + 1) / denominator
    #enddef

    def generateSentence(self):
        """Draw words from the smoothed distribution until `end` is drawn.

        The sampling weights are count + 1, i.e. proportional to the same
        smoothed probabilities that getSentenceProbability uses.

        Returns:
            The drawn words as a list, without the terminating end token.

        Raises:
            ValueError: if the end token was never seen in training, since
                sampling could then never terminate.
        """
        if end not in self.unigramCounts:
            raise ValueError("cannot generate a sentence: the end token "
                             + repr(end) + " was not seen in training")
        # Build the population once instead of on every draw.
        words = list(self.unigramCounts)
        weights = [count + 1 for count in self.unigramCounts.values()]
        sentence = []
        while True:
            word = random.choices(words, weights=weights)[0]
            if word == end:
                break
            sentence.append(word)
        #end while
        return sentence
    #enddef

    def getSentenceProbability(self, sen):
        """Return the product of the smoothed word probabilities of sen.

        Occurrences of the start token are skipped.
        """
        prob = 1.0
        for word in sen:
            if word == start:
                continue
            prob *= self._wordProbability(word)
        #end for
        return prob
    #enddef

    def getCorpusPerplexity(self, corpus):
        """Return the per-token perplexity exp(-(1/N) * sum(log P(word))).

        N is the number of scored tokens (start tokens are not scored).  The
        sum is accumulated in log space so that long corpora do not underflow.

        Returns:
            The perplexity, or math.inf if a token has probability 0 (which
            only happens for a model trained on no tokens).

        Raises:
            ValueError: if corpus contains no token to score.
        """
        logProb = 0.0
        tokenCount = 0
        for sentence in corpus:
            for word in sentence:
                if word == start:
                    continue
                wordProb = self._wordProbability(word)
                if wordProb <= 0.0:
                    return math.inf  # log(0) is undefined
                logProb += math.log(wordProb)
                tokenCount += 1
        #end for
        if tokenCount == 0:
            raise ValueError("cannot compute the perplexity of an empty corpus")
        return math.exp(-logProb / tokenCount)
    #enddef

    def generateSentencesToFile(self, numberOfSentences, filename):
        """Write numberOfSentences lines "<probability> <words...>" to filename.

        The probability is that of the generated words as returned by
        generateSentence (which does not include the end token).
        """
        # The context manager closes the file even if generation raises.
        with open(filename, 'w+', encoding="utf-8") as filePointer:
            for _ in range(numberOfSentences):
                sen = self.generateSentence()
                prob = self.getSentenceProbability(sen)

                stringGenerated = str(prob) + " " + " ".join(sen)
                print(stringGenerated, end="\n", file=filePointer)
            #end for
    #enddef
#endclass


# Bigram language model with add-one (Laplace) smoothing (the class is named "KN", but its probabilities are add-one estimates)
# prompt: kindly do the SmoothedBigramModelKN with all functions

class SmoothedBigramModelKN(LanguageModel):
    """Bigram language model with add-one (Laplace) smoothing.

    P(w2 | w1) = (c(w1, w2) + 1) / (c(w1) + V), where V is the number of
    distinct context words seen in training.

    NOTE(review): the class name and the message printed by __init__ mention
    Kneser-Ney, but the estimator implemented here is add-one smoothing.  The
    estimator is kept as it was; confirm which one is intended.

    Attributes:
        bigramCounts: w1 -> {w2 -> c(w1, w2)}.
        unigramCounts: w1 -> c(w1), the number of bigrams starting with w1.
        total: total number of training bigrams.
        V: number of distinct context words.
    """

    def __init__(self, corpus):
        """Collect bigram statistics from corpus (a list of lists of words)."""
        print("Subtask: implement the smoothed bigram language model with kneser-ney smoothing")
        self.bigramCounts = defaultdict(lambda: defaultdict(int))
        self.unigramCounts = defaultdict(int)
        self.total = 0
        for sentence in corpus:
            for word1, word2 in zip(sentence, sentence[1:]):
                self.bigramCounts[word1][word2] += 1
                self.unigramCounts[word1] += 1
                self.total += 1
            #end for
        #end for
        self.V = len(self.unigramCounts)
    #enddef

    def _bigramProbability(self, word1, word2):
        """Return the add-one estimate of P(word2 | word1).

        Lookups use .get() so that scoring never inserts keys into the
        defaultdict tables.  Returns 0.0 for an untrained model scoring an
        unseen context (denominator 0).
        """
        denominator = self.unigramCounts.get(word1, 0) + self.V
        if denominator == 0:
            return 0.0
        followers = self.bigramCounts.get(word1)
        count = followers.get(word2, 0) if followers is not None else 0
        return (count + 1) / denominator
    #enddef

    def generateSentence(self):
        """Generate a sentence as a chain of bigram draws.

        Generation begins in the `start` context; every next word is drawn
        from the smoothed P(. | previous word), over all words that followed
        some word in training, until `end` is drawn.

        Returns:
            The generated words as a list, without start and end tokens.

        Raises:
            ValueError: if the end token never followed any word in training,
                since sampling could then never terminate.
        """
        # dict.fromkeys keeps first-seen order, so seeded runs are repeatable.
        candidates = list(dict.fromkeys(
            word2
            for followers in self.bigramCounts.values()
            for word2 in followers
        ))
        if end not in candidates:
            raise ValueError("cannot generate a sentence: the end token "
                             + repr(end) + " was not seen in training")
        sentence = []
        previous = start
        while True:
            weights = [self._bigramProbability(previous, word) for word in candidates]
            word = random.choices(candidates, weights=weights)[0]
            if word == end:
                break
            sentence.append(word)
            previous = word
        #end while
        return sentence
    #enddef

    def getSentenceProbability(self, sen):
        """Return the product of P(w[i+1] | w[i]) over adjacent words of sen.

        A sentence with fewer than two words has no transition and yields 1.0.
        """
        prob = 1.0
        for word1, word2 in zip(sen, sen[1:]):
            prob *= self._bigramProbability(word1, word2)
        #end for
        return prob
    #enddef

    def getCorpusPerplexity(self, corpus):
        """Return the per-transition perplexity exp(-(1/N) * sum(log P)).

        N is the number of scored bigram transitions.  The sum is accumulated
        in log space so that long corpora do not underflow.

        Returns:
            The perplexity, or math.inf if some transition has probability 0.

        Raises:
            ValueError: if corpus contains no transition to score.
        """
        logProb = 0.0
        transitionCount = 0
        for sentence in corpus:
            for word1, word2 in zip(sentence, sentence[1:]):
                stepProb = self._bigramProbability(word1, word2)
                if stepProb <= 0.0:
                    return math.inf  # log(0) is undefined
                logProb += math.log(stepProb)
                transitionCount += 1
            #end for
        #end for
        if transitionCount == 0:
            raise ValueError("cannot compute the perplexity of an empty corpus")
        return math.exp(-logProb / transitionCount)
    #enddef

    def generateSentencesToFile(self, numberOfSentences, filename):
        """Write numberOfSentences lines "<probability> <words...>" to filename.

        The probability is that of the generated words as returned by
        generateSentence (start and end tokens are not part of it).
        """
        # The context manager closes the file even if generation raises.
        with open(filename, 'w+', encoding="utf-8") as filePointer:
            for _ in range(numberOfSentences):
                sen = self.generateSentence()
                prob = self.getSentenceProbability(sen)

                stringGenerated = str(prob) + " " + " ".join(sen)
                print(stringGenerated, end="\n", file=filePointer)
            #end for
    #enddef
#endclass



# Sample class for a unsmoothed unigram probability distribution
# Note:
#       Feel free to use/re-use/modify this class as necessary for your
#       own code (e.g. converting to log probabilities after training).
#       This class is intended to help you get started
#       with your implementation of the language models above.


class UnigramDist:
    """Unsmoothed unigram probability distribution estimated from a corpus.

    Attributes:
        counts: word -> observed count (float).
        total: sum of all observed counts (float).
    """

    def __init__(self, corpus):
        """Create the distribution and train it on corpus."""
        self.counts = defaultdict(float)
        self.total = 0.0
        self.train(corpus)
    #endddef

    # Add observed counts from corpus to the distribution
    def train(self, corpus):
        """Add the word counts of corpus (a list of lists of words).

        The start-of-sentence token is not counted.
        """
        for sen in corpus:
            for word in sen:
                if word == start:
                    continue
                self.counts[word] += 1.0
                self.total += 1.0
            #endfor
        #endfor
    #enddef

    # Returns the probability of word in the distribution
    def prob(self, word):
        """Return count(word) / total.

        Returns 0.0 for a word that was never observed and for an empty
        distribution.
        """
        if self.total == 0.0:
            return 0.0
        # .get() instead of indexing: indexing a defaultdict would insert the
        # queried word into counts and make it a possible result of draw().
        return self.counts.get(word, 0.0) / self.total
    #enddef

    # Generate a single random word according to the distribution
    def draw(self):
        """Return one word drawn at random according to the distribution.

        Returns None when nothing has been counted yet.
        """
        rand = random.random()
        word = None
        for word in self.counts:
            rand -= self.prob(word)
            if rand <= 0.0:
                return word
            #endif
        #endfor
        # Accumulated float rounding can leave a tiny positive remainder after
        # the last word; return that last word instead of falling through.
        return word
    #enddef
#endclass


# Notebook demo of UnigramDist.  It relies on a `corpus` variable that this
# file does not define (presumably created by an earlier notebook cell --
# TODO confirm), so it only runs when that variable exists instead of
# stopping the whole module with a NameError.
if "corpus" in globals():
    unigram_dist = UnigramDist(corpus)

    print(unigram_dist.prob("the"))

    print(unigram_dist.prob("aardvark"))

    print(unigram_dist.draw())
else:
    sys.stderr.write("Skipping UnigramDist demo: no `corpus` variable is defined\n")




#-------------------------------------------
# The main routine
#-------------------------------------------


if __name__ == "__main__":
    #read your corpora
    trainCorpus = readFileToCorpus('train.txt')
    trainCorpus = preprocess(trainCorpus)

    posTestCorpus = readFileToCorpus('pos_test.txt')
    negTestCorpus = readFileToCorpus('neg_test.txt')

    from collections import Counter

    def create_vocabulary(train_corpus):
        """Return the distinct words of train_corpus, in first-seen order."""
        # Counter keeps one key per word type, in order of first occurrence
        word_counts = Counter(word for sentence in train_corpus for word in sentence)
        return list(word_counts.keys())

    # Task 0: create a vocabulary (collection of word types) for the train
    # corpus.  It is built from the *preprocessed* training corpus, so that it
    # uses the same tokenisation as the test corpora and contains exactly the
    # tokens the models are trained on (rare words are already replaced by the
    # unknown-word token there).
    train_vocabulary = create_vocabulary(trainCorpus)

    print("Vocabulary Size:", len(train_vocabulary))
    print("First 10 Words in Vocabulary:", train_vocabulary[:10])

    # A set gives constant-time membership tests in preprocessTest
    vocab = set(train_vocabulary)

    posTestCorpus = preprocessTest(vocab, posTestCorpus)
    negTestCorpus = preprocessTest(vocab, negTestCorpus)

    # Run sample unigram dist code
    unigramDist = UnigramDist(trainCorpus)
    print("Sample UnigramDist output:")
    print("Probability of \"picture\": ", unigramDist.prob("picture"))
    print("\"Random\" draw: ", unigramDist.draw())
    print("\"Random\" draw: ", unigramDist.draw())


