# In [1]:
# Standard-library names used throughout this script. No import for them is
# visible anywhere else in the file, so bring them into scope up front.
import math
import string
from collections import defaultdict

# Load the raw training corpus.
with open('wiki.train.raw', 'r', encoding='utf-8') as file1:
    corpus = file1.read()

# Preprocess the text to remove punctuation. str.translate deletes every
# character listed in string.punctuation in one pass, instead of rebuilding
# the string one character at a time.
corp1 = corpus.translate(str.maketrans('', '', string.punctuation))

# Tokenizing: lowercase everything, then split on runs of whitespace.
words = corp1.lower().split()

# Bigram counts: map each pair of adjacent words to its number of occurrences
def generate_bigrams(words):
    """Count every pair of adjacent tokens in ``words``.

    Args:
        words: Sequence of tokens (must support ``len`` and slicing).

    Returns:
        A plain ``dict`` mapping ``(token, following_token)`` tuples to the
        number of times that pair occurs, in first-seen order. Sequences with
        fewer than two tokens yield an empty dict.
    """
    pair_counts = {}
    # Pair each token with its successor; zip stops at the shorter (shifted) copy.
    for left, right in zip(words, words[1:]):
        key = (left, right)
        if key in pair_counts:
            pair_counts[key] += 1
        else:
            pair_counts[key] = 1
    return pair_counts

# Bigram counts for the training tokens.
sentence_bigrams = generate_bigrams(words)

# Dump every distinct bigram with its count, one per line.
for bigram in sentence_bigrams:
    count = sentence_bigrams[bigram]
    print(bigram, ":", count)


# Count the number of times each word occurs
def freq_of_unique_words(words):
    """Tally how often each token occurs and print two summary figures.

    Prints the number of distinct tokens and the total number of tokens
    to standard output.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        A ``dict`` mapping each distinct token to its occurrence count.
    """
    tally = {}
    for token in words:
        tally[token] = tally.get(token, 0) + 1

    # The total token count equals the sum of the per-token counts.
    print("No of unique words in corpus:", len(tally))
    print("No of words in corpus:", sum(tally.values()))

    return tally
# Count how often each training token occurs; the call also prints the
# number of distinct tokens and the total token count.
word_freq = freq_of_unique_words(words)
#print(word_freq)

def count_bigram_frequencies(words):
    """Count adjacent token pairs, returning a ``defaultdict``.

    Args:
        words: Sequence of tokens (must support ``len`` and slicing).

    Returns:
        A ``defaultdict(int)`` keyed by ``(token, following_token)`` tuples
        whose values are occurrence counts. Looking up a pair that never
        occurred yields ``0``.
    """
    tally = defaultdict(int)
    # zip over the sequence and its one-step shift yields each adjacent pair.
    for pair in zip(words, words[1:]):
        tally[pair] += 1
    return tally
    
# Bigram counts for the training tokens, held in a defaultdict(int), followed
# by a report of how many distinct bigrams were seen.
bigram_freq = count_bigram_frequencies(words)
print("Number of unique bigrams:", len(bigram_freq)) 

# Calculating bigram probability with Add-one smoothing
def compute_bigram_probabilities(sentence_bigrams, word_freq):
    """Estimate add-one-smoothed conditional probabilities for observed bigrams.

    For each bigram ``(w1, w2)`` with count ``c`` the estimate is
    ``(c + 1) / (count(w1) + V)``, where ``V`` is the number of distinct
    words in ``word_freq`` and ``count(w1)`` is ``0`` if ``w1`` is missing.

    Args:
        sentence_bigrams: Mapping of 2-item bigram keys to occurrence counts.
        word_freq: Mapping of words to occurrence counts.

    Returns:
        A ``dict`` with the same keys as ``sentence_bigrams`` and the smoothed
        probability as value.
    """
    vocabulary_size = len(word_freq)

    def smoothed(pair, pair_count):
        # Only the first word of the pair conditions the estimate.
        first, _ = pair
        return (pair_count + 1) / (word_freq.get(first, 0) + vocabulary_size)

    return {
        pair: smoothed(pair, pair_count)
        for pair, pair_count in sentence_bigrams.items()
    }

# Add-one-smoothed probabilities for the bigrams counted in sentence_bigrams.
bigram_probabilities = compute_bigram_probabilities(sentence_bigrams, word_freq)
# Bare expression: it has no effect when this runs as a plain script
# (presumably left over from notebook-style display of the value).
bigram_probabilities


def compute_unigram_probabilities(word_freq):
    """Estimate add-one-smoothed unigram probabilities.

    Each word with count ``c`` gets ``(c + 1) / (N + V)``, where ``N`` is the
    sum of all counts and ``V`` the number of distinct words.

    Args:
        word_freq: Mapping of words to occurrence counts.

    Returns:
        A ``dict`` mapping every word in ``word_freq`` to its probability.
        An empty mapping yields an empty dict.
    """
    # The denominator is identical for every word, so compute it once.
    smoothed_total = sum(word_freq.values()) + len(word_freq)
    return {
        token: (occurrences + 1) / smoothed_total
        for token, occurrences in word_freq.items()
    }


def probability_with_backoff(bigram, bigram_probabilities, unigram_probabilities, backoff_weight=0.5):
    """Look up a bigram probability, backing off to a weighted unigram estimate.

    Backoff is applied only when ``bigram_probabilities`` is a
    ``defaultdict``; for any other mapping a missing bigram simply yields 0.

    Args:
        bigram: 2-item ``(word, next_word)`` key.
        bigram_probabilities: Mapping of bigram keys to probabilities.
        unigram_probabilities: Mapping of words to probabilities.
        backoff_weight: Factor applied to the unigram estimate on backoff.

    Returns:
        The stored bigram probability if present. Otherwise, for a
        ``defaultdict``, ``backoff_weight`` times the unigram probability of
        ``next_word`` (or ``1 / len(unigram_probabilities)`` if that word is
        unknown); for other mappings, ``0``.
    """
    # Regular dictionaries get a plain lookup with no backoff.
    if not isinstance(bigram_probabilities, defaultdict):
        return bigram_probabilities.get(bigram, 0)

    # Membership test first: indexing a defaultdict would insert the key.
    if bigram in bigram_probabilities:
        return bigram_probabilities[bigram]

    _, following = bigram
    uniform = 1 / len(unigram_probabilities)
    return backoff_weight * unigram_probabilities.get(following, uniform)
# Load the validation split.
with open('wiki.valid.raw', 'r', encoding='utf-8') as file2:
    valid = file2.read()

# Strip punctuation in a single pass (rather than concatenating one character
# at a time), then lowercase and split on whitespace.
corp2 = valid.translate(str.maketrans('', '', string.punctuation))
val_data = corp2.lower().split()

# NOTE(review): the counts below are taken from the validation split and
# replace the word_freq / sentence_bigrams / bigram_freq values computed from
# the training tokens earlier in the file, so the model evaluated later is
# estimated from validation data only. Confirm this is intended.
word_freq = freq_of_unique_words(val_data)
sentence_bigrams = generate_bigrams(val_data)
bigram_freq = count_bigram_frequencies(val_data)

# Add-one-smoothed unigram and bigram estimates bundled into one model dict.
unigram_probabilities = compute_unigram_probabilities(word_freq)
bigram_probabilities = compute_bigram_probabilities(bigram_freq, word_freq)
model = {
    'unigram_probabilities': unigram_probabilities,
    'bigram_probabilities': bigram_probabilities,
}


# Load the test split.
with open('wiki.test.raw', 'r', encoding='utf-8') as file3:
    test = file3.read()

# Strip punctuation in a single pass (rather than concatenating one character
# at a time), then lowercase and split on whitespace.
corp3 = test.translate(str.maketrans('', '', string.punctuation))
test_data = corp3.lower().split()

def perplexity_without_backoff(words, model):
    """Compute the bigram perplexity of ``words`` under ``model``.

    Every adjacent pair in ``words`` contributes one log-probability term.
    When the pair has no (non-zero) entry in the bigram table, the term is
    the unigram probability of the second word divided by the vocabulary
    size, with ``1 / V`` used for words missing from the unigram table.

    Args:
        words: Sequence of tokens to evaluate (must support slicing).
        model: Dict with keys ``'unigram_probabilities'`` and
            ``'bigram_probabilities'``.

    Returns:
        ``exp`` of the negative mean log-probability over all scored pairs.

    Raises:
        ZeroDivisionError: If ``words`` has fewer than two tokens, or if a
            fallback is needed while the unigram table is empty.
    """
    unigram_probabilities = model['unigram_probabilities']
    bigram_probabilities = model['bigram_probabilities']
    V = len(unigram_probabilities)

    total_log_probability = 0.0
    num_words = 0
    # Score the sequence passed in by the caller, not a module-level global.
    for word, next_word in zip(words, words[1:]):
        probability = bigram_probabilities.get((word, next_word), 0)
        if probability == 0:
            probability = unigram_probabilities.get(next_word, 1) / V
        total_log_probability += math.log(probability)
        # Every scored pair counts towards the normaliser, whether it was a
        # bigram hit or a fallback.
        num_words += 1

    return math.exp(-total_log_probability / num_words)
    
# Score the test tokens with the model dict assembled earlier and report the
# resulting perplexity.
perplexity = perplexity_without_backoff(test_data, model)
print("Perplexity:",perplexity)

def compute_bigram_probabilities_with_backoff(bigram_freq, unigram_probabilities, backoff_weight):
    """Compute weighted-smoothing scores for every observed bigram.

    Each bigram ``(w1, w2)`` with count ``c`` maps to
    ``(c + backoff_weight) / (u(w1) + V * backoff_weight)``, where ``u(w1)``
    is the entry for ``w1`` in ``unigram_probabilities`` (``0`` if absent)
    and ``V`` is the size of that table.

    NOTE(review): the denominator reads a value from
    ``unigram_probabilities`` where a word count would normally be expected,
    so results can exceed 1. Confirm against the callers.

    Args:
        bigram_freq: Mapping of 2-item bigram keys to occurrence counts.
        unigram_probabilities: Mapping of words to values used in the
            denominator.
        backoff_weight: Additive weight applied per bigram and per
            vocabulary entry.

    Returns:
        A ``dict`` with the same keys as ``bigram_freq``.
    """
    # Weight mass spread over the vocabulary; identical for every bigram.
    smoothing_mass = len(unigram_probabilities) * backoff_weight

    def score(pair, pair_count):
        first, _ = pair
        base = unigram_probabilities.get(first, 0)
        return (pair_count + backoff_weight) / (base + smoothing_mass)

    return {pair: score(pair, pair_count) for pair, pair_count in bigram_freq.items()}

# Function to calculate perplexity with Backoff
def perplexity_with_backoff(test_data, model):
    """Compute bigram perplexity, backing off to weighted unigram estimates.

    For each adjacent pair, the stored bigram probability is used when it is
    present and non-zero. Otherwise the term is
    ``backoff_weight * (unigram(next_word) / V)``, where a word missing from
    the unigram table contributes ``1 / V`` and ``V`` is the size of that
    table.

    Args:
        test_data: Sequence of tokens to evaluate (must support slicing).
        model: Dict with keys ``'unigram_probabilities'`` and
            ``'bigram_probabilities'``, plus an optional ``'backoff_weight'``
            (0.5 when absent).

    Returns:
        ``exp`` of the negative mean log-probability over all adjacent pairs.

    Raises:
        ZeroDivisionError: If ``test_data`` has fewer than two tokens.
    """
    unigrams = model['unigram_probabilities']
    bigrams = model['bigram_probabilities']
    weight = model.get('backoff_weight', 0.5)
    vocabulary_size = len(unigrams)

    log_probability_sum = 0
    pair_count = 0
    for pair in zip(test_data, test_data[1:]):
        probability = bigrams.get(pair, 0)
        if probability == 0:
            # Unseen bigram: weighted unigram estimate for the second word.
            probability = weight * (unigrams.get(pair[1], 1) / vocabulary_size)
        log_probability_sum += math.log(probability)
        pair_count += 1

    return math.exp(-log_probability_sum / pair_count)

# Weight used for the backoff model, both when smoothing the bigram table and
# when scoring unseen bigrams.
BACKOFF_WEIGHT = 0.1

# Create models with and without Backoff
model_without_backoff = {
    'unigram_probabilities': unigram_probabilities,
    'bigram_probabilities': bigram_probabilities,
}

model_with_backoff = {
    'unigram_probabilities': unigram_probabilities,
    'bigram_probabilities': compute_bigram_probabilities_with_backoff(
        bigram_freq, unigram_probabilities, BACKOFF_WEIGHT
    ),
    # perplexity_with_backoff reads this key and falls back to 0.5 when it is
    # missing, so the configured weight has to be stored explicitly.
    'backoff_weight': BACKOFF_WEIGHT,
}

# Evaluate both models using the testing dataset
perplexity_without_backoff_value = perplexity_without_backoff(test_data, model_without_backoff)
perplexity_with_backoff_value = perplexity_with_backoff(test_data, model_with_backoff)

# Compare perplexity scores
print("Perplexity without Backoff:", perplexity_without_backoff_value)
print("Perplexity with Backoff:", perplexity_with_backoff_value)

