# NLP Assignment 1 (Group 16)

## Bhargava Siva Naga Sai Potluri (bxp230045)

## Nikhil Sesha Sai Kondapalli (nxk240025)

## Kavimayil Periyakoravampalayam Komarasamy (kxp230053)

## Sakshi Tokekar (sxt230143)

In [1]:
!pip install contractions

import re
import string
import contractions
import pandas as pd
import os
import math



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Created Assignment 1 class and implemented different methods used in the Assignment

In [3]:
import re
import string
import contractions
import pandas as pd
import os
import math

class Assignment1:
    def __init__(self):
        self.unk_token = '<UNK>'
        self.vocabulary = set()

    def read_data(self, path):
        # read data from input file
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        return [line.strip() for line in lines if line.strip()]

    def preprocessing(self, data, remove_stopwords=False):
        # pre-process individual sentence
        processed_sentences = []
        for line in data:
            processed_line = self.preprocess_single_sentence(line, remove_stopwords)
            if processed_line.strip():
                processed_sentences.append(processed_line)
        return ' '.join(processed_sentences)

    def preprocess_single_sentence(self, sentence, remove_stopwords=False):
        # Lowercasing
        sentence = sentence.lower()
        # Contractions expansion
        sentence = contractions.fix(sentence)
        # Normalize numbers -> <num>
        sentence = re.sub(r'\d+', '<num>', sentence)
        # Add sentence boundaries <s> </s> for each line/review
        sentence = sentence.strip()
        if sentence:
            sentence = '<s> ' + sentence + ' </s>'
        return sentence

    def tokenize(self, data):
        return data.split()

    def unigram_model(self, tokens):
        unigram_counts = {}

        # Count occurrences of each token
        for token in tokens:
            unigram_counts[token] = unigram_counts.get(token, 0) + 1

        # Calculate total number of tokens
        total_tokens = len(tokens)

        # Compute probabilities
        unigram_probabilities = {}
        for unigram, count in unigram_counts.items():
            unigram_probabilities[unigram] = count / total_tokens

        return unigram_probabilities, unigram_counts

    def bigram_model(self, tokens):
        bigram_counts = {}
        unigram_counts = {}

        # Count unigrams
        for token in tokens:
            unigram_counts[token] = unigram_counts.get(token, 0) + 1

        # Count bigrams
        for i in range(len(tokens) - 1):
            bigram = (tokens[i], tokens[i+1])
            bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

        bigram_probabilities = {}

        # Compute conditional probabilities P(w2|w1) = Count(w1,w2) / Count(w1)
        for bigram, count in bigram_counts.items():
            first_token = bigram[0]
            if first_token in unigram_counts and unigram_counts[first_token] > 0:
                bigram_probabilities[bigram] = count / unigram_counts[first_token]
            else:
                bigram_probabilities[bigram] = 0.0

        return bigram_probabilities, bigram_counts

    def handle_unknown_words(self, tokens, unigram_counts, min_freq=1):
        """
        Replaces words with a frequency less than or equal to min_freq
        with the <UNK> token.
        """
        # Create a set of rare words for efficient lookup
        rare_words = set()

        # Loop over each word-count pair in the unigram_counts dictionary
        for word, count in unigram_counts.items():
            if count <= min_freq:
                rare_words.add(word)  # add the rare word to the set

        if rare_words:
            print(f"\nFound {len(rare_words)} unique words with frequency <= {min_freq}. Replacing them with '<UNK>'.")

        processed_tokens = []
        for token in tokens:
            if token in rare_words:        # if token is rare
                processed_tokens.append('<UNK>')  # replace with <UNK>
            else:
                processed_tokens.append(token)    # otherwise keep as is

        return processed_tokens

    def export_to_csv(self, unigram_probs, unigram_counts, bigram_probs, bigram_counts, output_dir="/content/", unigrams_file="unigrams.csv", bigrams_file="bigrams.csv"):
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)

        # Convert to df
        df_unigrams = pd.DataFrame([
            {
                "word": word,
                "count": unigram_counts.get(word, 0),
                "probability": unigram_probs[word]
            }
            for word in unigram_probs.keys()
        ])

        df_bigrams = pd.DataFrame([
            {
                "w1": bigram[0],
                "w2": bigram[1],
                "bigram": f"{bigram[0]} {bigram[1]}",
                "count": bigram_counts.get(bigram, 0),
                "probability": bigram_probs[bigram]
            }
            for bigram in bigram_probs.keys()
        ])

        df_unigrams = df_unigrams.sort_values('probability', ascending=False).reset_index(drop=True)

        df_bigrams = df_bigrams.sort_values('probability', ascending=False).reset_index(drop=True)

        unigram_file = os.path.join(output_dir, unigrams_file)
        bigram_file = os.path.join(output_dir, bigrams_file)
        df_unigrams.to_csv(unigram_file, index=False)
        df_bigrams.to_csv(bigram_file, index=False)

        print("\n-------------------------- EXPORT SUMMARY --------------------------")
        print("Unigrams exported: ", len(df_unigrams), "entries")
        print("Bigrams exported: ", len(df_bigrams), "entries")

        return unigram_file, bigram_file, df_unigrams, df_bigrams

    def display_top_ngrams(self, df_unigrams, df_bigrams, n=10):
        # Display top entries for verification
        print("\n-------------------------- TOP ", n, " UNIGRAMS --------------------------")
        print(df_unigrams.head(n).to_string(index=False))
        print("\n-------------------------- TOP ",n," BIGRAMS --------------------------")
        print(df_bigrams.head(n).to_string(index=False))

    def print_sample_probabilities(self, unigram_probs, bigram_probs, n=10):
        print("-------------------------- SAMPLE UNIGRAM PROBABILITIES --------------------------")
        count = 0
        for token, prob in sorted(unigram_probs.items(), key=lambda x: x[1], reverse=True):
          if count >= n:
            break
          print(f"P({token}) = {prob:.4f}")
          count += 1

        print("\n-------------------------- SAMPLE BIGRAM PROBABILITIES --------------------------")
        count = 0
        for (w1, w2), prob in sorted(bigram_probs.items(), key=lambda x: x[1], reverse=True):
          if count >= n:
            break
          print(f"P({w2}|{w1}) = {prob:.4f}")
          count += 1

    def unigram_model_smoothed(self, tokens, k=1.0):
        """
        Build a smoothed unigram model using Add-k (Laplace by default: k=1.0)
        Returns smoothed unigram probabilities and counts.
        """
        unigram_counts = {}

        for token in tokens:
            unigram_counts[token] = unigram_counts.get(token, 0) + 1

        # Vocabulary size
        V = len(unigram_counts)
        total_tokens = len(tokens)

        unigram_probs = {}
        for word in unigram_counts:
            unigram_probs[word] = (unigram_counts[word] + k) / (total_tokens + k * V)

        # Probability for UNK
        # unigram_probs[self.unk_token] = k / (total_tokens + k * V)
        return unigram_probs, unigram_counts

    def bigram_model_smoothed(self, tokens, k=1.0):
        """
        Build a smoothed bigram model using Add-k smoothing. Returns smoothed bigram probabilities and counts.
        """
        bigram_counts = {}
        unigram_counts = {}

        for token in tokens:
            unigram_counts[token] = unigram_counts.get(token, 0) + 1

        for i in range(len(tokens) - 1):
            bigram = (tokens[i], tokens[i+1])
            bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

        # Vocabulary size
        V = len(unigram_counts)

        bigram_probs = {}
        for (w1, w2) in bigram_counts:
            bigram_probs[(w1, w2)] = (
                bigram_counts[(w1, w2)] + k
            ) / (unigram_counts[w1] + k * V)

        # For all possible bigram pairs (including unseen ones)
        all_vocab = list(unigram_counts.keys())
        for w1 in all_vocab:
            for w2 in all_vocab:
                if (w1, w2) not in bigram_probs:
                    bigram_probs[(w1, w2)] = k / (unigram_counts[w1] + k * V)

        return bigram_probs, bigram_counts

    def calculate_perplexity_unigram(self, test_tokens, unigram_probs):
        # Function to Calculate perplexity for unigram model
        N = len(test_tokens)
        log_prob_sum = 0

        for token in test_tokens:
            if token in unigram_probs:
                prob = unigram_probs[token]
            else:
                # Handle unseen words by assigning probability of them
                prob = unigram_probs.get(self.unk_token, 1e-10)

            if prob > 0:
                log_prob_sum += math.log(prob)
            else:
                log_prob_sum += math.log(1e-10)

        perplexity = math.exp(-log_prob_sum / N)
        return perplexity

    def calculate_perplexity_bigram(self, test_tokens, bigram_probs, unigram_probs):
        # Function to Calculate perplexity for bigram model
        N = len(test_tokens) - 1
        if N <= 0:
            return float('inf')

        log_prob_sum = 0

        for i in range(len(test_tokens) - 1):
            bigram = (test_tokens[i], test_tokens[i+1])

            if bigram in bigram_probs:
                prob = bigram_probs[bigram]
            else:
                # Backoff to unigram
                if test_tokens[i+1] in unigram_probs:
                    prob = unigram_probs[test_tokens[i+1]]
                else:
                    prob = unigram_probs.get(self.unk_token, 1e-10)

            if prob > 0:
                log_prob_sum += math.log(prob)
            else:
                log_prob_sum += math.log(1e-10)

        perplexity = math.exp(-log_prob_sum / N)
        return perplexity

## Read data

In [4]:
assignment = Assignment1()
raw_data = assignment.read_data('/content/train.txt')
print("Read {len(raw_data)} reviews from corpus")

Read {len(raw_data)} reviews from corpus


In [5]:
test_data = assignment.read_data('/content/val.txt')
print("Read {len(test_data)} reviews from corpus")

Read {len(test_data)} reviews from corpus


## Data preprocessing

In [6]:
processed_data = assignment.preprocessing(raw_data, remove_stopwords=False)

In [7]:
test_processed_data = assignment.preprocessing(test_data, remove_stopwords=False)

## Unknown Words handling & Tokenization

In [8]:
initial_tokens = assignment.tokenize(processed_data)
print(f"Total tokens after preprocessing: {len(initial_tokens)}")

Total tokens after preprocessing: 90761


In [9]:
_, initial_unigram_counts = assignment.unigram_model(initial_tokens)
print(f"Initial vocabulary size (unique words): {len(initial_unigram_counts)}")

Initial vocabulary size (unique words): 6101


In [10]:
# Replace rare words (frequency <= 1) with the <UNK> token.
tokens = assignment.handle_unknown_words(initial_tokens, initial_unigram_counts, min_freq=3)
print(f"Total tokens after preprocessing & Unknown Word handling: {len(tokens)}")


Found 4345 unique words with frequency <= 3. Replacing them with '<UNK>'.
Total tokens after preprocessing & Unknown Word handling: 90761


## Unknown words handling & Tokenization for Test Data

In [11]:
test_initial_tokens = assignment.tokenize(test_processed_data)
print(f"Total tokens after preprocessing: {len(test_initial_tokens)}")

Total tokens after preprocessing: 9956


In [12]:
_, test_initial_unigram_counts = assignment.unigram_model(test_initial_tokens)
print(f"Initial vocabulary size (unique words): {len(test_initial_unigram_counts)}")

Initial vocabulary size (unique words): 1729


In [13]:
# Replace rare words (frequency <= 1) with the <UNK> token.
test_tokens = assignment.handle_unknown_words(test_initial_tokens, test_initial_unigram_counts, min_freq=0)
# print("Total tokens after preprocessing: {len(test_tokens)}")

## Compute unigram probabilities

In [14]:
unigram_probabilities, unigram_counts = assignment.unigram_model(tokens)
print(f"Unique unigrams: {len(unigram_probabilities)}")

Unique unigrams: 1757


## Compute bigram probabilities

In [15]:
bigram_probabilities, bigram_counts = assignment.bigram_model(tokens)
print(f"Unique bigrams: {len(bigram_probabilities)}")

Unique bigrams: 27839


## Sample probabilities

In [16]:
assignment.print_sample_probabilities(unigram_probabilities, bigram_probabilities)

-------------------------- SAMPLE UNIGRAM PROBABILITIES --------------------------
P(<UNK>) = 0.0666
P(the) = 0.0583
P(.) = 0.0517
P(,) = 0.0325
P(and) = 0.0286
P(a) = 0.0247
P(to) = 0.0230
P(was) = 0.0201
P(i) = 0.0189
P(in) = 0.0139

-------------------------- SAMPLE BIGRAM PROBABILITIES --------------------------
P(suite|junior) = 1.0000
P(<UNK>|david) = 1.0000
P(room|pump) = 1.0000
P(to|due) = 1.0000
P(of|sort) = 1.0000
P(to|able) = 1.0000
P(to|forward) = 1.0000
P(club|health) = 1.0000
P(n't|ca) = 1.0000
P(away|blown) = 1.0000


## Export to CSV files for report

In [17]:
unigram_file, bigram_file, df_unigrams, df_bigrams = assignment.export_to_csv(
    unigram_probabilities, unigram_counts,
    bigram_probabilities, bigram_counts
)


-------------------------- EXPORT SUMMARY --------------------------
Unigrams exported:  1757 entries
Bigrams exported:  27839 entries


## Display top n-grams for verification

In [18]:
assignment.display_top_ngrams(df_unigrams, df_bigrams, n=10)


-------------------------- TOP  10  UNIGRAMS --------------------------
 word  count  probability
<UNK>   6048     0.066637
  the   5295     0.058340
    .   4692     0.051696
    ,   2949     0.032492
  and   2593     0.028570
    a   2246     0.024746
   to   2091     0.023039
  was   1828     0.020141
    i   1712     0.018863
   in   1259     0.013872

-------------------------- TOP  10  BIGRAMS --------------------------
         w1        w2                bigram  count  probability
continental breakfast continental breakfast      5          1.0
       pump      room             pump room     12          1.0
   supposed        to           supposed to     13          1.0
        due        to                due to     16          1.0
      based        on              based on     14          1.0
       able        to               able to     38          1.0
  attention        to          attention to      4          1.0
      heart        of              heart of      5       

# Apply Laplace Smoothing (k=1)

In [19]:
unigram_probs_laplace, unigram_counts = assignment.unigram_model_smoothed(tokens, k=1.0)
bigram_probs_laplace, bigram_counts = assignment.bigram_model_smoothed(tokens, k=1.0)

# Display Laplace Smoothed unigram and bigram probabilities

In [20]:
assignment.print_sample_probabilities(unigram_probs_laplace, bigram_probs_laplace)

-------------------------- SAMPLE UNIGRAM PROBABILITIES --------------------------
P(<UNK>) = 0.0654
P(the) = 0.0572
P(.) = 0.0507
P(,) = 0.0319
P(and) = 0.0280
P(a) = 0.0243
P(to) = 0.0226
P(was) = 0.0198
P(i) = 0.0185
P(in) = 0.0136

-------------------------- SAMPLE BIGRAM PROBABILITIES --------------------------
P(<s>|</s>) = 0.2257
P(the|in) = 0.1349
P(the|.) = 0.1347
P(the|at) = 0.1331
P(the|of) = 0.1226
P(the|on) = 0.0955
P(<UNK>|a) = 0.0907
P(hotel|this) = 0.0889
P(.|<UNK>) = 0.0861
P(<UNK>|the) = 0.0820


## Export to CSV files of Smoothed Probabilities

In [21]:
unigram_file_laplace, bigram_file_laplace, df_unigrams, df_bigrams = assignment.export_to_csv(
    unigram_probs_laplace, unigram_counts,
    bigram_probs_laplace, bigram_counts, unigrams_file="unigrams_smoothed.csv", bigrams_file="bigrams_smoothed.csv"
)


-------------------------- EXPORT SUMMARY --------------------------
Unigrams exported:  1757 entries
Bigrams exported:  3087049 entries


## Display top smoothed n-grams for verfication

In [22]:
assignment.display_top_ngrams(df_unigrams, df_bigrams, n=10)


-------------------------- TOP  10  UNIGRAMS --------------------------
 word  count  probability
<UNK>   6048     0.065382
  the   5295     0.057243
    .   4692     0.050725
    ,   2949     0.031886
  and   2593     0.028038
    a   2246     0.024287
   to   2091     0.022612
  was   1828     0.019769
    i   1712     0.018515
   in   1259     0.013619

-------------------------- TOP  10  BIGRAMS --------------------------
   w1    w2     bigram  count  probability
 </s>   <s>   </s> <s>    511     0.225650
   in   the     in the    406     0.134947
    .   the      . the    868     0.134750
   at   the     at the    332     0.133094
   of   the     of the    343     0.122594
   on   the     on the    228     0.095536
    a <UNK>    a <UNK>    362     0.090682
 this hotel this hotel    208     0.088898
<UNK>     .    <UNK> .    671     0.086099
  the <UNK>  the <UNK>    577     0.081963


# Apply Smoothing: Add-k (k=0.5)

In [23]:
unigram_probs_add05, unigram_counts = assignment.unigram_model_smoothed(tokens, k=0.5)
bigram_probs_add05, bigram_counts = assignment.bigram_model_smoothed(tokens, k=0.5)

# Display Add-k Smoothed unigram and bigram probabilities

In [24]:
assignment.print_sample_probabilities(unigram_probs_add05, bigram_probs_add05)

-------------------------- SAMPLE UNIGRAM PROBABILITIES --------------------------
P(<UNK>) = 0.0660
P(the) = 0.0578
P(.) = 0.0512
P(,) = 0.0322
P(and) = 0.0283
P(a) = 0.0245
P(to) = 0.0228
P(was) = 0.0200
P(i) = 0.0187
P(in) = 0.0137

-------------------------- SAMPLE BIGRAM PROBABILITIES --------------------------
P(<s>|</s>) = 0.3679
P(the|at) = 0.2048
P(the|in) = 0.1902
P(the|of) = 0.1782
P(the|.) = 0.1559
P(the|on) = 0.1505
P(hotel|this) = 0.1416
P(<num>|$) = 0.1384
P(was|it) = 0.1219
P(the|from) = 0.1202


## Computing Perplexity for the un-smoothed training data

In [25]:
p1 = assignment.calculate_perplexity_unigram(tokens, unigram_probabilities)
p2 = assignment.calculate_perplexity_bigram(tokens, bigram_probabilities, unigram_probabilities)

In [26]:
p1

247.55057125002799

In [27]:
p2

32.29509071773678

## Computing Perplexity for the add-k smoothed training data [with k=0.5]:

In [28]:
p3 = assignment.calculate_perplexity_unigram(tokens, unigram_probs_add05)
p4 = assignment.calculate_perplexity_bigram(tokens, bigram_probs_add05, unigram_probs_add05)

In [29]:
p3

247.60531672269684

In [30]:
p4

138.78916853651518

## Computing Perplexity for the laplace smoothed training data:

In [31]:
p5 = assignment.calculate_perplexity_unigram(tokens, unigram_probs_laplace)
p6 = assignment.calculate_perplexity_bigram(tokens, bigram_probs_laplace, unigram_probs_laplace)

In [32]:
p5

247.75652625415904

In [33]:
p6

196.69982420357786

## Computing Perplexity for the un-smoothed test(validation) data

In [34]:
p7 = assignment.calculate_perplexity_unigram(test_tokens, unigram_probabilities)
p8 = assignment.calculate_perplexity_bigram(test_tokens, bigram_probabilities, unigram_probabilities)

In [35]:
p7

231.86077727545114

In [36]:
p8

57.02984483256454

## Computing Perplexity for the laplace smoothed test(validation) data:

In [37]:
p9 = assignment.calculate_perplexity_unigram(test_tokens, unigram_probs_laplace)
p10 = assignment.calculate_perplexity_bigram(test_tokens, bigram_probs_laplace, unigram_probs_laplace)

In [38]:
p9

232.4233020939776

In [39]:
p10

204.96389763585861

## Computing Perplexity for the add-k smoothed test(validation) data:

In [40]:
p11 = assignment.calculate_perplexity_unigram(test_tokens, unigram_probs_add05)
p12 = assignment.calculate_perplexity_bigram(test_tokens, bigram_probs_add05, unigram_probs_add05)

In [41]:
p11

232.10437995410112

In [42]:
p12

161.55801279313644

#Computing Add-k Smoothing with k=0.1 and Perplexity [train and validation set]

In [43]:
unigram_probs_add01, unigram_counts = assignment.unigram_model_smoothed(tokens, k=0.1)
bigram_probs_add01, bigram_counts = assignment.bigram_model_smoothed(tokens, k=0.1)

# train set with k=0.1
p13 = assignment.calculate_perplexity_unigram(tokens, unigram_probs_add01)
p14 = assignment.calculate_perplexity_bigram(tokens, bigram_probs_add01, unigram_probs_add01)

# test set with k=0.1
p15 = assignment.calculate_perplexity_unigram(test_tokens, unigram_probs_add01)
p16 = assignment.calculate_perplexity_bigram(test_tokens, bigram_probs_add01, unigram_probs_add01)

In [44]:
p13

247.55287969926633

In [45]:
p14

69.58610266404538

In [46]:
p15

231.9027569696718

In [47]:
p16

108.37057740343162

# Computing Add-k Smoothing with k=0.05 and Perplexity [train and validation set]

In [48]:
unigram_probs_add005, unigram_counts = assignment.unigram_model_smoothed(tokens, k=0.005)
bigram_probs_add005, bigram_counts = assignment.bigram_model_smoothed(tokens, k=0.005)

# train set with k=0.05
p17 = assignment.calculate_perplexity_unigram(tokens, unigram_probs_add005)
p18 = assignment.calculate_perplexity_bigram(tokens, bigram_probs_add005, unigram_probs_add005)

# test set with k=0.05
p19 = assignment.calculate_perplexity_unigram(test_tokens, unigram_probs_add005)
p20 = assignment.calculate_perplexity_bigram(test_tokens, bigram_probs_add005, unigram_probs_add005)

In [49]:
p17

247.5505770975103

In [50]:
p18

36.479045208539695

In [51]:
p19

231.86279082452248

In [52]:
p20

98.71689407165164

## Displaying the Final Results

In [53]:
print("-------------------------- FINAL RESULTS --------------------------")
print("\n--------------- PERPLEXITY VALUES OF TRAINING DATA --------------")
print("\nFor un-smoothed data: Unigram Model perplexity: "+ str(p1) + " and Bigram Model perplexity: " + str(p2))
print("\nFor Laplace smoothed data: Unigram Model perplexity: "+ str(p5) + " and Bigram Model perplexity: " + str(p6))
print("\nFor Add-k (k = 0.5) smoothed data: Unigram Model perplexity: "+ str(p3) + " and Bigram Model perplexity: " + str(p4))
print("\nFor Add-k (k = 0.1) smoothed data: Unigram Model perplexity: "+ str(p13) + " and Bigram Model perplexity: " + str(p14))
print("\nFor Add-k (k = 0.05) smoothed data: Unigram Model perplexity: "+ str(p17) + " and Bigram Model perplexity: " + str(p18))
print("\n------------ PERPLEXITY VALUES OF VALIDATION DATA ---------------")
print("\nFor un-smoothed data: Unigram Model validation perplexity is "+ str(p7) + " and Bigram Model perplexity: " + str(p8))
print("\nFor Laplace smoothed data: Unigram Model perplexity: "+ str(p9) + " and Bigram Model perplexity: " + str(p10))
print("\nFor Add-k (k = 0.5) smoothed data: Unigram Model perplexity: "+ str(p11) + " and Bigram Model perplexity: " + str(p12))
print("\nFor Add-k (k = 0.1) smoothed data: Unigram Model perplexity: "+ str(p15) + " and Bigram Model perplexity: " + str(p16))
print("\nFor Add-k (k = 0.05) smoothed data: Unigram Model perplexity: "+ str(p19) + " and Bigram Model perplexity: " + str(p20))

-------------------------- FINAL RESULTS --------------------------

--------------- PERPLEXITY VALUES OF TRAINING DATA --------------

For un-smoothed data: Unigram Model perplexity: 247.55057125002799 and Bigram Model perplexity: 32.29509071773678

For Laplace smoothed data: Unigram Model perplexity: 247.75652625415904 and Bigram Model perplexity: 196.69982420357786

For Add-k (k = 0.5) smoothed data: Unigram Model perplexity: 247.60531672269684 and Bigram Model perplexity: 138.78916853651518

For Add-k (k = 0.1) smoothed data: Unigram Model perplexity: 247.55287969926633 and Bigram Model perplexity: 69.58610266404538

For Add-k (k = 0.05) smoothed data: Unigram Model perplexity: 247.5505770975103 and Bigram Model perplexity: 36.479045208539695

------------ PERPLEXITY VALUES OF VALIDATION DATA ---------------

For un-smoothed data: Unigram Model validation perplexity is 231.86077727545114 and Bigram Model perplexity: 57.02984483256454

For Laplace smoothed data: Unigram Model perple

In [54]:
import pandas as pd

print("=" * 90)
print(" " * 25 + "PERPLEXITY RESULTS")
print("=" * 90)

# Create data for the table
data = {
    'Smoothing Method': [
        'Un-smoothed',
        'Laplace (k=1)',
        'Add-k (k=0.5)',
        'Add-k (k=0.1)',
        'Add-k (k=0.05)'
    ],
    'Training Unigram': [p1, p5, p3, p13, p17],
    'Training Bigram': [p2, p6, p4, p14, p18],
    'Validation Unigram': [p7, p9, p11, p15, p19],
    'Validation Bigram': [p8, p10, p12, p16, p20]
}

# Create DataFrame
df = pd.DataFrame(data)

# Display the table
print("\n" + df.to_string(index=False))
print("\n" + "=" * 90)

# The Perplexity results are saved to CSV file
df.to_csv('perplexity_results.csv', index=False)

                         PERPLEXITY RESULTS

Smoothing Method  Training Unigram  Training Bigram  Validation Unigram  Validation Bigram
     Un-smoothed        247.550571        32.295091          231.860777          57.029845
   Laplace (k=1)        247.756526       196.699824          232.423302         204.963898
   Add-k (k=0.5)        247.605317       138.789169          232.104380         161.558013
   Add-k (k=0.1)        247.552880        69.586103          231.902757         108.370577
  Add-k (k=0.05)        247.550577        36.479045          231.862791          98.716894

