# Download and Process the IMDB Dataset

In [None]:
# 0. Install gdown, the command-line helper used below to fetch the archive
#    from Google Drive.
!pip install --quiet gdown

# 1. Download the zipped IMDB dataset from Drive.
# The archive holds the unlabeled ("unsup") part of the dataset published at
# https://ai.stanford.edu/~amaas/data/sentiment/

!gdown "https://drive.google.com/uc?id=1PjJ5cop0pT6tcEw9-ZUstVMujx-o-QTB" -O imdb_dataset.zip

# 2. Unzip the downloaded file into ./imdb_data (quiet mode)
!unzip -q imdb_dataset.zip -d imdb_data


Downloading...
From (original): https://drive.google.com/uc?id=1PjJ5cop0pT6tcEw9-ZUstVMujx-o-QTB
From (redirected): https://drive.google.com/uc?id=1PjJ5cop0pT6tcEw9-ZUstVMujx-o-QTB&confirm=t&uuid=1ace5eb1-0959-41aa-bde3-549a43beb9f8
To: /content/imdb_dataset.zip
100% 44.7M/44.7M [00:00<00:00, 63.0MB/s]


In [None]:
import os
import re
import string
import random
from collections import defaultdict, Counter
import math
from math import log, exp


In [None]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK resources used in this cell. "stopwords" is read by
# stopwords.words(); "wordnet" is presumably needed by WordNetLemmatizer.
nltk.download("stopwords")
nltk.download("wordnet")

# Small hand-picked stopword set.
# NOTE(review): not referenced anywhere in the visible part of the notebook.
CUSTOM_STOPWORDS = {"the", "a", "an", "is", "are", "was", "were", "this", "that", "to", "of", "and", "on"}


# NLTK's English stopword list, plus a stemmer and a lemmatizer instance.
# NOTE(review): STOP_WORDS, stemmer and lemmatizer are not used by the visible
# preprocessing functions; confirm whether they are still needed.
STOP_WORDS = set(stopwords.words("english"))  # Load stopwords
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def load_imdb_unsup_sentences(folder_path):
    """
    Read every regular `.txt` file directly inside `folder_path` and return
    its non-empty lines as a flat list of strings.

    Each line is stripped of surrounding whitespace first; lines that are
    empty after stripping are dropped. In the remaining lines every
    `<br>` / `<br />` tag is replaced by the marker ` <nl> `.

    Files are visited in the order `os.listdir` reports them, which Python
    documents as arbitrary, so the order of the returned list can differ
    between machines.
    """
    collected = []

    for entry in os.listdir(folder_path):
        if not entry.endswith('.txt'):
            continue
        full_path = os.path.join(folder_path, entry)
        if not os.path.isfile(full_path):
            # Skip directories (or other non-files) that merely end in .txt.
            continue

        with open(full_path, 'r', encoding='utf-8') as handle:
            stripped_lines = (raw.strip() for raw in handle)
            # The generator is consumed here, while the file is still open.
            collected.extend(
                re.sub(r"<br\s*/?>", " <nl> ", text)
                for text in stripped_lines
                if text
            )

    return collected

def remove_punctuation(text):
    """
    Delete all ASCII punctuation from `text` and normalise its whitespace.

    Steps:
      1. Any `<br>` / `<br />` tag still present is rewritten to ` <nl> `.
      2. Every character in `string.punctuation` is removed. This includes
         apostrophes ("it's" -> "its") and the angle brackets of the `<nl>`
         marker, so the marker survives only as the bare token `nl`.
      3. Runs of whitespace are collapsed to single spaces and leading /
         trailing whitespace is dropped.

    Stopwords are NOT removed. Earlier versions loaded the NLTK stopword
    list on every call without using it; that per-call load has been dropped
    because it only cost time and did not affect the result.

    Args:
      text: Input string.

    Returns:
      The cleaned string, with tokens separated by single spaces.
    """
    text = re.sub(r"<br\s*/?>", " <nl> ", text)  # Ensure <br /> becomes <nl>
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    text = re.sub(punctuation_class, "", text)
    # split() with no argument discards empty strings, so re-joining the
    # pieces collapses any whitespace run left behind by the removals.
    return " ".join(text.split())

def build_vocabulary(sentences):
    """
    Collect the set of distinct tokens that occur in `sentences`.

    Every sentence is lower-cased, cleaned with `remove_punctuation`, and
    split on whitespace; all resulting tokens are gathered into one set.

    Args:
      sentences: Iterable of raw sentence strings.

    Returns:
      A set of token strings.
    """
    return {
        token
        for sentence in sentences
        for token in remove_punctuation(sentence.lower()).split()
    }

def tokinize(sentences, vocab, unknown="<UNK>"):
    """
    Convert raw sentences into lists of tokens restricted to `vocab`.

    Each sentence is lower-cased, cleaned with `remove_punctuation`, and
    split on whitespace. Tokens that are not in `vocab` are replaced by
    `unknown`.

    Args:
      sentences: Iterable of raw sentence strings.
      vocab: Container of known tokens (membership is tested with `in`).
      unknown: Replacement for out-of-vocabulary tokens.

    Returns:
      A list with one token list per input sentence, in input order.
    """
    result = []

    for raw in sentences:
        words = remove_punctuation(raw.lower()).split()
        result.append([word if word in vocab else unknown for word in words])

    return result


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Folder produced by the unzip step above; it holds the "unsup" review files.
imdb_folder = "imdb_data/unsup"
# Every non-empty line of every .txt file in that folder becomes one entry.
sentences = load_imdb_unsup_sentences(imdb_folder)

# Quick look at how much was loaded and what the raw text looks like.
print(f"Number of raw sentences loaded: {len(sentences)}")
print(f"Example (first 2 sentences):\n{sentences[:2]}")


Number of raw sentences loaded: 50000
Example (first 2 sentences):
['I saw this movie a while ago, awaiting a good and critical movie. I´m not afraid of a little violence in movies, but in this movie it just made no sense. Random useless violence all the time. There just wasn´t any goal. Maybe the director wanted to bring this feeling to the viewers, but that could have been done in much better ways. <nl>  <nl> The characters in this movie also have a complete lack of personality. The are flat characters who just commit violent stuff. <nl>  <nl> A ´not so good´ movie which stands in no comparison to American History X and Romperstomper.', "Production house Amicus had a rich tradition in serving the so-called horror omnibuses. Long feature films telling three separate tales that mix humor and horror. The Monster Club was the last one and I can't really say I'm sorry for that. I didn't enjoy this film at all since it only has a few good moments and even those aren't highly memorable. Two

In [None]:
# Sanity check: the loader must have produced exactly 50,000 entries.
assert len(sentences) == 50000, "Expected 50,000 sentences from the unsup folder."

In [None]:
# Seed the global RNG so that split_data (called without `seed`) and the text
# generation further down are repeatable when the notebook is run top to bottom.
random.seed(42)

def split_data(sentences, test_split=0.1, seed=None):
    """
    Shuffle `sentences` and split them into a training and a test part.

    The input is copied first, so the caller's sequence is never modified.
    The first `int(len(sentences) * (1 - test_split))` shuffled items form
    the training set; the remainder forms the test set.

    Args:
      sentences: Sequence of items to split (list, tuple, ...).
      test_split: Fraction of the data reserved for the test set.
      seed: Optional seed. When given, a private `random.Random(seed)` does
        the shuffling, so the split is reproducible regardless of the state
        of the global RNG (for example when cells are re-run out of order).
        When None (default), the global `random` module is used, exactly as
        before.

    Returns:
      A tuple `(train_sentences, test_sentences)` of two lists.
    """
    shuffled_sentences = list(sentences)  # Copy so the original is untouched
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled_sentences)

    split_idx = int(len(shuffled_sentences) * (1 - test_split))
    train_sentences = shuffled_sentences[:split_idx]
    test_sentences = shuffled_sentences[split_idx:]

    return train_sentences, test_sentences


In [None]:
# Shuffle and split with the default test_split=0.1.
train_sentences, test_sentences = split_data(sentences)

print(f"Number of training sentences: {len(train_sentences)}")
print(f"Number of test sentences: {len(test_sentences)}")

Number of training sentences: 45000
Number of test sentences: 5000


In [None]:
# Sanity check: a 0.1 test split of 50,000 entries gives 45,000 / 5,000.
assert len(train_sentences) == 45000, "Expected 45,000 sentences for training."
assert len(test_sentences) == 5000, "Expected 5,000 sentences for testing."


In [None]:
# The vocabulary comes from the training split only, and the same training
# sentences are then tokenized with it, so no token here maps to <UNK>.
vocab = build_vocabulary(train_sentences)
tokenized_sentences = tokinize(train_sentences, vocab)

print(f"Vocabulary size: {len(vocab)}")
# Show at most the first 200 tokens of the first training sentence.
print(f"Example tokens from first sentence: {tokenized_sentences[0][:200] if tokenized_sentences else 'No tokens loaded'} ...")


Vocabulary size: 161276
Example tokens from first sentence: ['peruvian', 'writerdirector', 'josue', 'mendez', 'has', 'made', 'a', 'brave', 'little', 'low', 'budget', 'film', 'that', 'deals', 'with', 'a', 'subject', 'currently', 'burgeoning', 'our', 'hospitals', 'in', 'this', 'country', 'as', 'the', 'fallout', 'of', 'the', 'war', 'on', 'iraq', 'and', 'still', 'plagues', 'the', 'veterans', 'of', 'the', 'vietnam', 'war', 'post', 'traumatic', 'stress', 'syndrome', 'aka', 'battle', 'rattle', 'this', 'is', 'a', 'difficult', 'topic', 'to', 'dramatize', 'without', 'being', 'preachy', 'or', 'maudlin', 'but', 'mendez', 'has', 'succeeded', 'where', 'others', 'have', 'failed', 'nl', 'nl', 'santiago', 'pietro', 'sibille', 'is', 'a', '23yearold', 'retired', 'veteran', 'who', 'was', 'conscripted', 'at', 'age', '16', 'and', 'trained', 'to', 'be', 'a', 'killer', 'assigned', 'to', 'fighting', 'in', 'the', 'war', 'against', 'ecuador', 'against', 'terrorists', 'and', 'against', 'the', 'drug', 'mafia', 'he

In [None]:
# The message now states the same number the assertion checks (161,276); it
# previously quoted a stale value.
assert len(vocab) == 161276, "Expected a vocabulary size of 161,276."
assert len(tokenized_sentences) == 45000, "Expected tokenized sentences count to match raw sentences."

# Fixed example used to check the tokenizer end to end.
example = "I love Natural language processing, and i want to be a great engineer."
assert len(example) == 70, "Example sentence length (in characters) does not match the expected 70."

# Punctuation is stripped and the text is split on whitespace: 13 tokens.
example_tokens = tokinize([example], vocab)[0]
assert len(example_tokens) == 13, "Token count for the example sentence does not match the expected 13."


In [None]:
def pad_sentence(tokens, n):
    """
    Return a new list with boundary markers added around `tokens`.

    `n - 1` copies of '<s>' are placed in front and a single '</s>' is
    appended, so every real token has a full (n-1)-token history. The input
    list is not modified.
    """
    start_markers = ['<s>'] * (n - 1)
    end_marker = ['</s>']
    return start_markers + tokens + end_marker


def build_ngram_counts(tokenized_sentences, n):
    """
    Count n-grams and their (n-1)-token contexts over a tokenized corpus.

    Each sentence is padded with `pad_sentence` and scanned with a sliding
    window of width `n`. For every window position the full n-gram and its
    leading (n-1)-token context are each counted once.

    Args:
      tokenized_sentences: Iterable of token lists.
      n: Order of the n-gram model.

    Returns:
      `(ngram_counts, context_counts)`, two `Counter` objects keyed by tuples
      of tokens.
    """
    ngram_counts, context_counts = Counter(), Counter()

    for tokens in tokenized_sentences:
        padded = pad_sentence(tokens, n)
        last_start = len(padded) - n

        for start in range(last_start + 1):
            stop = start + n
            ngram_counts[tuple(padded[start:stop])] += 1
            context_counts[tuple(padded[start:stop - 1])] += 1

    return ngram_counts, context_counts


def laplace_probability(ngram, ngram_counts, context_counts, vocab_size, alpha=1.0):
    """
    Additively smoothed probability of the last token of `ngram` given the
    tokens before it.

    Computes `(count(ngram) + alpha) / (count(context) + alpha * vocab_size)`,
    where `context` is `ngram` without its last element. Unseen n-grams and
    contexts count as zero.

    Args:
      ngram: Tuple of tokens.
      ngram_counts: Mapping from n-gram tuples to counts.
      context_counts: Mapping from context tuples to counts.
      vocab_size: Vocabulary size used in the smoothing denominator.
      alpha: Smoothing constant (1.0 gives add-one smoothing).

    Returns:
      The smoothed probability as a float.
    """
    history = ngram[:-1]
    numerator = ngram_counts.get(ngram, 0) + alpha
    denominator = context_counts.get(history, 0) + alpha * vocab_size
    return numerator / denominator


In [None]:
# Order of the n-gram model; the counts below are built for this order.
n = 4
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)
# The label follows `n`; it used to say "bigrams" even though n is 4.
print(f"Number of {n}-grams: {len(ngram_counts)}")
print(f"Number of contexts: {len(context_counts)}")



Number of bigrams: 8808241
Number of contexts: 6085736


In [None]:
import random
from math import log, exp

def predict_next_token(
    context_tokens,
    ngram_counts,
    context_counts,
    vocab,
    n,
    alpha=1.0,
    top_k=5
):
    """
    Rank the words of `vocab` by their smoothed probability of following
    `context_tokens` and return the `top_k` best.

    The conditioning context is the last `n - 1` tokens of `context_tokens`.
    If fewer are available, the context is left-padded with '<s>', the same
    marker `pad_sentence` puts in front of training sentences. Without that
    padding a short prompt yields a lookup key of the wrong length, no
    n-gram ever matches, and every word ties at the same probability.

    Args:
      context_tokens: Sequence of tokens generated or supplied so far.
      ngram_counts: Mapping from n-gram tuples to counts.
      context_counts: Mapping from (n-1)-gram tuples to counts.
      vocab: Collection of candidate words (iterated; `len` is used as the
        vocabulary size in the smoothing denominator).
      n: Order of the n-gram model the counts were built for.
      alpha: Additive smoothing constant.
      top_k: Number of candidates to return.

    Returns:
      A list of `(word, probability)` pairs sorted by descending probability.
      Ties keep the iteration order of `vocab`. Empty if `vocab` is empty.
    """
    history_len = n - 1
    if history_len > 0:
        history = list(context_tokens[-history_len:])
        # Left-pad short histories so the key always has n-1 tokens.
        history = ['<s>'] * (history_len - len(history)) + history
    else:
        # Unigram model: no history. Slicing with [-0:] would return the
        # whole list rather than an empty one, so handle it explicitly.
        history = []
    context = tuple(history)

    # The context count and the denominator do not depend on the candidate.
    denominator = context_counts.get(context, 0) + alpha * len(vocab)

    candidates = [
        (word, (ngram_counts.get(context + (word,), 0) + alpha) / denominator)
        for word in vocab
    ]

    # Sort candidates by probability (highest first); the sort is stable.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return candidates[:top_k] if candidates else []  # Return top_k words

def generate_text_with_limit(  start_tokens,
    ngram_counts,
    context_counts,
    vocab,
    n,
    alpha=1.0,
    max_length=20,
    top_k=5
):
    """
    Extend `start_tokens` by sampling one token at a time from the model.

    At each step `predict_next_token` supplies the `top_k` most probable
    continuations of the sequence so far; the next token is drawn from them
    with `random.choices`, weighted by their probabilities, rather than
    always taking the single best one. Generation stops when the sequence
    holds `max_length` tokens, when no candidates are returned, or when the
    sampled token is "</s>".

    Args:
      start_tokens: Iterable of tokens to start from (copied, not modified).
      ngram_counts: Mapping from n-gram tuples to counts.
      context_counts: Mapping from (n-1)-gram tuples to counts.
      vocab: Collection of candidate words.
      n: Order of the n-gram model.
      alpha: Additive smoothing constant.
      max_length: Maximum total length of the returned sequence.
      top_k: Number of candidates sampled from at each step.

    Returns:
      The generated sequence as a list of tokens, start tokens included.
    """
    sequence = list(start_tokens)
    steps_left = max_length - len(start_tokens)

    while steps_left > 0:
        steps_left -= 1

        ranked = predict_next_token(
            sequence, ngram_counts, context_counts, vocab, n, alpha, top_k
        )
        if not ranked:
            break

        options = [word for word, _ in ranked]
        weights = [prob for _, prob in ranked]
        sampled = random.choices(options, weights=weights, k=1)[0]

        if sampled == "</s>":
            break

        sequence.append(sampled)

    return sequence

# Prompt tokens the generated sequence starts from.
context = ["i", "loved"]
# NOTE(review): n is hard-coded to 4 here; it must equal the order that
# ngram_counts / context_counts were built with (the variable `n` above).
generated_seq = generate_text_with_limit(
    start_tokens=context,
    ngram_counts=ngram_counts,
    context_counts=context_counts,
    vocab=vocab,
    n=4,
    alpha=1.0,
    max_length=128
)

print("Generated Sequence:", " ".join(generated_seq))


Generated Sequence: i loved masterowner overstretch spiderloser battlegrounds masterowner masterowner battlegrounds overstretch battlegrounds masterowner blight spiderloser blight spiderloser battlegrounds battlegrounds masterowner spiderloser overstretch blight spiderloser masterowner battlegrounds masterowner battlegrounds spiderloser overstretch blight blight overstretch battlegrounds battlegrounds overstretch battlegrounds spiderloser battlegrounds battlegrounds spiderloser spiderloser overstretch masterowner blight battlegrounds blight battlegrounds overstretch overstretch blight masterowner masterowner masterowner overstretch masterowner battlegrounds spiderloser overstretch blight blight spiderloser blight battlegrounds spiderloser spiderloser overstretch overstretch overstretch blight blight battlegrounds blight overstretch masterowner spiderloser overstretch spiderloser masterowner overstretch blight blight spiderloser masterowner blight overstretch overstretch spiderloser mas

In [None]:
from math import log, exp

def calculate_perplexity(tokenized_sentences, ngram_counts, context_counts, vocab_size, n, alpha=1.0):
    """
    Calculates the perplexity of an additively smoothed n-gram model.

    Every sentence is padded with `pad_sentence` and scored n-gram by
    n-gram with

        P(w | context) = (count(ngram) + alpha) / (count(context) + alpha * vocab_size)

    Perplexity is `exp(-mean(log P))` over all scored n-grams.

    Counts are read with `.get(..., 0)`, so the function works with any
    mapping: a plain dict does not raise on unseen n-grams, and a
    defaultdict is not grown as a side effect of evaluation.

    Args:
      tokenized_sentences: List of lists of tokens.
      ngram_counts: Mapping of n-gram tuples to counts.
      context_counts: Mapping of (n-1)-gram tuples to counts.
      vocab_size: Size of the vocabulary.
      n: n-gram order.
      alpha: Smoothing parameter (1.0 gives Laplace / add-one smoothing).

    Returns:
      A float representing the perplexity. `float("inf")` is returned when
      there is nothing to score or when some n-gram has zero probability
      (possible only with alpha == 0).
    """
    log_prob_sum = 0.0
    token_count = 0

    for sentence in tokenized_sentences:
        padded_sentence = pad_sentence(sentence, n)  # Pad sentence with <s> and </s>
        for i in range(len(padded_sentence) - n + 1):
            ngram = tuple(padded_sentence[i : i + n])
            context = ngram[:-1]

            ngram_freq = ngram_counts.get(ngram, 0) + alpha
            context_freq = context_counts.get(context, 0) + (alpha * vocab_size)

            # A zero-probability event makes the perplexity infinite; report
            # that instead of failing inside log() or the division.
            if ngram_freq <= 0 or context_freq <= 0:
                return float("inf")

            log_prob_sum += log(ngram_freq / context_freq)
            token_count += 1

    if token_count == 0:
        return float("inf")
    return exp(-log_prob_sum / token_count)

# Perplexity on the training sentences, i.e. the data the counts come from.
perplexity = calculate_perplexity(
    tokenized_sentences,
    ngram_counts,
    context_counts,
    vocab_size=len(vocab),
    n=4,
    alpha=1.0
)
print(f"Perplexity: {perplexity}")

# Perplexity on the held-out split, which was otherwise never evaluated.
# Test tokens missing from the training vocabulary become <UNK>.
test_tokenized_sentences = tokinize(test_sentences, vocab)
test_perplexity = calculate_perplexity(
    test_tokenized_sentences,
    ngram_counts,
    context_counts,
    vocab_size=len(vocab),
    n=4,
    alpha=1.0
)
print(f"Test perplexity: {test_perplexity}")


Perplexity: 56621.75317998516


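# **Analysis**
Perplexity of the add-one (α = 1) smoothed model for each n-gram order. The perplexity cell above scores the training sentences, so these are training-set values:

n=2 Perplexity: 2931.987399

n=3 Perplexity: 24666.424455167256

n=4 Perplexity: 56621.75317998516

Perplexity rises with n. Higher-order contexts are seen far less often, so the α·|V| term in the smoothed denominator (|V| = 161,276) outweighs the observed counts and pushes the predicted distribution toward uniform.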