<a href="https://colab.research.google.com/github/malakiss/N_gram_nlp/blob/main/N_gram_Ass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download and Process the IMDB Dataset

In [None]:
!pip install --quiet gdown

# 1. Download the zipped IMDB dataset from Drive
# this is the unsup part of https://ai.stanford.edu/~amaas/data/sentiment/

!gdown "https://drive.google.com/uc?id=1PjJ5cop0pT6tcEw9-ZUstVMujx-o-QTB" -O imdb_dataset.zip

# 2. Unzip the downloaded file
!unzip -q imdb_dataset.zip -d imdb_data


Downloading...
From (original): https://drive.google.com/uc?id=1PjJ5cop0pT6tcEw9-ZUstVMujx-o-QTB
From (redirected): https://drive.google.com/uc?id=1PjJ5cop0pT6tcEw9-ZUstVMujx-o-QTB&confirm=t&uuid=5d11bccf-25e5-4727-aa42-0671d8a0eb12
To: /content/imdb_dataset.zip
100% 44.7M/44.7M [00:00<00:00, 120MB/s]
replace imdb_data/unsup/0_0.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
A


In [None]:
import os
import re
import string
import random
from collections import defaultdict, Counter
import math
from math import log, exp


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

def load_imdb_unsup_sentences(folder_path):
    """
    Collect the non-empty lines of every `.txt` file directly inside `folder_path`.

    Each line is stripped of surrounding whitespace; blank lines are skipped.
    HTML line breaks (`<br>`, `<br/>`, `<br />`) are replaced by the marker
    ` <nl> `. Entries that are not regular files or do not end in `.txt` are
    ignored. Files are visited in the order returned by `os.listdir`.

    Returns:
        list[str]: one string per non-empty line.
    """
    br_tag = re.compile(r"<br\s*/?>")
    collected = []

    for entry in os.listdir(folder_path):
        if not entry.endswith('.txt'):
            continue
        full_path = os.path.join(folder_path, entry)
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r', encoding='utf-8') as handle:
            stripped_lines = (raw.strip() for raw in handle)
            collected.extend(
                br_tag.sub(" <nl> ", text) for text in stripped_lines if text
            )

    return collected

def remove_punctuation(text):
    """
    Strip punctuation from `text` while keeping `<nl>` line-break markers intact.

    - `<br>`, `<br/>` and `<br />` tags are first converted to ` <nl> `.
    - Every character in `string.punctuation` (apostrophes included) is removed
      from the remaining text; the literal marker `<nl>` is left untouched so it
      stays distinguishable from an ordinary word.
    - Runs of whitespace are collapsed to single spaces and the ends are trimmed.

    No stopword removal, stemming or case folding is performed here.

    Args:
        text (str): raw text.

    Returns:
        str: the cleaned, single-space separated text.
    """
    text = re.sub(r"<br\s*/?>", " <nl> ", text)  # Ensure <br /> becomes <nl>
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"

    # Split on the marker with a capturing group so the marker itself is kept as
    # its own piece; stripping punctuation from the whole string would turn
    # "<nl>" into the plain word "nl".
    pieces = re.split(r"(<nl>)", text)
    cleaned_pieces = [
        piece if piece == "<nl>" else re.sub(punctuation_pattern, "", piece)
        for piece in pieces
    ]

    # split()/join normalises whitespace and drops pieces that became empty.
    return " ".join(" ".join(cleaned_pieces).split())

def build_vocabulary(sentences):
    """
    Return the set of distinct tokens found in `sentences`.

    Every sentence is lowercased, passed through `remove_punctuation`, and
    split on whitespace; all resulting tokens are gathered into one set.
    """
    return {
        token
        for sentence in sentences
        for token in remove_punctuation(sentence.lower()).split()
    }
def tokinize(sentences, vocab, unknown="<UNK>"):
    """
    Turn each sentence into a list of tokens restricted to `vocab`.

    A sentence is lowercased, cleaned with `remove_punctuation`, and split on
    whitespace. Any token that is not a member of `vocab` is replaced by
    `unknown`.

    Returns:
        list[list[str]]: one token list per input sentence, in input order.
    """
    result = []
    for raw_sentence in sentences:
        words = remove_punctuation(raw_sentence.lower()).split()
        mapped = []
        for word in words:
            mapped.append(word if word in vocab else unknown)
        result.append(mapped)
    return result

In [None]:
# Folder the dataset archive is extracted into (see the unzip step above).
imdb_folder = "imdb_data/unsup"
# One entry per non-empty line of every .txt file in the folder.
sentences = load_imdb_unsup_sentences(imdb_folder)

# Quick look at how much was loaded and what the raw text looks like.
print(f"Number of raw sentences loaded: {len(sentences)}")
print(f"Example (first 2 sentences):\n{sentences[:2]}")


Number of raw sentences loaded: 50000
Example (first 2 sentences):
["For the particular movie fan, The Beat That My Heart Skipped, is a slice of intensity, wonder, and subtlety that can only come from Europe. The director/co-writer, Jacques Audiard, has taken a film previously made by James Toback called Fingers, starring Harvey Keitel in the role now occupied by Romain Duris, and made it his own. If I had seen the original version I would make a couple of comparisons to it (at the least, for those who didn't see the original the remake makes you want to check out the original, if only for the acting appeal of Keitel). However I did think about another wonderful French film in the vein of this film- Francois Truffaut's Shoot the Piano Player. <nl>  <nl> While Truffaut's film is a little more concerned about the lead's relationship(s) with women, I felt a kind of connection between the material of the two pieces- sometimes intense, usually lyrical, tales of a person trying to find what 

In [None]:
# Sanity check on the loader: fail early if the extracted data is incomplete.
assert len(sentences) == 50000, "Expected 50,000 sentences from the unsup folder."

In [None]:
# Seed the global RNG so the default (seed=None) split below is reproducible
# when the notebook is run top to bottom.
random.seed(42)

def split_data(sentences, test_split=0.1, seed=None):
    """
    Shuffle `sentences` and split them into train and test sets.

    The input is copied, so the caller's sequence is never modified. After
    shuffling, the first `1 - test_split` fraction becomes the training set and
    the remainder the test set.

    Args:
        sentences: sequence of items to split.
        test_split (float): fraction of items placed in the test set; must lie
            in [0, 1].
        seed: optional seed. When given, a private `random.Random(seed)` does
            the shuffling, so repeated calls return the same split regardless of
            the global RNG state. When None (default), the module-level
            `random` generator is used, exactly as before.

    Returns:
        tuple[list, list]: (train_sentences, test_sentences).

    Raises:
        ValueError: if `test_split` is outside [0, 1].
    """
    if not 0.0 <= test_split <= 1.0:
        raise ValueError(f"test_split must be between 0 and 1, got {test_split!r}")

    shuffled_sentences = list(sentences)  # Copy to avoid modifying the original
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(shuffled_sentences)

    split_idx = int(len(shuffled_sentences) * (1 - test_split))
    train_sentences = shuffled_sentences[:split_idx]
    test_sentences = shuffled_sentences[split_idx:]

    return train_sentences, test_sentences


In [None]:
# Shuffle and split with the default test fraction of split_data (test_split=0.1).
train_sentences, test_sentences = split_data(sentences)

print(f"Number of training sentences: {len(train_sentences)}")
print(f"Number of test sentences: {len(test_sentences)}")

Number of training sentences: 45000
Number of test sentences: 5000


In [None]:
# Sanity check on the split sizes before building the vocabulary.
assert len(train_sentences) == 45000, "Expected 45,000 sentences for training."
assert len(test_sentences) == 5000, "Expected 5,000 sentences for testing."


In [None]:
# The vocabulary comes from the training split only; tokinize() maps any token
# outside it (e.g. words seen only in the test split) to its `unknown` token.
vocab = build_vocabulary(train_sentences)
tokenized_sentences = tokinize(train_sentences, vocab)
test_tokenized = tokinize(test_sentences, vocab)
print(f"Vocabulary size: {len(vocab)}")
# Show at most the first 200 tokens of the first training sentence.
print(f"Example tokens from first sentence: {tokenized_sentences[0][:200] if tokenized_sentences else 'No tokens loaded'} ...")


Vocabulary size: 161741
Example tokens from first sentence: ['i', 'saw', 'this', 'recently', 'after', 'seeing', 'terrance', 'davies', 'lovely', 'the', 'house', 'nl', 'nl', 'of', 'mirth', 'and', 'was', 'not', 'disappointed', 'actually', 'they', 'make', 'a', 'nice', 'little', 'nl', 'nl', 'edith', 'wharton', 'weekend', 'if', 'you', 'have', 'a', 'lot', 'of', 'time', 'on', 'your', 'hands', 'nl', 'nl', 'and', 'want', 'to', 'feel', 'so', 'good', 'about', 'your', 'own', 'life', 'and', 'choices', 'that', 'you', 'nl', 'nl', 'want', 'to', 'shoot', 'yourself', 'and', 'i', 'mean', 'that', 'in', 'a', 'good', 'way', 'this', 'movie', 'nl', 'nl', 'is', 'beautifully', 'crafted', 'and', 'features', 'winona', 'ryders', 'best', 'nl', 'nl', 'performance', 'since', 'little', 'women', 'daniel', 'day', 'lewis', 'has', 'the', 'nl', 'nl', 'thankless', 'edith', 'wharton', 'male', 'lead', 'role', 'she', 'wrote', 'best', 'for', 'nl', 'nl', 'women', 'and', 'michelle', 'pfeiffer', 'is', 'that', 'obscure', 'object', '

In [None]:
# The expected size is tied to this particular shuffled training split; the
# message now states the same number the assertion checks.
assert len(vocab) == 161741, "Expected a vocabulary size of 161,741."
assert len(tokenized_sentences) == 45000, "Expected tokenized sentences count to match raw sentences."

example = "I love Natural language processing, and i want to be a great engineer."
assert len(example) == 70, "Example sentence length (in characters) does not match the expected 70."

# Punctuation is removed before splitting, so the sentence yields 13 tokens.
example_tokens = tokinize([example], vocab)[0]
assert len(example_tokens) == 13, "Token count for the example sentence does not match the expected 13."


In [None]:
def pad_sentence(tokens, n):
    """
    Surround a token list with sentence-boundary markers for an n-gram model.

    Prepends `n - 1` copies of '<s>' and appends a single '</s>'. A new list is
    returned; `tokens` is not modified.
    """
    start_markers = ['<s>'] * (n - 1)
    end_marker = ['</s>']
    return start_markers + tokens + end_marker
def build_ngram_counts(tokenized_sentences, n):
    """
    Count n-grams and their (n-1)-token contexts over tokenized sentences.

    Each sentence is padded with `pad_sentence(sentence, n)` and scanned with a
    sliding window of width `n`. Every window is counted as an n-gram, and the
    window without its last token is counted as that n-gram's context.

    Returns:
        tuple[Counter, Counter]: (ngram_counts, context_counts), both keyed by
        tuples of tokens.
    """
    ngram_counts = Counter()
    context_counts = Counter()

    for tokens in tokenized_sentences:
        seq = pad_sentence(tokens, n)
        windows = [
            tuple(seq[start:start + n])
            for start in range(len(seq) - n + 1)
        ]
        ngram_counts.update(windows)
        # The context of a window is everything except its final token.
        context_counts.update(window[:-1] for window in windows)

    return ngram_counts, context_counts


def laplace_probability(ngram, ngram_counts, context_counts, vocab_size, alpha=1.0):
    """
    Additively smoothed probability of the last token of `ngram` given the rest.

    Computes (count(ngram) + alpha) / (count(context) + alpha * vocab_size),
    where the context is `ngram` without its final token. N-grams or contexts
    missing from the count mappings are treated as having count 0.
    """
    numerator = ngram_counts.get(ngram, 0) + alpha
    denominator = context_counts.get(ngram[:-1], 0) + alpha * vocab_size
    return numerator / denominator


In [None]:
# Order of the n-gram model used by the cells below.
n = 1
ngram_counts, context_counts = build_ngram_counts(tokenized_sentences, n=n)
# Label follows n instead of a fixed "bigrams", which was wrong for n = 1.
print(f"Number of {n}-grams: {len(ngram_counts)}")
print(f"Number of contexts: {len(context_counts)}")

Number of bigrams: 161742
Number of contexts: 1


In [None]:
import heapq
import random

def predict_next_token(context_tokens, ngram_counts, context_counts, vocab, n, alpha=1.0, top_k=5):
    """
    Sample the next token from the `top_k` most probable continuations.

    Every vocabulary token, plus the end-of-sentence marker '</s>', is scored
    with `laplace_probability` given the last `n - 1` tokens of
    `context_tokens`. The `top_k` best candidates are renormalised so their
    probabilities sum to 1, and one of them is drawn at random with those
    weights.

    Args:
        context_tokens: tokens generated so far.
        ngram_counts, context_counts: count mappings from `build_ngram_counts`
            built with the same `n`.
        vocab: collection of known tokens.
        n (int): order of the n-gram model.
        alpha (float): additive smoothing constant.
        top_k (int): number of best candidates to sample from.

    Returns:
        list[tuple[str, float]]: a one-element list holding the sampled token
        and its probability renormalised over the top-k candidates, or an empty
        list when there is nothing to sample from.
    """
    # For n == 1 the slice [-(n-1):] is [-0:], which is the WHOLE history rather
    # than an empty context, so the unigram case must be handled explicitly.
    context = tuple(context_tokens[-(n - 1):]) if n > 1 else ()

    vocab_size = len(vocab)
    candidate_tokens = list(vocab)
    # '</s>' is added by padding, not by the vocabulary; without it as a
    # candidate the model could never end a sentence.
    if "</s>" not in vocab:
        candidate_tokens.append("</s>")

    scored = (
        (token, laplace_probability(context + (token,), ngram_counts, context_counts, vocab_size, alpha))
        for token in candidate_tokens
    )
    # Equivalent to a full descending sort followed by [:top_k], without
    # sorting the entire vocabulary.
    top_candidates = heapq.nlargest(top_k, scored, key=lambda pair: pair[1])
    if not top_candidates:
        return []

    # Normalize probabilities over the retained candidates
    total_prob = sum(prob for _, prob in top_candidates)
    if total_prob <= 0:
        return []
    tokens = [token for token, _ in top_candidates]
    weights = [prob / total_prob for _, prob in top_candidates]

    # Perform weighted random sampling from top_k candidates
    next_token = random.choices(tokens, weights=weights, k=1)[0]

    return [(next_token, weights[tokens.index(next_token)])]
def generate_text_with_limit(start_tokens,ngram_counts,context_counts,vocab,n,alpha=1.0, max_length=20):
    """
    Extend `start_tokens` one sampled token at a time with an n-gram model.

    If fewer than `n - 1` start tokens are given, the sequence is left-padded
    with '<s>' so that a full context is available. Tokens are then appended
    via `predict_next_token` until the sequence (padding included) reaches
    `max_length`, '</s>' is produced, or no candidate is returned.

    Returns:
        list[str]: the (possibly left-padded) start tokens followed by the
        generated tokens.
    """
    generated = list(start_tokens)
    # Left-pad only: a start-of-text context must not contain '</s>', and only
    # the missing number of '<s>' markers is needed to reach n-1 tokens.
    missing = (n - 1) - len(generated)
    if missing > 0:
        generated = ['<s>'] * missing + generated
    while len(generated) < max_length:
        next_token_candidates = predict_next_token(generated, ngram_counts,context_counts,vocab,n=n, alpha=alpha,top_k=5)
        if not next_token_candidates:
            break
        next_token = next_token_candidates[0][0]
        generated.append(next_token)
        if next_token == "</s>":
            break
    return generated
context = ["i", "love"]
# Pass the same n the counts were built with, so changing n in one place keeps
# the generator and the count tables consistent.
generated_seq = generate_text_with_limit(start_tokens=context,ngram_counts=ngram_counts,context_counts=context_counts,vocab=vocab,n=n,alpha=1.0, max_length=128)
print("Generated Sequence:", generated_seq)

Generated Sequence: ['i', 'love', 'forementioned', 'fulfiling', 'economising', 'economising', 'economising', 'forementioned', 'economising', 'forementioned', 'comavision', 'comavision', 'forementioned', 'forementioned', 'forementioned', 'economising', 'fulfiling', 'littlemy', 'fulfiling', 'comavision', 'forementioned', 'forementioned', 'forementioned', 'economising', 'forementioned', 'fulfiling', 'forementioned', 'fulfiling', 'comavision', 'comavision', 'forementioned', 'littlemy', 'comavision', 'comavision', 'forementioned', 'economising', 'economising', 'comavision', 'economising', 'economising', 'economising', 'comavision', 'littlemy', 'comavision', 'economising', 'economising', 'comavision', 'forementioned', 'fulfiling', 'littlemy', 'forementioned', 'fulfiling', 'comavision', 'economising', 'fulfiling', 'fulfiling', 'littlemy', 'littlemy', 'economising', 'economising', 'fulfiling', 'comavision', 'littlemy', 'littlemy', 'forementioned', 'fulfiling', 'fulfiling', 'littlemy', 'comavis

In [None]:
def calculate_perplexity(tokenized_sentences, ngram_counts, context_counts, vocab_size, n, alpha=1.0):
    """
    Perplexity of an n-gram model over a collection of tokenized sentences.

    Each sentence is padded with `pad_sentence(sentence, n)`; every window of
    `n` consecutive tokens is scored with `laplace_probability`. The result is
    exp of the negative mean log-probability over all scored windows.
    """
    total_log_prob = 0.0
    scored_ngrams = 0
    for tokens in tokenized_sentences:
        seq = pad_sentence(tokens, n)
        windows = (tuple(seq[start:start + n]) for start in range(len(seq) - n + 1))
        for window in windows:
            window_prob = laplace_probability(window, ngram_counts, context_counts, vocab_size, alpha)
            total_log_prob += log(window_prob)
            scored_ngrams += 1
    return exp(-total_log_prob / scored_ngrams)
# Evaluate on the held-out split with the same n the counts were built with.
perplexity = calculate_perplexity(
    test_tokenized,
    ngram_counts,
    context_counts,
    vocab_size=len(vocab),
    n=n,
    alpha=1
)
print(f"Perplexity: {perplexity}")


Perplexity: 1205.8826669851508


In [None]:
from math import log, exp

def calculate_sentence_perplexity(sentence, ngram_counts, context_counts, vocab_size, n, alpha=1.0):
    """
    Calculate the perplexity for a single sentence using an n-gram model.

    This is the corpus-level computation applied to a one-sentence corpus, so
    it delegates to `calculate_perplexity` rather than repeating the padding,
    smoothing and averaging steps. The result is the same as scoring the
    padded sentence window by window.

    Args:
        sentence (list[str]): tokens of the sentence (unpadded).
        ngram_counts, context_counts: count mappings built with the same `n`.
        vocab_size (int): vocabulary size used for smoothing.
        n (int): order of the n-gram model.
        alpha (float): additive smoothing constant.

    Returns:
        float: perplexity of the sentence.
    """
    return calculate_perplexity([sentence], ngram_counts, context_counts, vocab_size, n, alpha)

# Define your test sentence
# Define your test sentence
test_sentence = ["i", "loved", "this", "movie"]

# Compute perplexity for the sentence, using the same n the counts were built with
sentence_perplexity = calculate_sentence_perplexity(
    test_sentence,
    ngram_counts,
    context_counts,
    vocab_size=len(vocab),
    n=n,
    alpha=1.0
)

print(f"Perplexity for sentence 'i loved this movie': {sentence_perplexity}")


Perplexity for sentence 'i loved this movie': 252.000291831992


# **Analysis on Test Set**
n=1 Perplexity: 1205.8826669851508

n=2 Perplexity: 3410.0383848243023

n=3 Perplexity: 37117.2128829916

n=4 Perplexity: 101534.96229645389

# Analysis on a Single Sentence ("i loved this movie")
n=1 Perplexity: 252.000291831992

n=2 Perplexity: 98.2766219854523

n=3 Perplexity: 425.8483425242998

n=4 Perplexity: 1864.242523029493