# Natural Language Processing 
### Exercise III
` Mauricio Manuel F. Bergancia & Mherlie Joy U. Chavez `

In [1]:
# Accessing Wikipedia

from nltk import bigrams, trigrams
from nltk.tokenize import word_tokenize

from collections import Counter

import wikipedia
import re
import math

In [2]:
# Accessing Wikipedia topic

topic = "lgbt"

# Get the content of a Wikipedia page

page = wikipedia.page(topic)
text = page.content[:1000]

# Remove punctuation from the text
text = re.sub(r'[^\w\s]', '', text).lower()  # Remove punctuation and make lowercase

# Tokenization
tokens = word_tokenize(text)

# Print the cleaned content of the page
print(text)

lgbtq also commonly seen as lgbt lgbt lgbtq lgbtqia and lgbtqia is an initialism for lesbian gay bisexual transgender and queer or questioning it is an umbrella term originating in the united states broadly referring to all sexualities romantic orientations sex characteristics and gender identities that are not heterosexual heteroromantic cisgender or endosex
in the 1990s gay lesbian and bisexual activists adopted the initialism lgb terminology eventually shifted to lgbt as transgender people gained recognition around that time some activists began to reclaim the term queer seeing it as a more radical and inclusive umbrella term though others reject it due to its history as a pejorative in recognition of this the 2010s saw the adoption of lgbtq and other more inclusive variants
some versions of the term such as lgbt and lgbtq add a plus sign to represent additional identities not captured within the initialism many further variants exist which a


In [3]:
# Bigram Probability Models with Laplace Smoothing

def bigram_probabilities(tokens, alpha=1):
    bigram_counts = Counter(bigrams(tokens))
    unigram_counts = Counter(tokens)
    
    vocab_size = len(unigram_counts)  # Vocabulary size for smoothing
    bigram_probs = {
        bigram: (count + alpha) / (unigram_counts[bigram[0]] + alpha * vocab_size)
        for bigram, count in bigram_counts.items()
    }
    return bigram_probs

In [4]:
# Trigram Probability Models with Laplace Smoothing

def trigram_probabilities(tokens, alpha=1):
    trigram_counts = Counter(trigrams(tokens))
    bigram_counts = Counter(zip(tokens, tokens[1:]))
    
    vocab_size = len(bigram_counts)  # Vocabulary size for smoothing
    trigram_probs = {
        trigram: (count + alpha) / (bigram_counts[(trigram[0], trigram[1])] + alpha * vocab_size)
        for trigram, count in trigram_counts.items()
    }
    return trigram_probs

In [5]:
# Compute probability distributions

wiki_bigram = bigram_probabilities(tokens)
wiki_trigram = trigram_probabilities(tokens)

In [6]:
# Bigram Perplexity Computation using Log Probabilities

def calculate_bigram_perplexity(bigram_probs, test_sentence, alpha=1):
    test_tokens = word_tokenize(test_sentence.lower())
    test_bigrams = list(bigrams(test_tokens))
    
    N = len(test_bigrams)
    log_prob_sum = 0
    vocab_size = len(set(test_tokens))  # Use test set vocab size for smoothing
    
    for bigram in test_bigrams:
        prob = bigram_probs.get(bigram, alpha / (1 + alpha * vocab_size))  # Laplace smoothing for unknown bigrams
        log_prob_sum += math.log2(prob)
    
    return math.pow(2, -log_prob_sum / N) if N > 0 else float('inf')  # Avoid division by zero

In [7]:
# Trigram Perplexity Computation using Log Probabilities

def calculate_trigram_perplexity(trigram_probs, test_sentence, alpha=1):
    test_tokens = word_tokenize(test_sentence.lower())
    test_trigrams = list(trigrams(test_tokens))
    
    N = len(test_trigrams)
    log_prob_sum = 0
    vocab_size = len(set(test_tokens))
    
    for trigram in test_trigrams:
        prob = trigram_probs.get(trigram, alpha / (1 + alpha * vocab_size))  # Laplace smoothing
        log_prob_sum += math.log2(prob)
    
    return math.pow(2, -log_prob_sum / N) if N > 0 else float('inf')  # Avoid division by zero


In [8]:
# Test sentence

test_sentence = "Gender equality means that all people, regardless of their gender, have the same rights, responsibilities, and opportunities."

In [9]:
# Compute perplexities

bigram_perplexity_score = calculate_bigram_perplexity(wiki_bigram, test_sentence)
trigram_perplexity_score = calculate_trigram_perplexity(wiki_trigram, test_sentence)

In [10]:
# Print results

print("Bigram Model Perplexity ->", bigram_perplexity_score)
print("Trigram Model Perplexity ->", trigram_perplexity_score)

Bigram Model Perplexity -> 18.999999999999996
Trigram Model Perplexity -> 18.999999999999996
