In [1]:
import json

# Read the corpus explicitly as UTF-8: without an encoding argument open() falls
# back to a locale-dependent default, which can mis-decode non-ASCII characters
# in the JSON text on some platforms.
with open("data/corpus/dummy_corpus.json", encoding="utf-8") as f:
    dataset = json.load(f)

print(f"Number of documents: {len(dataset)}")
print("Example")
# Bare last expression so the notebook renders the entry at index 12.
dataset[12]

Number of documents: 13
Example


{'id': '013', 'text': "ELI5: What's quiet quitting?", 'source': 'Reddit'}

In [2]:
import itertools

"""
Extracts n-grams from the text of each entry in the dataset with the following format:
{
    "id": "some_id",
    "text": "some text",
    "sentence n-gram pairs": [
        ["sentence1", ["ngram1", "ngram2", ...]],
        ["sentence2", ["ngram3", "ngram4", ...]],
        ...
    ],
    "source": "some source"
}
"""

def n_grams(dataset, n_low=2, n_high=5):
    """Attach whitespace-token n-grams to every entry of ``dataset``.

    The "text" of each entry is treated as a single sentence, split on
    whitespace, and expanded into every contiguous n-gram whose length lies
    in ``n_low..n_high`` (inclusive). N-grams are ordered by increasing
    length and, within one length, by position in the sentence.

    Args:
        dataset: Iterable of dicts providing the keys "id", "text" and "source".
        n_low: Smallest n-gram length to extract (inclusive).
        n_high: Largest n-gram length to extract (inclusive).

    Returns:
        A list with one dict per entry holding "id", "text",
        "sentence n-gram pairs" (a list of ``[sentence, [ngram, ...]]`` pairs)
        and "source".
    """
    def windows(tokens, size):
        # Lengths below 1 produce no n-grams at all.
        if size < 1:
            return []
        # A size larger than the sentence gives an empty range, hence no n-grams.
        last_start = len(tokens) - size
        return [" ".join(tokens[start:start + size]) for start in range(last_start + 1)]

    results = []
    for record in dataset:
        # The whole text counts as one sentence, so each entry yields exactly one pair.
        sentence = record["text"]
        tokens = sentence.split()

        grams = []
        for size in range(n_low, n_high + 1):
            grams += windows(tokens, size)

        results.append(
            {
                "id": record["id"],
                "text": record["text"],
                "sentence n-gram pairs": [[sentence, grams]],
                "source": record["source"],
            }
        )

    return results

# Example usage: extract all 2- to 5-word n-grams for every entry in `dataset`,
# then display the processed entry at index 12 as the cell output.
extracted_n_grams = n_grams(dataset, n_low=2, n_high=5)
extracted_n_grams[12]

{'id': '013',
 'text': "ELI5: What's quiet quitting?",
 'sentence n-gram pairs': [["ELI5: What's quiet quitting?",
   ["ELI5: What's",
    "What's quiet",
    'quiet quitting?',
    "ELI5: What's quiet",
    "What's quiet quitting?",
    "ELI5: What's quiet quitting?"]]],
 'source': 'Reddit'}

In [21]:
"""Given the list of extracted n-grams, create a dictionary with the following structure:
{
    "n-gram": {
        "count": <number of occurrences>,
        "documents": [<list of document IDs where the n-gram occurs>],
        "sentences": [<list of sentences where the n-gram occurs>]
    }
}
"""
def ngram_occurrences(extracted_n_grams):
    """Index every n-gram by how often and where it occurs.

    Args:
        extracted_n_grams: Iterable of dicts that each provide an "id" and a
            "sentence n-gram pairs" list of ``[sentence, [ngram, ...]]`` pairs
            (the structure returned by ``n_grams``).

    Returns:
        A dict mapping each n-gram to a dict with:
            "count": total number of occurrences,
            "documents": the document id of each occurrence, in encounter order,
            "sentences": the sentence of each occurrence, in encounter order.
        "documents" and "sentences" hold one element per occurrence, so an
        n-gram repeated inside one sentence is listed once per repetition.
    """
    ngram_dict = {}

    for entry in extracted_n_grams:
        # Record the sentence the n-gram was extracted from rather than the whole
        # entry text, so the "sentences" list stays accurate when an entry
        # carries more than one sentence.
        for sentence, ngrams in entry["sentence n-gram pairs"]:
            for ngram in ngrams:
                stats = ngram_dict.setdefault(
                    ngram, {"count": 0, "documents": [], "sentences": []}
                )
                stats["count"] += 1
                stats["documents"].append(entry["id"])
                stats["sentences"].append(sentence)

    return ngram_dict

# Example usage: index every extracted n-gram, then report the most frequent one.
ngram_dict = ngram_occurrences(extracted_n_grams)
print(f"Number of unique n-grams: {len(ngram_dict)}")
print("Example of the n-gram dictionary:")
# Scan the (n-gram, stats) pairs by count; on ties max() keeps the first pair
# in the dict's iteration order.
most_frequent = max(ngram_dict.items(), key=lambda pair: pair[1]["count"])[0]
print(f"'{most_frequent}':\n\t {ngram_dict[most_frequent]}")

Number of unique n-grams: 827
Example of the n-gram dictionary:
'quiet quitting':
	 {'count': 4, 'documents': ['002', '003', '004', '007'], 'sentences': ['I never realized I was quiet quitting until someone explained it. It’s crazy how normalized overworking has become.', 'Is quiet quitting really such a bad thing? Doing your job without overextending yourself seems pretty reasonable to me.', 'All this debate about quiet quitting makes me wonder: why is it revolutionary to just do what you’re paid to do?', 'The rise of quiet quitting is just another sign that people are finally valuing their time and energy. It’s about time.']}


In [24]:
from collections import defaultdict
import math

def compute_pmi(corpus, external_ngrams, n):
    """Compute base-2 pointwise mutual information for n-grams of length ``n``.

    For an n-gram ``w1 ... wn`` the score is
    ``log2(P(w1 ... wn) / (P(w1) * ... * P(wn)))``, where ``P(w1 ... wn)`` is
    the n-gram's count divided by the number of length-``n`` windows in the
    corpus and ``P(wi)`` is the word's count divided by the total number of
    words. Lines are tokenized by whitespace and n-grams never span two lines.

    Args:
        corpus: Iterable of text lines (strings).
        external_ngrams: Iterable of space-separated n-gram strings to score.
        n: Number of words every n-gram in ``external_ngrams`` must have.

    Returns:
        A dict mapping each n-gram string to its PMI. The score is
        ``float('-inf')`` when the n-gram or one of its words does not occur
        in the corpus, which includes the case of an empty corpus.

    Raises:
        ValueError: If an external n-gram does not consist of exactly ``n`` words.
    """
    # Step 1: count unigrams and length-n windows.
    unigram_counts = defaultdict(int)
    ngram_counts = defaultdict(int)
    total_unigrams = 0
    total_ngrams = 0

    for line in corpus:
        words = line.strip().split()
        total_unigrams += len(words)
        for word in words:
            unigram_counts[word] += 1
        # Only tuples of exactly n words are looked up later, so shorter
        # orders are not counted. n < 1 yields no windows at all.
        if n >= 1:
            for start in range(len(words) - n + 1):
                ngram_counts[tuple(words[start:start + n])] += 1
                total_ngrams += 1

    # Step 2: score the requested n-grams.
    pmi_scores = {}
    for ngram in external_ngrams:
        ngram_tuple = tuple(ngram.split())
        if len(ngram_tuple) != n:
            raise ValueError(f"External n-gram {ngram} does not match specified n ({n}).")

        # .get() avoids inserting unseen keys into the defaultdicts.
        p_ngram = ngram_counts.get(ngram_tuple, 0) / total_ngrams if total_ngrams > 0 else 0

        # Guard the empty-corpus case, which would otherwise divide by zero.
        if total_unigrams > 0:
            p_individuals = math.prod(
                [unigram_counts.get(word, 0) / total_unigrams for word in ngram_tuple]
            )
        else:
            p_individuals = 0

        if p_ngram > 0 and p_individuals > 0:
            pmi_scores[ngram] = math.log(p_ngram / p_individuals, 2)
        else:
            pmi_scores[ngram] = float('-inf')  # log(0) case

    return pmi_scores

# Example usage
corpus = [
    "this is a test corpus",
    "this corpus is only a test",
    "n-grams are useful for text processing"
]
external_ngrams = ["this is", "corpus is only", "text processing"]

# compute_pmi scores one n-gram length per call, so each n-gram is scored on
# its own with `n` set to its word count.
for ngram in external_ngrams:
    n = len(ngram.split())  # Dynamically adjust `n`
    pmi_scores = compute_pmi(corpus, [ngram], n)
    # Use a distinct name here so the outer loop variable is not rebound.
    for scored_ngram, pmi in pmi_scores.items():
        print(f"PMI({scored_ngram}) = {pmi}")

PMI(this is) = 2.367570760443075
PMI(corpus is only) = 6.802956905113721
PMI(text processing) = 4.3675707604430745
