In [1]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import re
import random

# set seed for reproducibility
random.seed(42)

# Load 2% of the dataset: Can be increased for more diversity later
reddit = load_dataset("webis/tldr-17", trust_remote_code=True, split="train[:2%]")
corpus = [data["normalizedBody"] for data in reddit]
# shuffle the data
random.shuffle(corpus)

# Split into sentences and create a searchable structure
sentences = []
for text in corpus:
    # Split on ., !, ? followed by space and capitalize letter
    split_sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    sentences.extend([s.strip() for s in split_sentences if len(s.strip()) > 0])

# Create chunks of 1000 sentences
chunk_size = 1000
sentence_chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

def find_matching_sentences(query_string, num_matches=3):
    matches = []
    for chunk in sentence_chunks:
        for sentence in chunk:
            if query_string.lower() in sentence.lower():
                matches.append(sentence)
                if len(matches) >= num_matches:
                    return matches[:num_matches]
    return matches

# Test the function
query = "machine learning"
matches = find_matching_sentences(query)
print(f"Found {len(matches)} sentences matching '{query}':")
for match in matches:
    print(f"\t•{match}")

Found 3 sentences matching 'machine learning':
	•There are hard (i.e., more interesting) problems to solve, and there can be a lot more math involved, especially if you are of the machine learning / artificial intelligence type.
	•Ironically however, if you could solve a problem like this, your best bet would be to have a machine learning algorithm try to figure it out.
	•Honestly I would like to see machine learning where your IFCS learns on the fly how to control the ship.


In [2]:
import json
from tqdm import tqdm

# load the n-gram datasets with the PMI scores
target = {}

for index in tqdm(range(3, 6)):
    with open(f"../data/corpus/reddit_10/{index}-gram_top_1000_pmi.json", "r", encoding="utf-8") as f:
        target[index] = json.load(f)

    # Find the sentences containing the n-grams from the corpus and add it to the dataset
    for ngram in target[index]:
        matches = find_matching_sentences(ngram)
        target[index][ngram]["sentences"] = matches

# Save the dataset
with open("../data/corpus/reddit_10/top_1000_ngrams_with_counts_pmi_sentences.json", "w", encoding="utf-8") as f:
    json.dump(target, f, indent=4)

print("Structure of the dataset:")
# show first two entries
for ngram in list(target.keys())[:2]:
    print(f"{ngram}-gram:")
    for key, value in list(target[ngram].items())[:2]:
        print(f"\t•{key}: {value}")
        if key == "sentences":
            print(f"\t\t•{value[:2]}")
    print()

100%|██████████| 3/3 [01:03<00:00, 21.29s/it]

Structure of the dataset:
3-gram:
	•a lot of: {'count': 49921, 'PMI': 13.69, 'sentences': ['They all have in common that Helsinki sucks a lot of work from the rest of Finland, meaning that the countryside is emptying and moving to Helsinki.', 'Personally having read them I feel a lot of it is a case of "that doesn\'t mean what you think it means".', 'A very good documentary called "because the bible tells me so" has a lot of wonderful references to explain these verses better.']}
	•one of the: {'count': 27098, 'PMI': 1.53, 'sentences': ['By then I had been promoted to Shireikan and was one of the handful remaining senior officers of the Akodo.', 'The shitty parts came from the fact that we attended one of the wealthiest school districts in the country, so our peers and their parents were huge snobs and didn\'t want to be around "that kind of people". (What kind?', 'If you want general file storage then buy that service (or use one of the many free services designed for that purpose) \n




In [3]:
from openai import OpenAI
from collections import Counter
import requests
from requests.exceptions import Timeout, RequestException

client = OpenAI(
    base_url='http://kt-gpu5.ijs.si:11435/v1',
    api_key='ollama',  # required, but unused
)

def create_chat_completions(model_name, message, timeout=100):
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": message},
            ],
            timeout=timeout
        )
        return response.choices[0].message.content
    except Timeout:
        return None
    except RequestException as e:
        return None

annotator_model = "gemma2:latest"
judge_model = "qwen2.5:72b"

# test the function
message = "What is machine learning?"
response = create_chat_completions(annotator_model, message)
print(f"The response by {annotator_model} to '{message}' is: {response}")
print(response)
message = f"Rate the quality of the response: {response}"
response = create_chat_completions(judge_model, message)
print(f"The response by {judge_model} to '{message}' is: {response}")

The response by gemma2:latest to 'What is machine learning?' is: Imagine teaching a computer to learn without explicitly programming every single rule. That's essentially what machine learning is! 

It's a type of artificial intelligence where algorithms analyze data, identify patterns, and use those patterns to make predictions or decisions. Instead of being told exactly what to do, the algorithm learns from the data itself.

Here's a simple analogy: Think of a child learning to recognize cats. You wouldn't write down every single feature of a cat (fur, whiskers, pointy ears, etc.). Instead, you'd show them pictures of cats and tell them "This is a cat." Over time, the child learns to identify cats on their own based on the patterns they see.

Machine learning works similarly. We feed algorithms massive amounts of data, and they learn to recognize patterns and make predictions.

**Here are some key things to remember about machine learning:**

* **Data-driven:** Machine learning relie

In [4]:
import json
from tqdm import tqdm

def annotate_and_score_mwes(dataset, results_file, annotator_model, judge_model):
    """
    Annotates and scores MWEs from the dataset using annotator and judge prompts.
    
    Parameters:
        dataset (dict): The dataset containing n-grams and their details.
        results_file (str): Path to the file where results should be written.
        annotator_message (str): Template for the annotator prompt.
        judge_message (str): Template for the judge prompt.
        annotator_model (str): Name of the model to use for annotating chat completions.
        judge_model (str): Name of the model to use for judging chat completions.
    """
    for ngram, data in tqdm(dataset.items()):
        for phrase, details in tqdm(data.items()):
            # Extract data for the prompt
            n_gram = phrase
            frequency = details.get("count", "N/A")
            pmi_score = details.get("PMI", "N/A")
            sentences = details.get("sentences", [])

            # Limit to three sentences for prompt context
            sentence_1 = sentences[0] if len(sentences) > 0 else "No example sentence available."
            sentence_2 = sentences[1] if len(sentences) > 1 else "No example sentence available."
            sentence_3 = sentences[2] if len(sentences) > 2 else "No example sentence available."

            annotator_message = f"""
            You are an expert linguist helping to identify multi-word expressions (MWEs) in a large corpus. A multi-word expression is a sequence of words that form a single unit of meaning and cannot be easily deduced by the meanings of individual words.

            **Significance of Pointwise Mutual Information (PMI):**
            PMI is a statistical measure that quantifies how strongly two words are associated beyond what would be expected by random chance. A higher PMI score indicates that the words in the expression co-occur more frequently than expected, suggesting a meaningful relationship. PMI is particularly useful for identifying candidate MWEs as it captures statistical collocations that might signify idiomatic or fixed expressions.

            Here is the information about a potential MWE:

            - **Candidate Expression (n-gram):** "{n_gram}"
            - **Frequency in Corpus:** {frequency}
            - **Pointwise Mutual Information (PMI):** {pmi_score}

            ### Example Sentences:
            1. "{sentence_1}"
            2. "{sentence_2}"
            3. "{sentence_3}"

            **Questions:**
            1. Does this expression seem like a coherent unit of meaning in the context of the provided sentences? Explain why or why not.
            2. Does replacing this n-gram with a single word or phrase preserve the overall meaning of the sentence? If so, suggest a possible replacement.
            3. Does this expression appear to be idiomatic, collocational, or otherwise non-compositional? Provide specific reasons for your assessment.
            4. Based on the above, would you classify this n-gram as a multi-word expression? (Yes/No). Justify your answer.
            5. On a scale of 1-5, how confident are you in this classification? (1 = Not confident, 5 = Very confident). Provide reasoning for your confidence score.

            Provide a detailed explanation for your answers, referencing the statistical significance (PMI score), frequency, and contextual usage in the examples.
            """


            # Fill the annotator prompt
            annotator_prompt = annotator_message.format(
                n_gram=n_gram,
                PMI_score=pmi_score,
                frequency=frequency,
                sentence_1=sentence_1,
                sentence_2=sentence_2,
                sentence_3=sentence_3
            )

            # Get the annotator's response
            annotation = create_chat_completions(annotator_model, annotator_prompt)
            if annotation is None:
                annotation = "Error: Annotation not available."
            
            
            judge_message = f"""You are a judge evaluating the response of a linguist who has classified a multi-word expression (MWE) in a large corpus. Based on that, can you provide a label for the candidate phrase from the following options: "Novel MWE", "Variation of an existing MWE", "not an MWE"? Please provide only the label without any additional information."""

            # Fill the judge prompt
            judge_prompt = judge_message.format(n_gram=n_gram)
            
            # Get the judge's response
            judgement = create_chat_completions(judge_model, judge_prompt)
            if judgement is None:
                judgement = "Error: Judgment not available."

            # Prepare the result entry
            result_entry = {
                "n_gram": n_gram,
                "frequency": frequency,
                "PMI_score": pmi_score,
                "sentences": [sentence_1, sentence_2, sentence_3],
                "annotation": annotation,
                "judgement": judgement
            }

            # Write the result to the file
            with open(results_file, "a") as f:
                f.write(json.dumps(result_entry) + "\n")

# Annotate and score the MWEs
results_file = "../data/corpus/reddit_10/top_1000_ngrams_with_counts_pmi_sentences_results.json"

# test with a small subset of 10  n-grams per n = 2, 3, 4, 5
# target = {k: dict(list(v.items())[:10]) for k, v in target.items()}

# annotate_and_score_mwes
annotate_and_score_mwes(target, results_file, annotator_model, judge_model)

100%|██████████| 1000/1000 [1:28:05<00:00,  5.29s/it]
100%|██████████| 1000/1000 [1:25:46<00:00,  5.15s/it]
100%|██████████| 1000/1000 [1:24:23<00:00,  5.06s/it]
100%|██████████| 3/3 [4:18:16<00:00, 5165.36s/it]  


In [27]:
def merge_ngrams(results):
    from collections import defaultdict

    def is_subsequence(sub, main):
        """Check if `sub` is a subsequence of `main`."""
        sub_len, main_len = len(sub), len(main)
        for i in range(main_len - sub_len + 1):
            if main[i:i + sub_len] == sub:
                return True
        return False

    # Group results by unique n-grams.
    merged_results = []
    merged_map = defaultdict(list)

    for result in results:
        n_gram = result["n_gram"].split()
        found = False

        # Check if the n-gram can be merged with an existing group.
        for existing_n_gram, group in list(merged_map.items()):
            if is_subsequence(n_gram, existing_n_gram) or is_subsequence(existing_n_gram, n_gram):
                # Merge the two groups.
                merged_map[existing_n_gram].append(result)
                found = True
                break

        if not found:
            # Add a new group for this n-gram.
            merged_map[tuple(n_gram)].append(result)

    # Consolidate the merged results.
    for n_grams, group in merged_map.items():
        merged_result = {
            "n_gram": " ".join(n_grams),
            "frequency": sum(item["frequency"] for item in group),
            "PMI_score": sum(item["PMI_score"] for item in group) / len(group),
            "sentences": list(set(sentence for item in group for sentence in item["sentences"])),
            "annotation": list(set(item["annotation"] for item in group)),
            "judgement": list(set(item["judgement"] for item in group))
        }
        merged_results.append(merged_result)

    return merged_results

# Example Usage:
results = [
    {
        "n_gram": "the shit out of",
        "frequency": 5,
        "PMI_score": 0.8,
        "sentences": ["sentence_1", "sentence_2"],
        "annotation": "example",
        "judgement": "positive"
    },
    {
        "n_gram": "shit out of",
        "frequency": 3,
        "PMI_score": 0.6,
        "sentences": ["sentence_3"],
        "annotation": "example",
        "judgement": "neutral"
    },
    {
        "n_gram": "completely unexpected",
        "frequency": 2,
        "PMI_score": 0.9,
        "sentences": ["sentence_4"],
        "annotation": "example",
        "judgement": "positive"
    }
]

merged_results = merge_ngrams(results)
print(merged_results)

[{'n_gram': 'the shit out of', 'frequency': 5, 'PMI_score': 0.8, 'sentences': ['sentence_1', 'sentence_2'], 'annotation': ['example'], 'judgement': ['positive']}, {'n_gram': 'shit out of', 'frequency': 3, 'PMI_score': 0.6, 'sentences': ['sentence_3'], 'annotation': ['example'], 'judgement': ['neutral']}, {'n_gram': 'completely unexpected', 'frequency': 2, 'PMI_score': 0.9, 'sentences': ['sentence_4'], 'annotation': ['example'], 'judgement': ['positive']}]


In [29]:
import json

# Load the results
results_file = "../data/corpus/reddit_10/top_1000_ngrams_with_counts_pmi_sentences_results.json"

# Load the results
with open(results_file, "r", encoding="utf-8") as f:
    results = [json.loads(line) for line in f]

# Standardize the judgment key spelling
for result in results:
    # Check for different spellings and standardize to "judgement"
    if "judgment" in result:
        result["judgement"] = result.pop("judgment")
    elif "jugdement" in result:
        result["judgement"] = result.pop("jugdement")

# unique judgements
judgements = set(result["judgement"] for result in results)
print(f"Unique judgements: {judgements}")

print(f"Number of results: {len(results)}")

# {'"not an MWE"', '["Novel MWE"]', 'not an MWE', 'Novel MWE', 'Variation of an existing MWE', '"Novel MWE"', '["not an MWE"]', '"Variation of an existing MWE"', 'novel MWE', 'Not an MWE', '["Not an MWE"]'} -> 3 unique judgements = ["Novel MWE", "Variation of an existing MWE", "not an MWE"]
# Standardize the judgement labels
judgement_mapping = {
    "novel MWE": "Novel MWE",
    "Novel MWE": "Novel MWE",
    "Variation of an existing MWE": "Variation of an existing MWE",
    "not an MWE": "not an MWE",
    "Not an MWE": "not an MWE",
    "Not an MWE": "not an MWE",
    "['Novel MWE']": "Novel MWE",
    "['Variation of an existing MWE']": "Variation of an existing MWE",
    "['not an MWE']": "not an MWE"
}

for result in results:
    if result["judgement"] in judgement_mapping:
        result["judgement"] = judgement_mapping[result["judgement"]]

# sort the results by high PMI scores
results = sorted(results, key=lambda x: x["PMI_score"], reverse=True)

# Now we can safely access "judgement"
novel_mwes = [result for result in results if "judgement" in result and "Novel MWE" in result["judgement"]]
variations = [result for result in results if "judgement" in result and "Variation of an existing MWE" in result["judgement"]]
not_mwes = [result for result in results if "judgement" in result and "not an MWE" in result["judgement"]]

# shorten the lists so that if sequence abc is novel, then xabc or xyabc is considered only once
novel_mwes = merge_ngrams(novel_mwes)
variations = merge_ngrams(variations)
not_mwes = merge_ngrams(not_mwes)

print(f"Number of novel MWEs found: {len(novel_mwes)}")
if len(novel_mwes) > 0:
    print(f"Example novel MWEs:")
    for result in novel_mwes[:5]:  # Show first 5 examples
        print(f"\t•{result['n_gram']}")
        print(f"\t\t•Frequency: {result['frequency']}")
        print(f"\t\t•PMI Score: {result['PMI_score']}")
        print(f"\t\t•Sentences:")
        for sentence in result['sentences']:
            print(f"\t\t\t•{sentence}")
        # print(f"\t\t•Annotation: {result['annotation']}")

print(f"{'*'*80}")

print(f"\nNumber of variations found: {len(variations)}")
if len(variations) > 0:
    print(f"Example variations:")
    for result in variations[:5]:  # Show first 5 examples
        print(f"\t•{result['n_gram']}")
        print(f"\t\t•Frequency: {result['frequency']}")
        print(f"\t\t•PMI Score: {result['PMI_score']}")
        print(f"\t\t•Sentences:")
        for sentence in result['sentences']:
            print(f"\t\t\t•{sentence}")
        # print(f"\t\t•Annotation: {result['annotation']}")

print(f"{'*'*80}")

print(f"\nNumber of non-MWEs found: {len(not_mwes)}")
if len(not_mwes) > 0:
    print(f"Example non-MWEs:")
    for result in not_mwes[:5]:  # Show first 5 examples
        print(f"\t•{result['n_gram']}")
        # print(f"\t\t•Annotation: {result['annotation']}")

Unique judgements: {'"not an MWE"', '["Novel MWE"]', 'not an MWE', 'Novel MWE', 'Variation of an existing MWE', '"Novel MWE"', '["not an MWE"]', '"Variation of an existing MWE"', 'novel MWE', 'Not an MWE', '["Not an MWE"]'}
Number of results: 5605
Number of novel MWEs found: 53
Example novel MWEs:
	•im not saying
		•Frequency: 2718
		•PMI Score: 18.42
		•Sentences:
			•Im not saying they dont need advocacy.
			•Now Im not saying that it's a bad thing that people are putting their health and future well being (possibly) on the line for the possibility of making millions of dollars.
			•But a cliffhanger people seem to leave out is that most of his rank1 was archieved by duo-Q recently a good example would be his r1 at the moment (Apdo dog2) Duo-Q a shitton with Rekkles and WhiteKnight108 (Another pretty sick soloq guy) 
 Im not saying he wouldnt archieve rank1 without, but it certainly helped.
	•shit out of
		•Frequency: 2055
		•PMI Score: 18.01
		•Sentences:
			•Beat the shit out of me