In [1]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import re
import random

# set seed for reproducibility
random.seed(42)

# Load 2% of the dataset: Can be increased for more diversity later
reddit = load_dataset("webis/tldr-17", trust_remote_code=True, split="train[:2%]")
corpus = [data["normalizedBody"] for data in reddit]
# shuffle the data
random.shuffle(corpus)

# Split into sentences and create a searchable structure
sentences = []
for text in corpus:
    # Split on ., !, ? followed by space and capitalize letter
    split_sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
    sentences.extend([s.strip() for s in split_sentences if len(s.strip()) > 0])

# Create chunks of 1000 sentences
chunk_size = 1000
sentence_chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

def find_matching_sentences(query_string, num_matches=3):
    matches = []
    for chunk in sentence_chunks:
        for sentence in chunk:
            if query_string.lower() in sentence.lower():
                matches.append(sentence)
                if len(matches) >= num_matches:
                    return matches[:num_matches]
    return matches

# Test the function
query = "machine learning"
matches = find_matching_sentences(query)
print(f"Found {len(matches)} sentences matching '{query}':")
for match in matches:
    print(f"\t•{match}")

Found 3 sentences matching 'machine learning':
	•There are hard (i.e., more interesting) problems to solve, and there can be a lot more math involved, especially if you are of the machine learning / artificial intelligence type.
	•Ironically however, if you could solve a problem like this, your best bet would be to have a machine learning algorithm try to figure it out.
	•Honestly I would like to see machine learning where your IFCS learns on the fly how to control the ship.


In [2]:
import json
from tqdm import tqdm

# load the n-gram datasets with the PMI scores
target = {}

for index in tqdm(range(2, 6)):
    with open(f"../data/corpus/reddit_10/{index}-gram_top_1000_pmi.json", "r", encoding="utf-8") as f:
        target[index] = json.load(f)

    # Find the sentences containing the n-grams from the corpus and add it to the dataset
    for ngram in target[index]:
        matches = find_matching_sentences(ngram)
        target[index][ngram]["sentences"] = matches

# Save the dataset
with open("../data/corpus/reddit_10/top_1000_ngrams_with_counts_pmi_sentences.json", "w", encoding="utf-8") as f:
    json.dump(target, f, indent=4)

print("Structure of the dataset:")
# show first two entries
for ngram in list(target.keys())[:2]:
    print(f"{ngram}-gram:")
    for key, value in list(target[ngram].items())[:2]:
        print(f"\t•{key}: {value}")
        if key == "sentences":
            print(f"\t\t•{value[:2]}")
    print()

100%|██████████| 4/4 [01:04<00:00, 16.05s/it]

Structure of the dataset:
2-gram:
	•of the: {'count': 295160, 'PMI': -0.83, 'sentences': ["You know, people go to all the trouble of writing and updating the FAQ, just so that people like you, won't make fools of themselves by making bold, confidant, assertions, without thinking it through, or doing even a little bit of research.", 'Further, to suggest that anti theism has anything to do with hating people demonstrates breath taking ignorance of the subject, or a malicious attempt to defame and inflame.', 'Now, even if you disagree with every single thing he said, surely you must realise that it would have been impossible for him to achieve the success he did as an author and public intellectual if he had tried to maintain a position of hating people because of their affiliations.']}
	•in the: {'count': 261647, 'PMI': 0.09, 'sentences': ["The land on the other side of the Eastern Peninsula is being developed, but it's far from the centre (and has two bottlenecks in the sea), so it's ea




In [3]:
from openai import OpenAI
from collections import Counter
import requests
from requests.exceptions import Timeout, RequestException

client = OpenAI(
    base_url='http://kt-gpu5.ijs.si:11435/v1',
    api_key='ollama',  # required, but unused
)

def create_chat_completions(model_name, message, timeout=100):
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": message},
            ],
            timeout=timeout
        )
        return response.choices[0].message.content
    except Timeout:
        return None
    except RequestException as e:
        return None

annotator_model = "gemma2:latest"
judge_model = "qwen2.5:72b"

# test the function
message = "What is machine learning?"
response = create_chat_completions(annotator_model, message)
print(f"The response by {annotator_model} to '{message}' is: {response}")
print(response)
message = f"Rate the quality of the response: {response}"
response = create_chat_completions(judge_model, message)
print(f"The response by {judge_model} to '{message}' is: {response}")

The response by gemma2:latest to 'What is machine learning?' is: Imagine teaching a dog a new trick. You show it what to do, reward it when it gets close, and correct it when it's wrong. Over time, the dog learns the trick through practice and feedback.

Machine learning is similar! It's a type of artificial intelligence where computers learn from data instead of being explicitly programmed. 

Here's a breakdown:

* **Data:**  Just like the dog needs examples, machine learning algorithms need lots of data to learn from. This data can be anything: images, text, numbers, sound recordings, etc.
* **Algorithm:** This is the set of rules the computer uses to process the data and find patterns. Think of it as the training plan for the algorithm.
* **Model:**  After learning from the data, the algorithm creates a "model" which can make predictions or decisions on new, unseen data.

**Here are some examples of machine learning in action:**

* **Recommendation systems:** Netflix suggesting movi

In [None]:
import json
from tqdm import tqdm

def annotate_and_score_mwes(dataset, results_file, annotator_model, judge_model):
    """
    Annotates and scores MWEs from the dataset using annotator and judge prompts.
    
    Parameters:
        dataset (dict): The dataset containing n-grams and their details.
        results_file (str): Path to the file where results should be written.
        annotator_message (str): Template for the annotator prompt.
        judge_message (str): Template for the judge prompt.
        annotator_model (str): Name of the model to use for annotating chat completions.
        judge_model (str): Name of the model to use for judging chat completions.
    """
    for ngram, data in tqdm(dataset.items()):
        for phrase, details in tqdm(data.items()):
            # Extract data for the prompt
            n_gram = phrase
            frequency = details.get("count", "N/A")
            pmi_score = details.get("PMI", "N/A")
            sentences = details.get("sentences", [])

            # Limit to three sentences for prompt context
            sentence_1 = sentences[0] if len(sentences) > 0 else "No example sentence available."
            sentence_2 = sentences[1] if len(sentences) > 1 else "No example sentence available."
            sentence_3 = sentences[2] if len(sentences) > 2 else "No example sentence available."

            annotator_message = f"""You are an expert linguist helping to identify multi-word expressions (MWEs) in a large corpus. A multi-word expression is a sequence of words that form a single unit of meaning and cannot be easily deduced by the meanings of individual words.

            Here is the information about a potential MWE:

            **Candidate Phrase:** "{n_gram}"  
            **PMI Score:** {pmi_score}  
            **Frequency in Corpus:** {frequency}

            ### Example Sentences:
            1. "{sentence_1}"
            2. "{sentence_2}"
            3. "{sentence_3}"

            **Questions:**
            1. Does the candidate phrase overlap in meaning or structure with any known MWEs? If so, which one(s)?
            2. Could this phrase be considered a new variation or extension of an existing MWE? Why?
            3. If it is novel, does it demonstrate properties of an MWE such as idiomaticity or collocational fixedness?
            4. How likely is it that this expression is becoming a trend in social media language? Rate this likelihood on a scale from 1-5.
            5. Based on the given information, would you classify this candidate as:
                - Novel MWE
                - A variation of an existing MWE
                - Not an MWE

            Explain your decision with examples and reasoning.
            """


            # Fill the annotator prompt
            annotator_prompt = annotator_message.format(
                n_gram=n_gram,
                PMI_score=pmi_score,
                frequency=frequency,
                sentence_1=sentence_1,
                sentence_2=sentence_2,
                sentence_3=sentence_3
            )

            # Get the annotator's response
            annotation = create_chat_completions(annotator_model, annotator_prompt)
            if annotation is None:
                annotation = "Error: Annotation not available."
            
            
            judge_message = f"""You are a judge evaluating the response of a linguist who has classified a multi-word expression (MWE) in a large corpus. Based on that, can you provide a label for the candidate phrase from the following options: ["Novel MWE", "Variation of an existing MWE", "not an MWE"]? Please provide only the label without any additional information."""

            # Fill the judge prompt
            judge_prompt = judge_message.format(n_gram=n_gram)
            
            # Get the judge's response
            judgment = create_chat_completions(judge_model, judge_prompt)
            if judgment is None:
                judgment = "Error: Judgment not available."

            # Prepare the result entry
            result_entry = {
                "n_gram": n_gram,
                "frequency": frequency,
                "PMI_score": pmi_score,
                "sentences": [sentence_1, sentence_2, sentence_3],
                "annotation": annotation,
                "judgment": judgment
            }

            # Write the result to the file
            with open(results_file, "a") as f:
                f.write(json.dumps(result_entry) + "\n")

# Annotate and score the MWEs
results_file = "../data/corpus/reddit_10/top_1000_ngrams_with_counts_pmi_sentences_results.json"

# test with a small subset of 10  n-grams per n = 2, 3, 4, 5
# target = {k: dict(list(v.items())[:10]) for k, v in target.items()}

# annotate_and_score_mwes
annotate_and_score_mwes(target, results_file, annotator_model, judge_model)

100%|██████████| 1000/1000 [1:19:40<00:00,  4.78s/it]
100%|██████████| 1000/1000 [1:19:42<00:00,  4.78s/it]
 50%|█████     | 2/4 [2:39:23<2:39:23, 4781.76s/it]

In [None]:
import json

# Load the results
results_file = "../data/corpus/reddit_10/top_1000_ngrams_with_counts_pmi_sentences_results.json"

# show number of novel MWEs found
with open(results_file, "r", encoding="utf-8") as f:
    results = [json.loads(line) for line in f]

# if results contain a "novel MWE" label string
novel_mwes = [result for result in results if "novel MWE" in result["judgment"]]
print(f"Number of novel MWEs found: {len(novel_mwes)}")
if len(novel_mwes) > 0:
    print(f"Example novel MWEs:")
    for result in novel_mwes:
        print(f"\t•{result['n_gram']}")
        print(f"\t\t•Annotation: {result['annotation']}")

# if results contain a "variation of an existing MWE" label string
variations = [result for result in results if "variation of an existing MWE" in result["judgment"]]
print(f"Number of variations found: {len(variations)}")
if len(variations) > 0:
    print(f"Example variations:")
    for result in variations:
        print(f"\t•{result['n_gram']}")
        print(f"\t\t•Annotation: {result['annotation']}")

# if results contain a "not an MWE" label string
not_mwes = [result for result in results if "not an MWE" in result["judgment"]]
print(f"Number of non-MWEs found: {len(not_mwes)}")
if len(not_mwes) > 0:
    print(f"Example non-MWEs:")
    for result in not_mwes:
        print(f"\t•{result['n_gram']}")
        print(f"\t\t•Annotation: {result['annotation']}")