# UniMorph Dataset Generation for Probing

This notebook extracts morphological paradigms from UniMorph (English) and generates controlled template sentences. The final dataset contains metadata (lexeme, category, inflection label, dimension, etc.) so we can later filter or pair examples for our probing tasks.

In [None]:
import requests
import os

um_url = "https://raw.githubusercontent.com/unimorph/eng/master/eng"

# Check if UniMorph English data exists locally
file_path = "../data/eng.unimorph.tsv"
if os.path.exists(file_path):
    print(f"Using cached UniMorph data from {file_path}")
else:
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    
    # Download UniMorph English data from GitHub
    print(f"Downloading UniMorph data from {um_url}...")
    response = requests.get(um_url)
    
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(response.text)
        
    num_lines = len(response.text.strip().split('\n'))
    
    print(f"Downloaded UniMorph data with {num_lines} lines and saved to {file_path}")
    
um_lines = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("#"):
            continue
        um_lines.append(line.strip())

Using cached UniMorph data from data/eng.unimorph.tsv


In [2]:
# Check that everything looks good
print(f"First 5 lines of the dataset:")
for line in um_lines[:5]:
    print(line)
print()

print(f"Last 5 lines of the dataset:")
for line in um_lines[-5:]:
    print(line)
print()

# Number of unique lexemes in data
lexemes = set(line.split('\t')[0] for line in um_lines)
print(f"Number of unique lexemes: {len(lexemes)}")

# Print some basic statistics
categories = {}
for line in um_lines:
    parts = line.split('\t')
    if len(parts) == 3:
        features = parts[2]
        if "V;" in features:
            cat = "Verb"
        elif "N;" in features:
            cat = "Noun"
        elif "ADJ" in features:
            cat = "Adjective"
        else:
            cat = "Other"
        categories[cat] = categories.get(cat, 0) + 1

print("\nDistribution by category:")
for cat, count in categories.items():
    print(f"{cat}: {count} entries ({count/len(um_lines)*100:.1f}%)")


First 5 lines of the dataset:
microtome	microtomes	N;PL
microtome	microtomes	V;PRS;3;SG
microtome	microtoming	V;V.PTCP;PRS
microtome	microtomed	V;PST
microtome	microtomed	V;V.PTCP;PST

Last 5 lines of the dataset:
myriadaire	myriadaire	N;SG
dibridgehead	dibridgehead	N;SG
Chicagoese	Chicagoese	N;SG
Druzer	Druzer	N;SG
electrosensible	electrosensible	N;SG

Number of unique lexemes: 399574

Distribution by category:
Noun: 388561 entries (59.6%)
Verb: 127513 entries (19.5%)
Adjective: 136398 entries (20.9%)


Now, we build the paradim table. For each line (format: lexeme \t inflected_form \t features), we map lexemes to a dict of inflection labels.

In [3]:
def get_category_and_label_english(features):
    feats = features.split(";")
    if "V" in feats:
        category = "Verb"
        if "PRES" in feats:
            if "3" in feats and "SG" in feats:
                label = "3rd_pers"
            else:
                label = "base"
        elif "PST" in feats:
            label = "past"
        elif "ING" in feats or "PTCP" in feats:
            label = "present_participle"
        else:
            label = "base"
        dimension = "Tense/Aspect"
    elif "N" in feats:
        category = "Noun"
        label = "plural" if "PL" in feats else "singular"
        dimension = "Number"
    elif "ADJ" in feats:
        category = "Adjective"
        if "COMP" in feats:
            label = "comparative"
        elif "SUP" in feats:
            label = "superlative"
        else:
            label = "positive"
        dimension = "Degree"
    else:
        category, label, dimension = None, None, None
    return category, label, dimension

paradigms = {}
for line in um_lines:
    parts = line.strip().split("\t")
    if len(parts) != 3:
        continue
    lemma, form, feats = parts
    cat, lab, dim = get_category_and_label_english(feats)
    if cat is None:
        continue
    if lemma not in paradigms:
        paradigms[lemma] = {"category": cat, "dimension": dim, "forms": {}}
    if lab not in paradigms[lemma]["forms"]:
        paradigms[lemma]["forms"][lab] = []
    if form not in paradigms[lemma]["forms"][lab]:
        paradigms[lemma]["forms"][lab].append(form)

# Flag to filter lexemes with only one inflection
filter_multi = False  # Set to True to restrict to lexemes with >1 inflection

if filter_multi:
    paradigms = {lemma: info for lemma, info in paradigms.items() if len(info["forms"]) > 1}

print("Using", len(paradigms), "lexemes after filtering (filter_multi =", filter_multi, ")")

Using 399574 lexemes after filtering (filter_multi = False )


In [4]:
import random

random.seed(42)

# print some randomly sampled examples
for lexeme, data in random.sample(list(paradigms.items()), 3):
    print(f"\nLexeme: {lexeme}")
    print(f"Category: {data['category']}")
    print(f"Dimension: {data['dimension']}")
    for label, form in data["forms"].items():
        print(f"{label}: {form}")


Lexeme: telesurgery
Category: Noun
Dimension: Number
singular: ['telesurgery']

Lexeme: overbidder
Category: Noun
Dimension: Number
plural: ['overbidders']
singular: ['overbidder']

Lexeme: floud
Category: Noun
Dimension: Number
plural: ['flouds']
singular: ['floud']


For each lexeme in the paradigm table, generate sentences from fixed templates.

In [None]:
import pandas as pd

# Define the complete set of desired labels for each category.
desired_labels = {
    "Verb": ["base", "3rd_pers", "past", "present_participle"],
    "Noun": ["singular", "plural"],
    "Adjective": ["positive", "comparative", "superlative"]
}

# Define extremely generic (vague) templates.
templates = {
    "Verb": {
        "base": [
            "I {}."
        ],
        "3rd_pers": [
            "He {}."
        ],
        "past": [
            "I {} in the past."
        ],
        "present_participle": [
            "I am {}."
        ]
    },
    "Noun": {
        "singular": [
            "It is {}."
        ],
        "plural": [
            "They are {}."
        ]
    },
    "Adjective": {
        "positive": [
            "It is {}."
        ],
        "comparative": [
            "It is {} than before."
        ],
        "superlative": [
            "It is the {} one."
        ]
    }
}

def select_form_english(category, label, forms_list, lemma):
    # If the unimorph data provides the form for the label, choose it via heuristics;
    # otherwise, fall back to the lemma.
    if forms_list:
        if category == "Verb":
            if label == "base":
                for f in forms_list:
                    if not (f.endswith("s") or f.endswith("ing") or (f.endswith("ed") and f != forms_list[0])):
                        return f
                return forms_list[0]
            elif label == "3rd_pers":
                for f in forms_list:
                    if f.endswith("s"):
                        return f
                return forms_list[0]
            elif label == "past":
                for f in forms_list:
                    if f.endswith("ate"):
                        return f
                for f in forms_list:
                    if not f.endswith("ing") and not f.endswith("en"):
                        return f
                return forms_list[0]
            elif label == "present_participle":
                for f in forms_list:
                    if f.endswith("ing"):
                        return f
                return forms_list[0]
        elif category == "Noun":
            if label == "singular":
                for f in forms_list:
                    if not f.endswith("s"):
                        return f
                return forms_list[0]
            elif label == "plural":
                for f in forms_list:
                    if f.endswith("s"):
                        return f
                return forms_list[0]
        elif category == "Adjective":
            return forms_list[0]
    # Fallback: use the lemma.
    return lemma

controlled_examples = []
for lemma, info in paradigms.items():
    cat = info["category"]
    dim = info["dimension"]
    # For each desired label in the category, either use the available form or fall back to the lemma.
    for lab in desired_labels.get(cat, []):
        if lab in info["forms"]:
            forms_list = info["forms"][lab]
        else:
            forms_list = []
        selected_form = select_form_english(cat, lab, forms_list, lemma)
        for temp in templates[cat][lab]:
            sentence = temp.format(selected_form)
            tokens = sentence.split()
            try:
                target_index = tokens.index(selected_form)
            except ValueError:
                target_index = -1
            if target_index < 0:
                continue
            controlled_examples.append({
                "Sentence": sentence,
                "Target Index": target_index,
                "Lemma": lemma,
                "Category": cat,
                "Inflection Label": lab,
                "Word Form": selected_form,
                "Dimension": dim,
                "Source Type": "Template"
            })

df_controlled = pd.DataFrame(controlled_examples)
print("Generated", len(df_controlled), "controlled template sentences.")
df_controlled.head(10)

# save to CSV
output_file = "../data/controlled_sentences.csv"
df_controlled.to_csv(output_file, index=False)
print(f"Controlled sentences saved to {output_file}")

Generated 261330 controlled template sentences.
Controlled sentences saved to data/controlled_sentences.csv


In [6]:
# Spot check the data
df_controlled.sample(10)

Unnamed: 0,Sentence,Target Index,Lemma,Category,Inflection Label,Word Form,Dimension,Source Type
203761,It is seedborne than before.,2,seedborne,Adjective,comparative,seedborne,Degree,Template
51509,It is right-thinking than before.,2,right-thinking,Adjective,comparative,right-thinking,Degree,Template
186863,It is biophysiochemical than before.,2,biophysiochemical,Adjective,comparative,biophysiochemical,Degree,Template
18075,It is quartzy than before.,2,quartzy,Adjective,comparative,quartzy,Degree,Template
257898,I unstiffen in the past.,1,unstiffen,Verb,past,unstiffen,Tense/Aspect,Template
103227,It is inconstruable than before.,2,inconstruable,Adjective,comparative,inconstruable,Degree,Template
190648,It is the nonesoteric one.,3,nonesoteric,Adjective,superlative,nonesoteric,Degree,Template
181568,It is the paparazzied one.,3,paparazzied,Adjective,superlative,paparazzied,Degree,Template
199876,It is the physiopathogenic one.,3,physiopathogenic,Adjective,superlative,physiopathogenic,Degree,Template
219229,It is Chardinian than before.,2,Chardinian,Adjective,comparative,Chardinian,Degree,Template


In [7]:
import json

# print out the schema
schema = {
    "Sentence": "string",
    "Target Index": "integer",
    "Lemma": "string",
    "Category": "string",
    "Inflection Label": "string",
    "Word Form": "string",
    "Dimension": "string",
    "Source Type": "string"
}
print(json.dumps(schema, indent=2))

# example entry
example_entry = df_controlled.iloc[0].to_dict()
print(json.dumps(example_entry, indent=2))

{
  "Sentence": "string",
  "Target Index": "integer",
  "Lemma": "string",
  "Category": "string",
  "Inflection Label": "string",
  "Word Form": "string",
  "Dimension": "string",
  "Source Type": "string"
}
{
  "Sentence": "I ate in the past.",
  "Target Index": 1,
  "Lemma": "eat",
  "Category": "Verb",
  "Inflection Label": "past",
  "Word Form": "ate",
  "Dimension": "Tense/Aspect",
  "Source Type": "Template"
}


So the sentences we generate don't look very natural, probably because we use very naive templates. I've thought of a couple of ways to improves this (e.g. use a language model to generate sentences), but I think a better approach will be to grab context sentences of wordforms and their inflections from the WikiText dataset. The next few code cells do exactly this.

In [8]:
import re
from datasets import load_dataset
from tqdm import tqdm

# Frequency threshold: candidate words must appear at least this many times.
min_occurrences = 10
# Maximum sentences to store per candidate to avoid memory bloat.
max_sentences_per_candidate = 50

# Compile candidate entries from our paradigms (convert to lowercase for matching)
candidates = []
for lemma, info in paradigms.items():
    cat = info["category"]
    for lab, forms_list in info["forms"].items():
        for form in forms_list:
            candidates.append({
                "lemma": lemma,
                "category": cat,
                "inflection_label": lab,
                "word_form": form.lower()
            })

# Build a dictionary mapping candidate word (lower-case) to a tuple: (count, list_of_sentences)
candidate_sentences = { cand["word_form"]: [0, []] for cand in candidates }
candidate_set = set(candidate_sentences.keys())

In [9]:
wikitext = load_dataset("wikitext", "wikitext-103-v1", split="train", streaming=True)

# Process each WikiText entry with a progress bar.
for entry in tqdm(wikitext, desc="Processing WikiText entries"):
    text = entry["text"]
    if not text.strip():
        continue
    # Naively split text into sentences using period.
    sentences = text.split(".")
    for sentence in sentences:
        sent = sentence.strip()
        if not sent:
            continue
        # Tokenize sentence into words (convert tokens to lower case for matching)
        tokens = re.findall(r"\w+", sent.lower())
        token_set = set(tokens)
        common = candidate_set.intersection(token_set)
        if common:
            for cand in common:
                count, sents_list = candidate_sentences[cand]
                candidate_sentences[cand][0] = count + 1  # increment count
                # Only store up to max_sentences_per_candidate sentences.
                if len(sents_list) < max_sentences_per_candidate:
                    sents_list.append(sent)

# Filter out candidate word forms with fewer occurrences than the threshold.
filtered_candidates = {cand: (count, sents) for cand, (count, sents) in candidate_sentences.items() if count >= min_occurrences}
print("Found", len(filtered_candidates), "candidate word forms with at least", min_occurrences, "occurrences in WikiText.")

Processing WikiText entries: 1801350it [02:03, 14594.14it/s]

Found 55826 candidate word forms with at least 10 occurrences in WikiText.





In [None]:
import random

# Set the desired number of sentences per candidate.
target_examples = 50

# Filter candidates based on the threshold (min_occurrences) from before.
# 'filtered_candidates' was defined earlier as:
#   { cand: (count, sents) for cand, (count, sents) in candidate_sentences.items() if count >= min_occurrences }
# Here, we rebalance so that each candidate with at least target_examples sentences provides exactly target_examples samples.
balanced_candidates = {
    cand: (count, random.sample(sents, target_examples))
    for cand, (count, sents) in filtered_candidates.items()
    if len(sents) >= target_examples
}
print("After balancing, using", len(balanced_candidates), "candidate word forms (each with", target_examples, "sentences).")

# Build a mapping from candidate word (lowercase) to its metadata entries.
metadata_map = {}
for cand in candidates:
    key = cand["word_form"]
    if key not in metadata_map:
        metadata_map[key] = []
    metadata_map[key].append(cand)

# Assemble rows for the balanced natural context dataset.
rows = []
for cand, (count, sents) in balanced_candidates.items():
    # For each candidate word form, add each corresponding metadata entry.
    for meta in metadata_map.get(cand, []):
        for sentence in sents:
            # Determine the index of the candidate word (using case-insensitive matching).
            orig_tokens = sentence.split()
            idx = -1
            for i, token in enumerate(orig_tokens):
                if token.lower() == cand:
                    idx = i
                    break
            if idx >= 0:
                rows.append({
                    "Sentence": sentence,
                    "Target Index": idx,
                    "Lemma": meta["lemma"],
                    "Category": meta["category"],
                    "Inflection Label": meta["inflection_label"],
                    "Word Form": cand,
                    "Source Type": "NaturalWikiText"
                })

df_wikitext_sentences = pd.DataFrame(rows)
print("Wikitext sentences dataset has", len(df_wikitext_sentences), "rows.")
df_wikitext_sentences.head(10)

df_wikitext_sentences.to_csv("../data/wikitext_sentences.csv", index=False)

print("Probing input dataset saved as 'wikitext_sentences.csv'")

After balancing, using 30811 candidate word forms (each with 50 sentences).
Wikitext sentences dataset has 2276982 rows.
Probing input dataset saved as 'wikitext_sentences.csv'


In [None]:
# Combine and shufle these datasets
combined_df = pd.concat([df_controlled, df_wikitext_sentences], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
combined_df.to_csv("../data/combined_sentences.csv", index=False)
print("Combined dataset has", len(combined_df), "rows. Saved as 'combined_sentences.csv'.")

Combined dataset has 2538312 rows. Saved as 'combined_sentences.csv'.
