# Portuguese Dataset Analysis
This notebook swaps columns 2 and 3 of the original file, groups the lines by lemma, creates training, development, and test splits that share no lemmas, and calculates the AR/ER/IR/OR ending proportions for the original file and for each of the three splits.

In [17]:
# Step 1: Swap columns 2 and 3 (lemma, form, msd -> lemma, msd, form)
input_file = 'por_original'
output_file = 'por'
with open(input_file, 'r', encoding='utf8') as fin:
    with open(output_file, 'w', encoding='utf8') as fout:
        for raw in fin:
            columns = raw.strip().split('\t')
            # Rows that do not have exactly three tab-separated columns are dropped.
            if len(columns) != 3:
                continue
            lemma, form, msd = columns
            fout.write(f'{lemma}\t{msd}\t{form}\n')
print('Inverted columns and saved to por')

Inverted columns and saved to por


In [28]:
# Step 2: Lemma-based stratified splits by verb ending, maintain proportions, no lemma overlap, exact line counts
import random
from collections import defaultdict
import math

# Paths and target sizes (in lines) for the three splits.
input_file = "por"
train_file = "../por.trn"
dev_file = "../por.dev"
test_file = "../por.tst"
train_size = 10000
dev_size = 1000
test_size = 1000

# Load every line of the column-swapped file.
with open(input_file, "r", encoding="utf-8") as f:
    lines = list(f)

# Ending label -> lemma suffixes mapped to it, checked in this order.
_ENDING_SUFFIXES = (
    ('ar', ('ar',)),
    ('er', ('er',)),
    ('ir', ('ir',)),
    ('or', ('or', 'ôr')),
)

# Bucket lines by lemma (first tab-separated column) and record each lemma's ending class.
lemma2lines = defaultdict(list)
lemma2ending = {}
for line in lines:
    lemma = line.partition('\t')[0]
    lemma2lines[lemma].append(line)
    if lemma in lemma2ending:
        # Already classified on first sight; the label depends only on the lemma.
        continue
    lemma2ending[lemma] = next(
        (label for label, suffixes in _ENDING_SUFFIXES if lemma.endswith(suffixes)),
        'other',
    )

# Invert the lemma -> ending map into ending -> [lemmas]
ending2lemmas = defaultdict(list)
for lemma in lemma2ending:
    ending2lemmas[lemma2ending[lemma]].append(lemma)

# Share of each ending among all lemmas (counted per lemma, not per line)
total_lemmas = len(lemma2ending)
ending2prop = {}
for ending, lemmas in ending2lemmas.items():
    ending2prop[ending] = len(lemmas) / total_lemmas

# Requested line counts per split, and each split's fraction of the overall total
splits = dict(train=train_size, dev=dev_size, test=test_size)
total_lines = sum(splits.values())
split_props = {}
ending2target_lemmas = {}
for split, size in splits.items():
    split_props[split] = size / total_lines
    # Floor of (lemmas with this ending) * (split fraction)
    ending2target_lemmas[split] = {
        ending: int(len(lemmas) * split_props[split])
        for ending, lemmas in ending2lemmas.items()
    }

# Flooring leaves some lemmas unassigned; hand the remainder to train
for ending, lemmas in ending2lemmas.items():
    assigned = sum(ending2target_lemmas[split][ending] for split in splits)
    ending2target_lemmas['train'][ending] += len(lemmas) - assigned

# Deal the lemmas of each ending class out to train, dev, test (in that order)
split_order = ('train', 'dev', 'test')
split_lemmas = {split: set() for split in split_order}
random.seed(42)
for ending, lemmas in ending2lemmas.items():
    random.shuffle(lemmas)
    start = 0
    for split in split_order:
        # Take the next consecutive slice of the shuffled lemmas for this split.
        stop = start + ending2target_lemmas[split][ending]
        split_lemmas[split] |= set(lemmas[start:stop])
        start = stop

# Collect lines for each split
split_lines = {'train': [], 'dev': [], 'test': []}
for split in ['train', 'dev', 'test']:
    # Walk the lemmas in sorted order. split_lemmas holds sets of strings, and
    # set iteration order for strings changes between interpreter sessions
    # (hash randomization). Without sorting, the seeded shuffle below would start
    # from a different ordering after every kernel restart, so the trimmed split
    # would contain different lines on each run despite random.seed(42).
    for lemma in sorted(split_lemmas[split]):
        split_lines[split].extend(lemma2lines[lemma])
    random.shuffle(split_lines[split])
    # The slice below can only shrink the list; flag splits that come up short
    # instead of silently writing fewer lines than requested.
    if len(split_lines[split]) < splits[split]:
        print(f"Warning: only {len(split_lines[split])} lines available for {split} "
              f"(requested {splits[split]}).")
    split_lines[split] = split_lines[split][:splits[split]]  # trim to requested size

# Write each split to its own file, in train / dev / test order
for out_path, split_name in ((train_file, 'train'), (dev_file, 'dev'), (test_file, 'test')):
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(split_lines[split_name])

print(f"Train: {len(split_lines['train'])} lines, Dev: {len(split_lines['dev'])} lines, Test: {len(split_lines['test'])} lines.")

Train: 10000 lines, Dev: 1000 lines, Test: 1000 lines.


In [29]:
# Step 3: Calculate AR/ER/IR/OR ending proportions
def ending_proportion(lines):
    """Count lemmas by infinitive ending and return counts plus proportions.

    Each line is stripped and split on tabs; the first column is taken as the
    lemma. Lemmas ending in 'ar', 'er', 'ir', or 'or'/'ôr' are tallied under
    'AR', 'ER', 'IR' and 'OR' respectively. Any other non-empty lemma is
    reported with a printed message and is not counted. Blank lines are skipped.

    Args:
        lines: iterable of tab-separated strings whose first column is the lemma.

    Returns:
        dict with integer counts under 'AR', 'ER', 'IR', 'OR', their sum under
        'Total', and the fractions (count / Total, or 0 when Total is 0) under
        'AR%', 'ER%', 'IR%', 'OR%'.
    """
    # Checked in order; the first matching suffix decides the label.
    suffix_labels = (
        ('ar', 'AR'),
        ('er', 'ER'),
        ('ir', 'IR'),
        (('or', 'ôr'), 'OR'),
    )
    counts = {'AR': 0, 'ER': 0, 'IR': 0, 'OR': 0}
    for line in lines:
        lemma = line.strip().split('\t')[0]
        if not lemma:
            # str.split always yields at least one element, so a blank line shows
            # up as an empty lemma; skip it rather than report it as unexpected.
            continue
        for suffix, label in suffix_labels:
            if lemma.endswith(suffix):
                counts[label] += 1
                break
        else:
            print(f'Unexpected ending in lemma: {lemma}')
    total = sum(counts.values())
    summary = dict(counts)
    summary['Total'] = total
    for label in ('AR', 'ER', 'IR', 'OR'):
        summary[f'{label}%'] = counts[label] / total if total else 0
    return summary

def _read_lines(path):
    """Return all lines of the UTF-8 text file at `path`."""
    with open(path, 'r', encoding='utf8') as handle:
        return handle.readlines()


train_lines = _read_lines(train_file)
dev_lines = _read_lines(dev_file)
test_lines = _read_lines(test_file)

# Write one report section per dataset to the output file
with open('ending_proportions.txt', 'w', encoding='utf-8') as f:
    orig = ending_proportion(lines)
    train = ending_proportion(train_lines)
    dev = ending_proportion(dev_lines)
    test = ending_proportion(test_lines)
    report_sections = [
        ('Original proportions', orig),
        ('Training set proportions', train),
        ('Development set proportions', dev),
        ('Test set proportions', test),
    ]
    for name, props in report_sections:
        f.write(f"{name}:\n")
        f.write(f"  Total: {props['Total']}\n")
        f.write(f"  AR: {props['AR']} ({props['AR%']*100:.2f}%)\n")
        f.write(f"  ER: {props['ER']} ({props['ER%']*100:.2f}%)\n")
        f.write(f"  IR: {props['IR']} ({props['IR%']*100:.2f}%)\n")
        f.write(f"  OR: {props['OR']} ({props['OR%']*100:.2f}%)\n\n")

## Results
The AR/ER/IR/OR ending proportions for the original file and for the training, development, and test sets are written to `ending_proportions.txt` by the cell above.

In [31]:
train_file = "../por.trn"
dev_file = "../por.dev"
test_file = "../por.tst"

# Load the three split files
with open(train_file, 'r', encoding='utf8') as f:
    train_lines = f.readlines()
with open(dev_file, 'r', encoding='utf8') as f:
    dev_lines = f.readlines()
with open(test_file, 'r', encoding='utf8') as f:
    test_lines = f.readlines()

# The lemma is the first tab-separated column of each line
train_lemmas = {line.split('\t')[0] for line in train_lines}
dev_lemmas = {line.split('\t')[0] for line in dev_lines}
test_lemmas = {line.split('\t')[0] for line in test_lines}

# check for common lemmas in train and dev
common_lemmas = train_lemmas.intersection(dev_lemmas)
print(f'Common lemmas in train and dev: {len(common_lemmas)}')
# check for common lemmas in train and test
common_lemmas_test = train_lemmas.intersection(test_lemmas)
print(f'Common lemmas in train and test: {len(common_lemmas_test)}')
# check for common lemmas in dev and test: the splits are meant to be pairwise
# lemma-disjoint, so this pair has to be verified as well
common_lemmas_dev_test = dev_lemmas.intersection(test_lemmas)
print(f'Common lemmas in dev and test: {len(common_lemmas_dev_test)}')

Common lemmas in train and dev: 0
Common lemmas in train and test: 0
