# Portuguese Dataset Analysis
This notebook swaps columns 2 and 3 of the original file, shuffles the lines, splits them into training, development, and test sets, and calculates the AR/ER/IR/OR lemma-ending proportions for the original file, the training set, and the test set.

In [17]:
# Step 1: Invert columns
# Rewrites each 3-field, tab-separated row of `por_original` as
# lemma <TAB> msd <TAB> form (i.e. the 2nd and 3rd fields trade places).
input_file = 'por_original'
output_file = 'por'
with open(input_file, 'r', encoding='utf8') as fin:
    with open(output_file, 'w', encoding='utf8') as fout:
        for record in fin:
            columns = record.strip().split('\t')
            # Rows that do not have exactly three fields are left out of the output.
            if len(columns) != 3:
                continue
            lemma, form, msd = columns
            fout.write(f'{lemma}\t{msd}\t{form}\n')
print('Inverted columns and saved to por')

Inverted columns and saved to por


In [1]:
# Step 2: Shuffle and split into three datasets (train / dev / test)
import random

input_file = "por"
train_file = "../por.trn"
dev_file = "../por.dev"
test_file = "../por.tst"
train_size = 10000
dev_size = 1000
test_size = 1000

# Read all lines from the input file
with open(input_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# writelines() adds no separators, so a final line without a trailing newline
# would be fused to the following record once it is shuffled into the middle.
if lines and not lines[-1].endswith("\n"):
    lines[-1] += "\n"

# Slicing never raises on a short list; report it instead of silently
# producing splits that are smaller than requested (or empty).
required_size = train_size + dev_size + test_size
if len(lines) < required_size:
    print(f"Warning: {input_file} has {len(lines)} lines but {required_size} are "
          "needed; the splits will be smaller than requested")

# Shuffle lines to randomize
random.seed(42)  # For reproducibility
random.shuffle(lines)

# Split into consecutive, non-overlapping train / dev / test slices
dev_end = train_size + dev_size
train_lines = lines[:train_size]
dev_lines = lines[train_size:dev_end]
test_lines = lines[dev_end:dev_end + test_size]

# Write each split to its own file
for split_path, split_lines in (
    (train_file, train_lines),
    (dev_file, dev_lines),
    (test_file, test_lines),
):
    with open(split_path, "w", encoding="utf-8") as f:
        f.writelines(split_lines)

In [None]:
# Step 3: Calculate AR/ER/IR/OR ending proportions
def ending_proportion(lines):
    """Count lemmas by their -ar / -er / -ir / -or ending.

    The lemma is the first tab-separated field of each (whitespace-stripped)
    line. Lemmas ending in 'or' or 'ôr' are both counted under 'OR'. A lemma
    matching none of the endings is reported with ``print`` and is excluded
    from every count, including 'Total'.

    Args:
        lines: iterable of tab-separated text lines.

    Returns:
        dict with integer counts under 'AR', 'ER', 'IR', 'OR', their sum under
        'Total', and the fractions (0..1) under 'AR%', 'ER%', 'IR%', 'OR%'.
        All fractions are 0 when 'Total' is 0.
    """
    suffix_to_bucket = (
        ('ar', 'AR'),
        ('er', 'ER'),
        ('ir', 'IR'),
        ('or', 'OR'),
        ('ôr', 'OR'),
    )
    tally = {'AR': 0, 'ER': 0, 'IR': 0, 'OR': 0}

    for entry in lines:
        # split() with an explicit separator always yields at least one field.
        lemma = entry.strip().split('\t')[0]
        for suffix, bucket in suffix_to_bucket:
            if lemma.endswith(suffix):
                tally[bucket] += 1
                break
        else:
            print(f'Unexpected ending in lemma: {lemma}')

    total = sum(tally.values())
    summary = dict(tally)
    summary['Total'] = total
    for bucket, count in tally.items():
        # Guard the division so an empty/unmatched input yields 0 rather than raising.
        summary[bucket + '%'] = count / total if total else 0
    return summary

# Write the ending statistics for the full shuffled file, the training split
# and the test split to ending_proportions.txt, one section per dataset.
with open('ending_proportions.txt', 'w', encoding='utf-8') as f:
    orig = ending_proportion(lines)
    train = ending_proportion(train_lines)
    test = ending_proportion(test_lines)
    sections = (
        ('Original proportions', orig),
        ('Training set proportions', train),
        ('Test set proportions', test),
    )
    for name, props in sections:
        # Fractions are stored in 0..1; scale to percentages for display.
        f.writelines([
            f"{name}:\n",
            f"  Total: {props['Total']}\n",
            f"  AR: {props['AR']} ({props['AR%']*100:.2f}%)\n",
            f"  ER: {props['ER']} ({props['ER%']*100:.2f}%)\n",
            f"  IR: {props['IR']} ({props['IR%']*100:.2f}%)\n",
            f"  OR: {props['OR']} ({props['OR%']*100:.2f}%)\n\n",
        ])

## Results
The AR/ER/IR/OR ending proportions for the original file, the training set, and the test set are written to `ending_proportions.txt`.