# Data splitting for Tira parsing dataset
Compare DataSAIL (Joeres et al 2025) w/ adversarial splitting (Søgaard et al 2021)

In [1]:
from datasail.sail import datasail
from datasail.eval import eval_split
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import neighbors
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import wasserstein_distance
import numpy as np
from unidecode import unidecode

Illegal instruction (core dumped)
Illegal instruction (core dumped)
Illegal instruction (core dumped)
Illegal instruction (core dumped)


## Data preprocessing
Load sentences from text file and get TF-IDF vectors.

In [2]:
sentences_file = '../data/sentences.txt'
with open(sentences_file, 'r') as f:
    lines = f.readlines()
sentences = [line.split(',')[0] for line in lines]
len(sentences), sentences[:5]

(7861,
 ['àprí jɜ̀dí ðáŋàlà',
  'àprí jə̀və̀lɛ̀ðɔ́ ðáŋàlà',
  'àprí jàvə́lɛ̀ðɛ́ ðàŋàlà',
  'àprí jávə́lɛ̀ðà ðàŋàlà',
  'ðə̀və̀lɛ́ðɔ́ áprì'])

In [3]:
vectorizer = TfidfVectorizer(analyzer='char_wb', preprocessor=unidecode)
vectors = vectorizer.fit_transform(sentences)
vectors.shape

(7861, 26)

## DataSAIL

In [7]:
cosine_sim = cosine_similarity(vectors, vectors)
cosine_sim.shape

(7861, 7861)

In [8]:
e_data = {i: sentence for i, sentence in enumerate(sentences)}
e_sim = (list(range(len(sentences))), cosine_sim)
run_count = 5

In [10]:
techniques, inters, groups = datasail(
    techniques=["C1e"],
    splits=[7,2,1],
    names=["train","validation","test"],
    runs=run_count,
    epsilon=0.1,
    solver="SCIP",
    e_type="O",
    e_data=e_data,
    e_sim=e_sim,
)

                                     CVXPY                                     
                                     v1.5.3                                    
(CVXPY) Dec 31 05:54:45 PM: Your problem has 150 variables, 53 constraints, and 0 parameters.




(CVXPY) Dec 31 05:54:46 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Dec 31 05:54:46 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Dec 31 05:54:46 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Dec 31 05:54:46 PM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Dec 31 05:54:46 PM: Compiling problem (target solver=SCIP).
(CVXPY) Dec 31 05:54:46 PM: Reduction chain: Dcp2Cone -> CvxAttr2Constr -> ConeMatrixStuffing -> SCIP
(CVXPY) Dec 31 05:54:46 PM: Applying reduction Dcp2Cone
(CVXPY) Dec 31 05:54:47 PM: Applying reduction CvxAttr2Constr
(CVXPY) Dec 31 05:54:47 PM: Applying



                                     CVXPY                                     
                                     v1.5.3                                    
(CVXPY) Dec 31 05:56:35 PM: Your problem has 150 variables, 53 constraints, and 0 parameters.
(CVXPY) Dec 31 05:56:35 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Dec 31 05:56:35 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Dec 31 05:56:35 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Dec 31 05:56:35 PM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Dec 31 05:56:36 PM: Compiling problem (target solver=SCIP).
(C

In [12]:
datasail_assignments = techniques['C1e']
len(datasail_assignments)

5

## Adversarial splitting
Based on Wasserstein distance. Code adapted from [probing_utils.py](https://github.com/google-research/google-research/blob/master/talk_about_random_splits/probing/probing_utils.py) on Github on 31 Dec 25.

In [4]:
def split_with_wasserstein(vectors,
                           test_set_size=0.1,
                           no_of_trials=1,
                           leaf_size=5,
):
    """Finds test sets by maximizing Wasserstein distances among the given texts.
  
    This is separating the given texts into training/dev and test sets based on an
    approximate Wasserstein method. First all texts are indexed in a nearest
    neighbors structure. Then a new test centroid is sampled randomly, from which
    the nearest neighbors in Wasserstein space are extracted. Those constitute
    the new test set.
    Similarity is computed based on document-term counts.
  
    Args:
      texts: Texts to split into training/dev and test sets.
      test_set_size: Number of elements the new test set should contain.
      no_of_trials: Number of test sets requested.
      min_df: Mainly for speed-up and memory efficiency. All tokens must occur at
        least this many times to be considered in the Wasserstein computation.
      leaf_size: Leaf size parameter of the nearest neighbor search. Set high
        values for slower, but less memory-heavy computation.
  
    Returns:
      Returns a List of test set indices, one for each trial. The indices
      correspond to the items in `texts` that should be part of the test set.
    """
    print('Creating tree structure.')
    nn_tree = neighbors.NearestNeighbors(
        n_neighbors=int(test_set_size*vectors.shape[0]),
        algorithm='ball_tree',
        leaf_size=leaf_size,
        metric=wasserstein_distance)
    nn_tree.fit(vectors)
    print('Sampling test sets.')
    test_set_indices = []

    for trial in range(no_of_trials):
        print('Trial set: %d.', trial)
        # Sample random test centroid.
        sampled_point = np.random.randint(
            vectors.max().max() + 1, size=(1, vectors.shape[1]))
        nearest_neighbors = nn_tree.kneighbors(sampled_point, return_distance=False)
        # We queried for only one datapoint.
        nearest_neighbors = nearest_neighbors[0]
        print(nearest_neighbors[:10])
        test_set_indices.append(nearest_neighbors)

    return test_set_indices

In [9]:
def get_splits_w_wasserstein(vectors, sizes=[0.7, 0.2, 0.1], run_count=run_count):
    train_size, val_size, test_size = sizes

    assignment_list = []
    val_indices = split_with_wasserstein(vectors, val_size, no_of_trials=run_count)
    for val_set in val_indices:
        remaining_idcs = [i for i in range(vectors.shape[0]) if i not in val_set]
        remaining_vectors = vectors[remaining_idcs]
        test_set = split_with_wasserstein(remaining_vectors, test_size)[0]

        train_set = [i for i in remaining_idcs if i not in test_set]

        assignments = {}
        assignments.update({i: 'train' for i in train_set})
        assignments.update({i: 'validation' for i in val_set})
        assignments.update({i: 'test' for i in test_set})
        assignment_list.append(assignments)

    return assignment_list

sogaard_assignments = get_splits_w_wasserstein(vectors.toarray())
len(sogaard_assignments)

Creating tree structure.
Sampling test sets.
Trial set: %d. 0
[7083 7084 7170 7618 7251 7217 7216 6999 7005 7004]
Trial set: %d. 1
[7083 7084 7170 7618 7251 7217 7216 6999 7005 7004]
Trial set: %d. 2
[7083 7084 7170 7618 7251 7217 7216 6999 7005 7004]
Trial set: %d. 3
[7083 7084 7170 7618 7251 7217 7216 6999 7005 7004]
Trial set: %d. 4
[7083 7084 7170 7618 7251 7217 7216 6999 7005 7004]
Creating tree structure.
Sampling test sets.
Trial set: %d. 0
[6109 1019 3483 6197 1369 6150 4623 3484 2214 2213]
Creating tree structure.
Sampling test sets.
Trial set: %d. 0
[6109 1019 3483 6197 1369 6150 4623 3484 2214 2213]
Creating tree structure.
Sampling test sets.
Trial set: %d. 0
[6109 1019 3483 6197 1369 6150 4623 3484 2214 2213]
Creating tree structure.
Sampling test sets.
Trial set: %d. 0
[6109 1019 3483 6197 1369 6150 4623 3484 2214 2213]
Creating tree structure.
Sampling test sets.
Trial set: %d. 0
[6109 1019 3483 6197 1369 6150 4623 3484 2214 2213]


5

In [15]:
rows = []

for i, assignment in enumerate(datasail_assignments):
    scaled_leakage, total_leakage, max_leakage = eval_split(
        datatype="O",
        weights=None,
        distance=None,
        dist_conv=None,
        data=e_data,
        similarity=e_sim,
        split_assignment=assignment,
    )
    rows.append({
        'scaled_leakage': scaled_leakage,
        'total_leakage': total_leakage,
        'max_leakage': max_leakage,
        'run': i,
        'method': 'datasail',
    })

for i, assignment in enumerate(sogaard_assignments):
    scaled_leakage, total_leakage, max_leakage = eval_split(
        datatype="O",
        weights=None,
        distance=None,
        dist_conv=None,
        data=e_data,
        similarity=e_sim,
        split_assignment=assignment,
    )
    rows.append({
        'scaled_leakage': scaled_leakage,
        'total_leakage': total_leakage,
        'max_leakage': max_leakage,
        'run': i,
        'method': 'sogaard',
    })

In [16]:
leakage_df = pd.DataFrame(rows)
leakage_df

Unnamed: 0,scaled_leakage,total_leakage,max_leakage,run,method
0,0.408876,15656220.0,38290830.0,0,datasail
1,0.424479,16253650.0,38290830.0,1,datasail
2,0.419963,16080740.0,38290830.0,2,datasail
3,0.412973,15813090.0,38290830.0,3,datasail
4,0.428336,16401360.0,38290830.0,4,datasail
5,0.399829,15309790.0,38290830.0,0,sogaard
6,0.399829,15309790.0,38290830.0,1,sogaard
7,0.399829,15309790.0,38290830.0,2,sogaard
8,0.399829,15309790.0,38290830.0,3,sogaard
9,0.399829,15309790.0,38290830.0,4,sogaard


## Results
Sogaard's method is actually doing a tiny bit better, so we'll stick with it. Let's save the split to HuggingFace.

In [10]:
final_split = sogaard_assignments[0]
train_idcs = [k for k, v in final_split.items() if v == 'train']
val_idcs = [k for k, v in final_split.items() if v == 'validation']
test_idcs = [k for k, v in final_split.items() if v == 'test']

In [11]:
from datasets import load_dataset, DatasetDict

ds = load_dataset('tira-parsing/tira-parsing')
ds

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['orig_text', 'translation', 'checked_by_pi', 'checked_by_ra', 'reviewer', 'updated_txt', 'updated_gloss'],
        num_rows: 7861
    })
})

In [12]:
ds_dict = {}

for split, indices in [
    ('train', train_idcs),
    ('validation', val_idcs),
    ('test', test_idcs),
]:
    ds_dict[split] = ds['train'].select(indices)

ds_dict = DatasetDict(ds_dict)
ds_dict

DatasetDict({
    train: Dataset({
        features: ['orig_text', 'translation', 'checked_by_pi', 'checked_by_ra', 'reviewer', 'updated_txt', 'updated_gloss'],
        num_rows: 5747
    })
    validation: Dataset({
        features: ['orig_text', 'translation', 'checked_by_pi', 'checked_by_ra', 'reviewer', 'updated_txt', 'updated_gloss'],
        num_rows: 1486
    })
    test: Dataset({
        features: ['orig_text', 'translation', 'checked_by_pi', 'checked_by_ra', 'reviewer', 'updated_txt', 'updated_gloss'],
        num_rows: 628
    })
})

In [13]:
ds_dict.push_to_hub(
    repo_id = 'tira-parsing/tira-parsing',
    commit_message="Generate splits based on Wasserstein distance to minimize overlap",
)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  1.78ba/s]
Processing Files (1 / 1): 100%|██████████|  171kB /  171kB,  427kB/s  
New Data Upload: 100%|██████████|  171kB /  171kB,  427kB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.97s/ shards]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  7.83ba/s]
Processing Files (1 / 1): 100%|██████████| 40.7kB / 40.7kB,  0.00B/s  
New Data Upload: 100%|██████████| 40.7kB / 40.7kB,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.46 shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 17.88ba/s]
Processing Files (1 / 1): 100%|██████████| 28.7kB / 28.7kB,  0.00B/s  
New Data Upload: 100%|██████████| 28.7kB / 28.7kB,  0.00B/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.65 shards/s]


CommitInfo(commit_url='https://huggingface.co/datasets/tira-parsing/tira-parsing/commit/8402f9250977cea1deb0dc199c87ad8138a1bf17', commit_message='Generate splits based on Wasserstein distance to minimize overlap', commit_description='', oid='8402f9250977cea1deb0dc199c87ad8138a1bf17', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tira-parsing/tira-parsing', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tira-parsing/tira-parsing'), pr_revision=None, pr_num=None)