# Data splitting for Tira parsing dataset
Compare DataSAIL (Joeres et al. 2025) with adversarial splitting (Søgaard et al. 2021).

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import neighbors
from sklearn.metrics.pairwise import cosine_similarity
from datasail.sail import datasail
from datasail.eval import eval_split
from scipy.stats import wasserstein_distance
import numpy as np

## Data preprocessing
Load sentences from text file and get TF-IDF vectors.

In [None]:
# Load the raw sentence file, one record per line.
sentences_file = '../data/sentences.txt'
# Decode explicitly as UTF-8 so the text read does not depend on the
# platform's default locale encoding.
with open(sentences_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()
# Keep only the text before the first comma of each line (a line without a
# comma is kept whole, including its trailing newline).
sentences = [line.split(',')[0] for line in lines]
# Notebook output: number of sentences and a preview of the first five.
len(sentences), sentences[:5]

In [None]:
# Build TF-IDF features with scikit-learn's 'char_wb' analyzer (character
# n-grams taken inside word boundaries) and vectorize every sentence.
vectorizer = TfidfVectorizer(analyzer='char_wb')
vectors = vectorizer.fit_transform(sentences)
# Notebook output: shape of the resulting TF-IDF matrix.
vectors.shape

## DataSAIL

In [None]:
# Pairwise cosine similarity between all TF-IDF rows; with no second argument
# scikit-learn compares the matrix against itself.
cosine_sim = cosine_similarity(vectors)
# Notebook output: shape of the similarity matrix.
cosine_sim.shape

In [None]:
# Number of splits to generate with each splitting method.
run_count = 10
# Map each sentence's position in `sentences` to its text.
e_data = dict(enumerate(sentences))
# Pair the ordered ids with the similarity matrix computed over the same order.
e_sim = (list(e_data), cosine_sim)

In [None]:
# Ask DataSAIL for `run_count` train/validation/test splits of the sentences,
# driven by the precomputed cosine similarities.
# NOTE(review): DataSAIL's documentation describes the return value as
# (e_splits, f_splits, inter_splits); confirm the names `techniques`, `inters`
# and `groups` reflect what is actually returned. Only `techniques` is used
# later in this notebook.
techniques, inters, groups = datasail(
    techniques=["C1e"],  # presumably the cluster-based split on the e-entities -- confirm in DataSAIL docs
    splits=[7,2,1],  # relative sizes, matched position-wise with `names`
    names=["train","validation","test"],
    runs=run_count,
    epsilon=0.1,  # presumably the tolerated deviation from the requested split sizes -- TODO confirm
    solver="SCIP",
    e_type="O",  # presumably DataSAIL's "other" entity type -- TODO confirm
    e_data=e_data,
    e_sim=e_sim,
)

In [None]:
# Keep only the results of the "C1e" technique; the evaluation cell below
# iterates over this object, treating each element as one run's assignment.
datasail_assignments = techniques['C1e']

## Adversarial splitting
Based on Wasserstein distance. Code adapted from [probing_utils.py](https://github.com/google-research/google-research/blob/master/talk_about_random_splits/probing/probing_utils.py) on GitHub (accessed 31 Dec 2025).

In [None]:
def split_with_wasserstein(vectors,
                           test_set_size=0.1,
                           no_of_trials=1,
                           leaf_size=5,
):
    """Finds test sets as the nearest neighbors of random centroids.

    All rows of `vectors` are indexed in a ball-tree nearest-neighbor
    structure that uses `scipy.stats.wasserstein_distance` as its metric
    (each row is passed to it as a 1-D sample of values). For every trial a
    random centroid is sampled and its nearest neighbors form the test set.

    Args:
      vectors: 2-D dense array-like of shape (n_samples, n_features), e.g.
        document-term counts or TF-IDF weights.
      test_set_size: Fraction of the rows of `vectors` that each test set
        should contain; the count is `int(test_set_size * n_samples)`.
      no_of_trials: Number of test sets requested.
      leaf_size: Leaf size parameter of the nearest neighbor search. Set high
        values for slower, but less memory-heavy computation.

    Returns:
      A list with one entry per trial. Each entry is an array of row indices
      into `vectors` that should be part of the test set.

    Raises:
      ValueError: If `test_set_size` selects fewer than one row.
    """
    n_samples, n_features = vectors.shape
    n_neighbors = int(test_set_size * n_samples)
    if n_neighbors < 1:
        raise ValueError(
            f'test_set_size={test_set_size!r} selects no rows out of '
            f'{n_samples}; increase it or provide more data.')

    print('Creating tree structure.')
    nn_tree = neighbors.NearestNeighbors(
        n_neighbors=n_neighbors,
        algorithm='ball_tree',
        leaf_size=leaf_size,
        metric=wasserstein_distance)
    nn_tree.fit(vectors)
    print('Sampling test sets.')

    # Largest feature value; the nested max also covers containers (such as
    # DataFrames) whose first `.max()` only reduces along one axis.
    max_value = np.max(vectors.max())
    integer_valued = np.issubdtype(np.asarray(max_value).dtype, np.integer)

    test_set_indices = []
    for trial in range(no_of_trials):
        print(f'Trial set: {trial}.')
        # Sample random test centroid.
        if integer_valued:
            # Count-like data: draw integers in [0, max_value].
            sampled_point = np.random.randint(
                max_value + 1, size=(1, n_features))
        else:
            # Real-valued data (e.g. TF-IDF): randint would truncate the bound
            # and return the same near-constant centroid on every trial, so
            # draw from the continuous range instead.
            sampled_point = np.random.uniform(
                0.0, max_value, size=(1, n_features))
        nearest_neighbors = nn_tree.kneighbors(sampled_point, return_distance=False)
        # We queried for only one datapoint.
        nearest_neighbors = nearest_neighbors[0]
        print(nearest_neighbors[:10])
        test_set_indices.append(nearest_neighbors)

    return test_set_indices

In [None]:
def get_splits_w_wasserstein(vectors, sizes=(0.7, 0.2, 0.1), run_count=run_count):
    """Builds train/validation/test assignments with Wasserstein splitting.

    For each run a validation set is sampled from all rows, then a test set is
    sampled from the rows that are left; everything else becomes training data.

    Args:
      vectors: 2-D dense array of shape (n_samples, n_features) that supports
        indexing with a list of row indices.
      sizes: Fractions of the whole dataset for (train, validation, test).
        The train fraction is implied by the other two and is not used.
      run_count: Number of independent assignments to produce.

    Returns:
      A list of `run_count` dicts mapping each row index of `vectors` to
      'train', 'validation' or 'test'.

    Raises:
      ValueError: If the validation fraction is not in [0, 1).
    """
    _train_size, val_size, test_size = sizes
    if not 0 <= val_size < 1:
        raise ValueError(f'validation fraction must be in [0, 1), got {val_size!r}')

    n_samples = vectors.shape[0]
    # The test set is drawn from the rows remaining after validation rows are
    # removed, so rescale its fraction to keep it `test_size` of the full data.
    test_fraction = test_size / (1 - val_size)

    assignment_list = []
    val_indices = split_with_wasserstein(vectors, val_size, no_of_trials=run_count)
    for val_set in val_indices:
        val_ids = [int(i) for i in val_set]
        val_lookup = set(val_ids)
        remaining_idcs = [i for i in range(n_samples) if i not in val_lookup]
        remaining_vectors = vectors[remaining_idcs]

        # The returned indices are positions within `remaining_vectors`;
        # translate them back to row indices of the full dataset.
        local_test = split_with_wasserstein(remaining_vectors, test_fraction)[0]
        test_ids = [remaining_idcs[int(j)] for j in local_test]
        test_lookup = set(test_ids)

        assignments = {i: 'train' for i in remaining_idcs if i not in test_lookup}
        assignments.update({i: 'validation' for i in val_ids})
        assignments.update({i: 'test' for i in test_ids})
        assignment_list.append(assignments)

    return assignment_list

# Run the Wasserstein-based splitter on a dense copy of the TF-IDF matrix
# (`.toarray()`), using the default sizes and run count.
sogaard_assignments = get_splits_w_wasserstein(vectors.toarray())
# Notebook output: number of assignments produced.
len(sogaard_assignments)

In [None]:
# Score every split of both methods with DataSAIL's `eval_split` and collect
# one record per (method, run). Both methods share the same evaluation call,
# so they are handled by a single loop instead of two copies of the same code.
rows = []

assignments_by_method = {
    'datasail': datasail_assignments,
    'sogaard': sogaard_assignments,
}

for method, method_assignments in assignments_by_method.items():
    for i, assignment in enumerate(method_assignments):
        # eval_split returns three values, stored here as the scaled, total
        # and maximum leakage of the split.
        scaled_leakage, total_leakage, max_leakage = eval_split(
            datatype="O",
            weights=None,
            distance=None,
            dist_conv=None,
            data=e_data,
            similarity=e_sim,
            split_assignment=assignment,
        )
        rows.append({
            'scaled_leakage': scaled_leakage,
            'total_leakage': total_leakage,
            'max_leakage': max_leakage,
            'run': i,
            'method': method,
        })

In [None]:
# Collect the per-run leakage records into a table: one row per
# (method, run), one column per key of the record dicts.
leakage_df = pd.DataFrame(rows)
# Notebook output: display the table.
leakage_df