In [1]:
import random
import glob
import os
import time
import numpy as np
import pandas as pd
import _pickle as cPickle
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
import utilities as utl
from sklearn.model_selection import train_test_split
import prepare_dataset_utilities as prepare_utl


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Benchmark identifier; used as the sub-directory name in every input and
# output path built in the cells below.
benchmark_name = "tus_benchmark_corrected"

In [3]:
# Read the complete positive and negative sample collections for the chosen
# benchmark and pool them into a single collection.
all_positive_samples = prepare_utl.LoadSampleDataPoints(
    os.path.join(r"finetune_data", benchmark_name, "all_positive_samples.txt")
)
all_negative_samples = prepare_utl.LoadSampleDataPoints(
    os.path.join(r"finetune_data", benchmark_name, "all_negative_samples.txt")
)
all_mixed_samples = all_positive_samples.union(all_negative_samples)

In [4]:
# Read the four per-category sample files (positive / negative pairs, each
# further divided by the "same" / "different" criterion encoded in the file name).
positive_same_table = prepare_utl.LoadSampleDataPoints(
    os.path.join(r"finetune_data", benchmark_name, "positive_same_table.txt")
)
positive_different_tables = prepare_utl.LoadSampleDataPoints(
    os.path.join(r"finetune_data", benchmark_name, "positive_different_tables.txt")
)
negative_same_table_cluster = prepare_utl.LoadSampleDataPoints(
    os.path.join(r"finetune_data", benchmark_name, "negative_same_table_cluster.txt")
)
negative_different_table_cluster = prepare_utl.LoadSampleDataPoints(
    os.path.join(r"finetune_data", benchmark_name, "negative_different_table_cluster.txt")
)

In [5]:
def SplitEachSetTrainTest(all_samples, ratio = 0.15):
    """Deduplicate raw sample lines and split them into train/test/valid parts.

    Every sample string is cut at its last tab into a two-element tuple
    (presumably the sample text and its label - confirm against the sample
    files). Strings that contain no tab are silently skipped. The unique
    tuples are passed as a list to ``prepare_utl.SplitTrainTestValid`` together
    with ``ratio``, and the size of each returned part is printed.

    Args:
        all_samples: iterable of tab-separated sample strings.
        ratio: forwarded unchanged to ``prepare_utl.SplitTrainTestValid``
            (presumably the test/validation fraction - confirm).

    Returns:
        The ``(train, test, valid)`` triple exactly as returned by
        ``prepare_utl.SplitTrainTestValid``.
    """
    # Lazily split each line once, at the right-most tab only.
    split_rows = (raw_line.rsplit("\t", 1) for raw_line in all_samples)
    # Keep only well-formed rows; the set drops exact duplicates.
    unique_pairs = {(fields[0], fields[1]) for fields in split_rows if len(fields) == 2}
    train_data, test_data, valid_data = prepare_utl.SplitTrainTestValid(list(unique_pairs), ratio)
    for caption, part in (
        ("Train size:", train_data),
        ("Test size:", test_data),
        ("Valid size:", valid_data),
    ):
        print(caption, len(part))
    return train_data, test_data, valid_data

# Split each of the four sample categories independently. The heading is
# printed first because SplitEachSetTrainTest itself prints the resulting
# train/test/valid sizes, which therefore appear under the matching heading.
print("Positive same:")
positive_same_train, positive_same_test, positive_same_valid = SplitEachSetTrainTest(positive_same_table)
print("Positive different:")
positive_different_train, positive_different_test, positive_different_valid = SplitEachSetTrainTest(positive_different_tables)
print("Negative same:")
negative_same_train, negative_same_test, negative_same_valid = SplitEachSetTrainTest(negative_same_table_cluster)
print("Negative different:")
negative_different_train, negative_different_test, negative_different_valid = SplitEachSetTrainTest(negative_different_table_cluster)

Positive same:
Successful! No leakage found during splitting.
Train size: 1881
Test size: 403
Valid size: 403
Positive different:
Successful! No leakage found during splitting.
Train size: 19121
Test size: 4096
Valid size: 4096
Negative same:
Successful! No leakage found during splitting.
Train size: 10500
Test size: 2250
Valid size: 2250
Negative different:
Successful! No leakage found during splitting.
Train size: 10500
Test size: 2250
Valid size: 2250


In [6]:
# Move one sample from the positive-same train split into the positive-same
# test split. pop() is called without an index and the target uses add(), so
# these are presumably sets - in which case the moved element is arbitrary.
# NOTE(review): presumably done to even out the positive counts across the
# partitions (the recorded split sizes are consistent with that) - confirm.
# NOTE(review): this cell mutates the splits in place; re-running it without
# re-running the split cell moves one more sample each time.
pop_out = positive_same_train.pop()
positive_same_test.add(pop_out)

# Likewise, move one sample from the positive-different train split into the
# positive-different validation split.
pop_out = positive_different_train.pop()
positive_different_valid.add(pop_out)


In [7]:
# Pool the four per-category splits into a single collection per partition.
train_data_points = (
    positive_same_train
    .union(positive_different_train)
    .union(negative_same_train)
    .union(negative_different_train)
)
test_data_points = (
    positive_same_test
    .union(positive_different_test)
    .union(negative_same_test)
    .union(negative_different_test)
)
valid_data_points = (
    positive_same_valid
    .union(positive_different_valid)
    .union(negative_same_valid)
    .union(negative_different_valid)
)

# First pass: report and remove any sample shared between two partitions.
# Train is never modified; a sample shared with train is dropped from the
# other partition, and a sample shared by test and valid is dropped from valid.
if len(test_data_points.intersection(train_data_points)) > 0:
    print("Leakage between test and train")
    test_data_points = test_data_points - train_data_points
if len(test_data_points.intersection(valid_data_points)) > 0:
    print("Leakage between test and valid")
    valid_data_points = valid_data_points - test_data_points
if len(valid_data_points.intersection(train_data_points)) > 0:
    print("Leakage between valid and train")
    valid_data_points = valid_data_points - train_data_points

# Second pass: re-check every pair after the clean-up and report the outcome.
if len(test_data_points.intersection(train_data_points)) > 0:
    print("Leakage between test and train")
else:
    print("No leakage between test and train")
if len(test_data_points.intersection(valid_data_points)) > 0:
    print("Leakage between test and valid")
else:
    print("No leakage between test and valid")
if len(valid_data_points.intersection(train_data_points)) > 0:
    print("Leakage between valid and train")
else:
    # This branch previously printed the test/valid message (copy-paste slip),
    # so the valid/train result was never reported under its own name.
    print("No leakage between valid and train")
print("Train size:", len(train_data_points))
print("Test size:", len(test_data_points))
print("Valid size:", len(valid_data_points))

No leakage between test and train
No leakage between test and valid
No leakage between test and valid
Train size: 42000
Test size: 9000
Valid size: 9000


In [8]:
# Count positive / negative samples in each final partition and print the
# six totals, one per line.
train_positive, train_negative = prepare_utl.CountClassLabelSize(train_data_points)
test_positive, test_negative = prepare_utl.CountClassLabelSize(test_data_points)
valid_positive, valid_negative = prepare_utl.CountClassLabelSize(valid_data_points)
for count_caption, count_value in (
    ("Train positive:", train_positive),
    ("Train negative:", train_negative),
    ("Test positive:", test_positive),
    ("Test negative:", test_negative),
    ("Valid positive:", valid_positive),
    ("Valid negative:", valid_negative),
):
    print(count_caption, count_value)

Train positive: 21000
Train negative: 21000
Test positive: 4500
Test negative: 4500
Valid positive: 4500
Valid negative: 4500


In [9]:
# Write each final partition out through prepare_utl.SaveDatasetAsTSVFile.
# NOTE(review): the output root is "data/finetune_data" while the inputs above
# were read from "finetune_data" - confirm the difference is intentional.
prepare_utl.SaveDatasetAsTSVFile(
    train_data_points, os.path.join(r"data/finetune_data", benchmark_name, "train")
)
prepare_utl.SaveDatasetAsTSVFile(
    test_data_points, os.path.join(r"data/finetune_data", benchmark_name, "test")
)
prepare_utl.SaveDatasetAsTSVFile(
    valid_data_points, os.path.join(r"data/finetune_data", benchmark_name, "valid")
)