In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import StratifiedShuffleSplit

In [18]:
def split_train_test(df, test_size=0.3, random_state=42, target_col='target'):
    """Split ``df`` into stratified train and test frames.

    A single ``StratifiedShuffleSplit`` draw is made, stratified on
    ``df[target_col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; must contain ``target_col``.
    test_size : float or int, default 0.3
        Forwarded to ``StratifiedShuffleSplit``.
    random_state : int or None, default 42
        Forwarded to ``StratifiedShuffleSplit``.
    target_col : str, default 'target'
        Name of the column used for stratification.

    Returns
    -------
    train, test : pandas.DataFrame
        The two partitions, each re-indexed from 0.
    """
    splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state)
    # split() yields *positional* row numbers, so they have to be consumed
    # with .iloc. Using .loc would treat them as index labels and select the
    # wrong rows (or raise KeyError) whenever df does not carry a default
    # 0..n-1 index, e.g. right after a dropna() without reset_index().
    train_idx, test_idx = next(splitter.split(df, df[target_col]))
    train = df.iloc[train_idx].reset_index(drop=True)
    test = df.iloc[test_idx].reset_index(drop=True)
    return train, test

In [6]:
def _stratified_split(df, test_size, random_state, target_col):
    """Return ``(rest, held_out)`` from one stratified shuffle split of ``df``."""
    splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state)
    # split() yields positional row numbers -> index with .iloc, not .loc,
    # so the result is correct for any index df happens to carry.
    rest_idx, held_out_idx = next(splitter.split(df, df[target_col]))
    rest = df.iloc[rest_idx].reset_index(drop=True)
    held_out = df.iloc[held_out_idx].reset_index(drop=True)
    return rest, held_out


def split_data(df, test_size=0.2, valid_size=0.25, random_state=42, target_col='target'):
    """Split ``df`` into stratified train, validation and test frames.

    The test partition is carved out of ``df`` first; the validation
    partition is then carved out of what remains. Both splits are stratified
    on ``df[target_col]`` and use the same ``random_state``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; must contain ``target_col``.
    test_size : float or int, default 0.2
        Size of the test partition, relative to ``df``.
    valid_size : float or int, default 0.25
        Size of the validation partition, relative to the rows left after
        removing the test partition (with the defaults: 0.8 * 0.25 = 0.2 of
        ``df``, i.e. a 60/20/20 split).
    random_state : int or None, default 42
        Forwarded to ``StratifiedShuffleSplit`` for both splits.
    target_col : str, default 'target'
        Name of the column used for stratification.

    Returns
    -------
    train, valid, test : pandas.DataFrame
        The three partitions, each re-indexed from 0.
    """
    train_valid, test = _stratified_split(df, test_size, random_state, target_col)
    train, valid = _stratified_split(train_valid, valid_size, random_state, target_col)
    return train, valid, test

In [3]:
def get_triplets(unique_labels, label_indices_map):
    """Draw one random (anchor, positive, negative) triple of row positions.

    Two distinct labels are drawn with ``np.random.choice``; the first
    supplies the anchor and the positive (two different entries of its index
    list), the second supplies the negative.

    A label with fewer than two entries cannot provide an anchor/positive
    pair, so a draw that picks such a label as the anchor label is discarded
    and the label pair is drawn again. Such labels can still be used as the
    negative label. When the first draw is usable, the sequence of random
    calls is exactly: label pair, anchor/positive pair, negative.

    Parameters
    ----------
    unique_labels : array-like
        The distinct labels to sample from.
    label_indices_map : mapping
        Maps each label to a sequence of row positions carrying that label.

    Returns
    -------
    a, p, n
        Row positions of the anchor, the positive and the negative.

    Raises
    ------
    ValueError
        If no label has at least two entries, or (raised by
        ``np.random.choice``) if fewer than two labels are given or the
        negative label has no entries.
    """
    while True:
        anchor_label, negative_label = np.random.choice(unique_labels, 2, replace=False)
        if len(label_indices_map[anchor_label]) >= 2:
            break
        # The draw was rejected; before retrying, make sure a retry can ever
        # succeed so this loop cannot spin forever. Only checked on the
        # rejection path to keep the common path free of extra work.
        if not any(len(label_indices_map[label]) >= 2 for label in unique_labels):
            raise ValueError(
                "get_triplets needs at least one label with two or more samples")

    a, p = np.random.choice(label_indices_map[anchor_label], 2, replace=False)
    n = np.random.choice(label_indices_map[negative_label])
    return a, p, n

def get_batch(batch_size, dataset, unique_labels, label_indices_map, get_embed):
    """Endlessly yield batches of embedded (anchor, positive, negative) data.

    For every batch, ``batch_size`` triples of row positions are drawn with
    ``get_triplets``. The matching rows are taken from ``dataset`` by
    position (``.iloc``), converted to plain lists, and each of the three
    groups is passed through ``get_embed``; ``.numpy()`` is called on what
    ``get_embed`` returns.

    Parameters
    ----------
    batch_size : int
        Number of triples per batch.
    dataset : pandas object supporting ``.iloc[...]`` and ``.values``
        Source of the raw rows.
    unique_labels, label_indices_map
        Forwarded unchanged to ``get_triplets``.
    get_embed : callable
        Called once per group with a list of rows; its result must expose
        ``.numpy()``.

    Yields
    ------
    (anchors, positives, negatives)
        The three ``.numpy()`` results, in that order. The generator never
        terminates on its own.
    """
    while True:
        picks = [
            get_triplets(unique_labels, label_indices_map)
            for _ in range(batch_size)
        ]
        anchor_rows = [a for a, _, _ in picks]
        positive_rows = [p for _, p, _ in picks]
        negative_rows = [n for _, _, n in picks]

        # Fetch the raw rows for all three roles before embedding any of them.
        raw_groups = [
            dataset.iloc[rows].values.tolist()
            for rows in (anchor_rows, positive_rows, negative_rows)
        ]

        # Embedding happens in anchor, positive, negative order.
        anchors, positives, negatives = (
            get_embed(group).numpy() for group in raw_groups
        )

        yield anchors, positives, negatives

def create_label_indices_map(dataset, collabel):
    """Index the rows of ``dataset`` by the value in column ``collabel``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the label column.
    collabel : str
        Name of the label column.

    Returns
    -------
    unique_labels : numpy.ndarray
        The distinct labels, in the order ``Series.unique()`` reports them.
    label_indices_map : dict
        Maps each entry of ``unique_labels`` to a numpy array with the
        0-based row positions (not index labels) where that label occurs.
    """
    label_column = dataset[collabel]
    unique_labels = np.array(label_column.unique().tolist())
    labels = np.array(label_column.tolist())

    label_indices_map = {}
    for label in unique_labels:
        # Positions within the column, independent of the frame's index.
        label_indices_map[label] = np.flatnonzero(labels == label)

    return unique_labels, label_indices_map

def triplet_generator(batch_size, dataset, unique_labels, label_indices_map, get_embed):
    """Pair every batch from ``get_batch`` with an all-zero target vector.

    All arguments are forwarded unchanged to ``get_batch``.

    Yields
    ------
    ((a, p, n), targets)
        The batch triple produced by ``get_batch`` and a freshly allocated
        ``np.zeros((batch_size,))`` array.
    """
    # The outer loop restarts get_batch should it ever run out of batches.
    while True:
        batches = get_batch(
            batch_size, dataset, unique_labels, label_indices_map, get_embed)
        for anchors, positives, negatives in batches:
            dummy_targets = np.zeros((batch_size,))
            yield (anchors, positives, negatives), dummy_targets

def create_trip_dtset(batch_size, dataset, colfeat, collabel, get_embed, embed_dim=512):
    """Wrap ``triplet_generator`` in a ``tf.data.Dataset``.

    Parameters
    ----------
    batch_size : int
        Forwarded to ``triplet_generator``.
    dataset : pandas.DataFrame
        Frame holding the feature column(s) ``colfeat`` and the label column
        ``collabel``.
    colfeat : column selector
        Used as ``dataset[colfeat]`` to pick what is fed to the generator.
    collabel : str
        Label column passed to ``create_label_indices_map``.
    get_embed : callable
        Forwarded to ``triplet_generator``.
    embed_dim : int, default 512
        Size of the last axis declared for each of the three embedding
        tensors in the output signature.

    Returns
    -------
    tf.data.Dataset
        Built with ``from_generator``; each element is declared as
        ``((anchor, positive, negative), targets)`` with float32 tensors of
        shape ``(None, embed_dim)`` for the embeddings and ``(None,)`` for
        the targets.
    """
    unique_labels, label_indices_map = create_label_indices_map(dataset, collabel)

    def _triplet_stream():
        # dataset[colfeat] is looked up here, i.e. each time this callable is
        # invoked, not once when create_trip_dtset returns.
        return triplet_generator(
            batch_size, dataset[colfeat], unique_labels, label_indices_map, get_embed)

    embedding_spec = tf.TensorSpec(shape=(None, embed_dim), dtype=tf.float32)
    target_spec = tf.TensorSpec(shape=(None,), dtype=tf.float32)

    return tf.data.Dataset.from_generator(
        _triplet_stream,
        output_signature=((embedding_spec,) * 3, target_spec),
    )

In [19]:
# df = pd.read_csv('../data/df.csv')
# df.dropna(inplace=True)
# df = df.reset_index(drop=True)
# print(df.shape)
# df.head()

In [20]:
# train, valid, test = split_data(df)
# print(f"train: {train.shape}, test: {test.shape}, valid: {valid.shape}")