In [1]:
from util import TRAIN_DF
import pandas as pd
import numpy as np
import math
import random
df = TRAIN_DF.copy()
BATCH_SIZE = 64
task = "species"



In [2]:
def df_filter_for_indidum_training(train_df:pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    ids, respective_counts = np.unique(train_df["individual_id"].values,return_counts=True)
    ids = ids[respective_counts>1] # boolean index the ids 

    respective_counts= respective_counts[respective_counts>1] # and counts for later

    filter_function = np.vectorize(lambda x: x in ids) # our filter function

    train_df = train_df.iloc[filter_function(train_df["individual_id"])] # filter df

    train_df.index = range(len(train_df)) # reindex
    return train_df
df = df_filter_for_indidum_training(df)

In [3]:
def smart_batches(df:pd.core.frame.DataFrame, BATCH_SIZE:int, task:str = "individual_id") -> pd.core.frame.DataFrame:
    """
    This is one of the most important functions:
    -----------------
    arguments:
    df - pandas data frame of our data
    BATCH_SIZE - the bath_sie of our tensorflow dataset, must be even
    task - either "individual_id" or "species", Specifies if we want to create train to identify species or individuals.
    
    -----------------
    returns
    Ordered Data Frame for Tensorflow Data set creation, such that the batches are valid for the triplet loss,
    i.e. never contains only one positve.
    """
    df = df.copy()
    
    df["species_counts"] = df.groupby('species_label')["species_label"].transform('count')
    df['individum_count'] = df.groupby('individual_id')['individual_id'].transform('count')
    
    assert task in ["individual_id","species"] + df.columns.tolist(), 'task has to be either "individual_id" or "species"" and must be column index of df'
    assert BATCH_SIZE%2==0, "BATCH_SIZE must be even"
    
    counts_column = "individum_count"  if task == "individual_id" else "species_counts"
    label = "label" if task == "individual_id" else "species_label"
    df["assign_to"] = np.nan

    even_mask = (df[counts_column]%2==0).array
    uneven_mask = np.logical_not(even_mask)

    even_indices_list = list(df[even_mask].index)
    uneven_df = df[uneven_mask]

    amount_of_containers = math.ceil(len(df)/BATCH_SIZE)
    container = np.array([BATCH_SIZE for i in range(amount_of_containers-1) ] + [len(df)%BATCH_SIZE])

    set_of_uneven_classes = {a for a in uneven_df[label]}
    
    
    if not len(set_of_uneven_classes)%2 != container[-1]%2:
        unlucky_class = random.choice(uneven_df.index)
        df.drop(index = unlucky_class )

        even_mask = (df[counts_column]%2==0).array
        uneven_mask = np.logical_not(even_mask)

        even_indices_list = list(df[even_mask].index)
        uneven_df = df[uneven_mask]
        set_of_uneven_classes = {a for a in uneven_df[label]}
        
        print(f"We threw away the datapoint with index {unlucky_class} ")
    
    uneven_labels = {a:[] for a in uneven_df[label].array}
    for index, int_label in zip(uneven_df.index,  uneven_df[label].array):
        uneven_labels[int_label].append(index)
        
    for int_label in uneven_labels:
        if len(uneven_labels[int_label])>3:
            rest = uneven_labels[int_label][3:]
            keep = uneven_labels[int_label][:3]
            even_indices_list.extend(rest)

            uneven_labels[int_label] = keep
            
    uneven_indices_list = [uneven_labels[a] for a in uneven_labels]
    random.shuffle(uneven_indices_list)
    
    if len(set_of_uneven_classes)%2==1:
        container[-1]-=3
        first_triplet = uneven_indices_list.pop()
        df.loc[first_triplet,"assign_to"]=len(container)-1
    assert len(uneven_indices_list)%2 == 0, "stf went horbly wrong"

    combined_double_triplets = [a+b for a,b in zip(uneven_indices_list[::2],uneven_indices_list[1::2])]
    assert all([len(a) == 6 for a in combined_double_triplets])
    
    even_df = df.loc[even_indices_list]
    even_labels = even_df[label].sort_values().index
    
    combined_even_doubles = [[a,b] for a,b in zip(even_labels[::2],even_labels[1::2])]
    random.shuffle(combined_even_doubles)

    assert all([df.loc[a,label]==df.loc[b,label] for a,b in combined_even_doubles])
    i = 0
    while combined_double_triplets:


        if container[i]<6:
            i = i+1 if i+1!=len(container) else 0
            continue

        triplets = combined_double_triplets.pop()
        container[i]-=6
        df.loc[triplets,"assign_to"]=i

        i = i+1 if i+1!=len(container) else 0 

    i = 0
    while combined_even_doubles:


        if container[i]<2:
            i = i+1 if i+1!=len(container) else 0
            continue

        double = combined_even_doubles.pop()
        container[i]-=2
        df.loc[double,"assign_to"]=i

        i = i+1 if i+1!=len(container) else 0 


    assert np.all(container == 0)

    return df.sort_values(["assign_to"])

In [6]:
smart_batches(df,64,"species")

We threw away the datapoint with index 35114 


Unnamed: 0,image,species,individual_id,individum_count,label,species_label,species_counts,assign_to
40474,6a3ce4565aa05d.jpg,minke_whale,ce6e37904aa4,145,12567,19,1572,0.0
26676,ed44184853ab17.jpg,humpback_whale,f263fb6c725a,30,14798,14,5542,0.0
40476,f87a2bba4fcb5e.jpg,minke_whale,ce6e37904aa4,145,12567,19,1572,0.0
12288,c40f21f5c6719f.jpg,melon_headed_whale,76865acc4539,5,7240,18,553,0.0
39368,56695ed94bccb6.jpg,bottlenose_dolphin,4b8534134eb8,131,4584,2,9473,0.0
...,...,...,...,...,...,...,...,...
20215,81ee24b3588335.jpg,fin_whale,03880e5855cc,14,227,10,1017,596.0
33427,a82a3c527fe2ba.jpg,bottlenose_dolphin,daa006a682c7,56,13308,2,9473,596.0
21327,65cba2606353f5.jpg,false_killer_whale,b75810d6e594,16,11199,9,3284,596.0
100,44c1c39444185a.jpg,fin_whale,2ab525564d41,2,2631,10,1017,596.0
