# Incremental Ernollment Trial (Closed set)
---------

trial에 등장하는 대상을 소수의 인원으로 제한시킨 trial을 만들고자 한다.

## Environment

In [1]:
%load_ext autoreload
%autoreload 2
%pylab
%matplotlib inline

import os
import sys
import pickle
import pandas as pd
from utils import key2df

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


## Dataframes & embeddings

In [2]:
keys = pickle.load(open("./xvector_embeds/sv_keys.pkl", "rb"))
keys = np.array(keys)
key_df = key2df(keys)
spk_uttr_stat = key_df.spk.value_counts()

## Held-out validation set

utterance가 150개 넘는 것들을 trial spks로 빼고 나머지는 validation spk로 빼었다.  
그리고 validation spk 음성을 이용해서 threshold를 정하기 위한 trial을 만든다.

In [16]:
trial_enr_spks = spk_uttr_stat[spk_uttr_stat > 150].index.tolist()
val_spks = spk_uttr_stat[spk_uttr_stat <= 150].index.tolist()
val_uttrs = key_df[key_df.spk.isin(val_spks)]
trial_uttrs = key_df[key_df.spk.isin(trial_enr_spks)]

# trial에 쓰이지 않는 dataset
val_uttrs.to_pickle("trials/enr306/validation_set.pkl")

# vox2의 모든 trial 중에 validation set에 남아있는 음성으로 이루어진 trial을 추출한다.
voxc2_trial = pd.read_pickle("trials/voxc2_1211_trials.pkl")
trial_for_validation = voxc2_trial[(voxc2_trial.enroll_spk.isin(val_spks)) & (voxc2_trial.test_spk.isin(val_spks))]
trial_for_validation.to_pickle("trials/enr306/trial_for_validation.pkl")

## Design trials for each enr_spk

우선 closed set에 포함될 스피커들을 뽑는다.

In [6]:
all_spks = trial_uttrs.spk.unique().tolist()
print(f"number of spks: {len(all_spks)}")

number of spks: 306


In [26]:
family_sizes = [3, 5, 7, 9]
n_enroll_utters = 3 # later we can use 1~3 enrollment cases
n_family = 10 

In [51]:
cases = {} # total cases: n_family * family_size

for family_size in family_sizes:
    closed_sets = np.random.choice(all_spks, size=(n_family, family_size), replace=True)
    for spk_set in closed_sets: 
        closed_set_uttrs = trial_uttrs[trial_uttrs.spk.isin(spk_set)]
        open_set_uttrs = trial_uttrs[~trial_uttrs.spk.isin(spk_set)]
        open_set_uttrs.loc[:, 'label'] = 0
        for enr_spk in spk_set: # TODO: multiple enrolled speakers case
            enr_uttrs = closed_set_uttrs[closed_set_uttrs.spk == enr_spk].sample(n=n_enroll_utters)
            target_uttrs =  closed_set_uttrs[closed_set_uttrs.spk == enr_spk].drop(index=enr_uttrs.index)
            nonTarget_uttrs = closed_set_uttrs[closed_set_uttrs.spk != enr_spk]
            target_uttrs.loc[:, 'label'] = 1
            nonTarget_uttrs.loc[:, 'label'] = 0

            # adapt trials
            n_adapt_trial = int(min(len(target_uttrs), len(nonTarget_uttrs))*0.8)
            adapt_target_uttrs = target_uttrs.sample(n=n_adapt_trial)
            adapt_nonTarget_uttrs = nonTarget_uttrs.sample(n=n_adapt_trial)
            
            # test trials
            n_test_trial = min(len(target_uttrs), len(nonTarget_uttrs)) - n_adapt_trial 
            test_target_uttrs = target_uttrs.drop(index=adapt_target_uttrs.index).sample(n=n_test_trial)
            test_nonTarget_uttrs = nonTarget_uttrs.drop(index=adapt_nonTarget_uttrs.index).sample(n=n_test_trial)
            
            # ood trials
            ood_trial= open_set_uttrs.groupby('spk', group_keys=False).apply(lambda x: x.sample(n=1))[:len(test_trial)]
            
            # shuffle trials and it will be fixed for consistency
            adapt_trial = pd.concat([adapt_target_uttrs, adapt_nonTarget_uttrs]).sample(frac=1)
            test_trial = pd.concat([test_target_uttrs, test_nonTarget_uttrs]).sample(frac=1)
            
            if family_size not in cases:
                cases[family_size] = []
            cases[family_size] += [[enr_spk, 
                       enr_uttrs.index.tolist(), 
                       (adapt_trial.index.tolist(), adapt_trial.label.tolist()),
                       (test_trial.index.tolist(),test_trial.label.tolist()),
                       (ood_trial.index.tolist(), ood_trial.label.tolist())
                     ]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


## Save the trials

In [52]:
for family_size in family_sizes:
    save_dir = "./trials/enr306/enr306_closedset/FS_{}/".format(family_size)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    pickle.dump(cases[family_size], open(save_dir + "/trials.pkl", "wb"))    
    trial_info = {'set_size':family_size, 'n_enrs':n_enroll_utters}
    pickle.dump(trial_info, open(save_dir + "/trial_info.pkl", "wb"))

In [36]:
key_df.loc['id10003-5ablueV_1tw-00001']

spk            id10003
session    5ablueV_1tw
label                2
origin           voxc2
Name: id10003-5ablueV_1tw-00001, dtype: object