In [1]:
import pandas as pd
import numpy as np

In [2]:
def stratified_sample(y, p=0.67, replace=False, seed=1234):
    """Draw a class-stratified random sample of positions from ``y``.

    For every distinct label in ``y``, ``floor(p * count)`` positions (but
    never fewer than one) are drawn with ``np.random.choice`` from the
    positions holding that label.

    Parameters
    ----------
    y : sequence or array of labels
        One label per item; converted with ``np.asarray``.
    p : float
        Fraction of each class to sample.
    replace : bool
        Passed through to ``np.random.choice``.
    seed : int or None
        When not ``None``, the global NumPy RNG is re-seeded with this value
        before sampling (this mutates ``np.random``'s global state).

    Returns
    -------
    numpy.ndarray
        Sampled positions into ``y``, concatenated class by class in the
        sorted label order returned by ``np.unique``. Empty when ``y`` is
        empty.
    """
    y = np.asarray(y)
    unique_y, counts = np.unique(y, return_counts=True)
    if unique_y.size == 0:
        # np.concatenate([]) would raise; an empty input has an empty sample.
        return np.array([], dtype=np.intp)

    # np.math was removed in NumPy 2.0, so floor with np.floor instead.
    # Every class contributes at least one item, even when p * count < 1.
    n_per_class = [max(int(np.floor(p * c)), 1) for c in counts]

    if seed is not None:
        np.random.seed(seed)

    inds = [
        np.random.choice(np.where(y == label)[0], size=npc, replace=replace)
        for label, npc in zip(unique_y, n_per_class)
    ]

    return np.concatenate(inds)

In [3]:
def get_split_label(ind, val_inds, main_inds):
    """Return the split name for ``ind``.

    ``'valid'`` when ``ind`` is contained in ``val_inds``; otherwise
    ``'train'`` when it is contained in ``main_inds``; otherwise ``'test'``.
    Validation membership is checked first, so it wins when ``ind`` is in
    both containers.
    """
    if ind in val_inds:
        return 'valid'
    return 'train' if ind in main_inds else 'test'

In [11]:
# Base directory that the input and output CSV paths are built from.
teamdrive_root = '../../../../teamdrive/transmediasp/kate/'

# Label-noise levels, in percent of the non-test rows (0 means no noise).
label_noises = [
    0,
    10,
    20,
    30,
    50,
    80,
]

# One RNG seed per run for the train/valid/test split ...
split_seeds = [
    1234, 6512, 3845, 4321, 5888,
    7356, 1834, 4628, 9375, 8372,
]
# ... and one per run for the label-noise injection.
noise_seeds = [
    6547, 9706, 2645, 1745, 3647,
    3574, 2019, 4297, 4973, 1847,
]

# Fraction of every class sampled into the non-test (train + valid) pool.
vol = 0.55

In [5]:
# Load the existing data frame (first CSV column becomes the index) and drop
# the leftover 'Unnamed: 0.1' column.
splits_csv = teamdrive_root + 'icons_experiment/label_noise_data_frame_with_splits.csv'
df = pd.read_csv(splits_csv, index_col=0)
df = df.drop(columns=['Unnamed: 0.1'])

df.head()

Unnamed: 0,image_path,class,label,split,10_0,10_1,10_2,10_3,10_4,20_0,...,split_90_0,split_90_1,split_90_2,split_90_3,split_90_4,split_00_0,split_00_1,split_00_2,split_00_3,split_00_4
0,../../../data/testdotai/close/_e4530e1aae88750...,close,0,main,0,0,0,36,0,90,...,main,main,main,main,main,main,test,main,main,main
1,../../../data/testdotai/close/~02a0c54fd8374b4...,close,0,main,0,0,0,0,0,48,...,main,main,main,test,main,test,test,main,main,main
2,../../../data/testdotai/close/_10099f88fd8333f...,close,0,main,0,0,0,0,0,0,...,main,main,main,main,test,main,main,main,main,main
3,../../../data/testdotai/close/_bcd740021f1a62a...,close,0,main,0,0,0,0,0,0,...,main,main,main,main,main,main,main,main,main,main
4,../../../data/testdotai/close/_047b3f69c7c53b8...,close,0,main,0,0,0,0,0,0,...,main,main,main,test,main,main,main,main,main,test


In [6]:
# Keep only the image path, class name and label of the rows whose 'split'
# value is not 'fine', renumbering the rows from 0.
not_fine = df['split'] != 'fine'
clean = df.loc[not_fine, ['image_path', 'class', 'label']].reset_index(drop=True)
clean.head()

Unnamed: 0,image_path,class,label
0,../../../data/testdotai/close/_e4530e1aae88750...,close,0
1,../../../data/testdotai/close/~02a0c54fd8374b4...,close,0
2,../../../data/testdotai/close/_10099f88fd8333f...,close,0
3,../../../data/testdotai/close/_bcd740021f1a62a...,close,0
4,../../../data/testdotai/close/_047b3f69c7c53b8...,close,0


In [8]:
# Split into train, validation and test, once per seed in split_seeds.
#
# For each run a stratified `vol` fraction of every class forms the non-test
# pool; a stratified .05/vol fraction of that pool (i.e. about 5% of each
# class overall) becomes validation, the rest of the pool is train, and all
# remaining rows are test. The result is stored in column 'split_<run>'.

y = clean['label'].tolist()
ind_list = np.arange(len(clean))
vol = .55

for run, seed in enumerate(split_seeds):
    print('running round %s' %(run))
    main_inds = stratified_sample(y, vol, seed=seed)
    y_main = [y[i] for i in main_inds]
    # Positions returned here index into main_inds / y_main, not into clean.
    inds_val = stratified_sample(y_main, .05/vol, seed=seed)
    validation_inds = [main_inds[i] for i in inds_val]

    # Membership is tested once per row; sets make each test O(1) instead of
    # scanning a list / array for every row of the frame.
    validation_set = set(validation_inds)
    main_set = set(main_inds.tolist())
    split = [get_split_label(i, validation_set, main_set) for i in ind_list]
    clean['split_%s' %(run)] = split
clean.head(20)

running round 0
running round 1
running round 2
running round 3
running round 4
running round 5
running round 6
running round 7
running round 8
running round 9


Unnamed: 0,image_path,class,label,split_0,split_1,split_2,split_3,split_4,split_5,split_6,split_7,split_8,split_9
0,../../../data/testdotai/close/_e4530e1aae88750...,close,0,train,train,test,test,train,valid,test,test,train,train
1,../../../data/testdotai/close/~02a0c54fd8374b4...,close,0,test,train,test,test,train,test,train,train,train,train
2,../../../data/testdotai/close/_10099f88fd8333f...,close,0,train,train,test,test,train,train,test,test,train,train
3,../../../data/testdotai/close/_bcd740021f1a62a...,close,0,test,test,train,test,train,test,test,test,test,train
4,../../../data/testdotai/close/_047b3f69c7c53b8...,close,0,train,test,test,train,train,train,train,test,test,test
5,../../../data/testdotai/close/_530464eb7c56a08...,close,0,test,train,test,train,test,test,test,train,train,train
6,../../../data/testdotai/close/_b9fca4f2b106c1f...,close,0,train,train,test,test,train,test,test,train,test,test
7,../../../data/testdotai/close/~9414624116f8235...,close,0,test,train,test,train,train,test,test,test,train,train
8,../../../data/testdotai/close/~b8a44d79b1165ed...,close,0,test,train,train,test,test,train,test,train,test,test
9,../../../data/testdotai/close/_def90b0d68090cb...,close,0,test,train,test,valid,test,train,test,test,test,train


In [13]:
# Number of distinct values in the 'label' column.
n_classes = clean['label'].unique().size

In [15]:
# Inject label noise into the non-test rows of every run.
#
# For run r and noise level n (percent) the column '<r>_<n>' holds the labels
# with n% of the rows whose 'split_<r>' is not 'test' replaced by a label drawn
# uniformly from range(n_classes). The RNG is re-seeded with the run's seed
# before every noise level.
# NOTE(review): the drawn label may equal the original one, and this assumes
# labels are coded as 0 .. n_classes-1 -- confirm both are intended.

# Hoisted out of the loops: the clean labels never change.
clean_labels = clean['label'].tolist()

for run, seed in enumerate(noise_seeds):
    main_inds = np.array(clean[clean['split_%s' %run] != 'test'].index)
    for label_noise in label_noises:
        if label_noise == 0:
            clean['%s_0' %(run)] = clean['label']
        else:
            np.random.seed(seed)
            size = int((label_noise/100) * len(main_inds))
            noise_idx = np.random.choice(len(main_inds), size, replace=False)
            noise_inds = main_inds[noise_idx]
            print('injecting noise ...%s, run %s' %(len(noise_idx), run))
            # A set gives O(1) membership tests; rows are still visited in
            # index order so the random draws happen in the same sequence.
            noisy_rows = set(noise_inds.tolist())
            clean['%s_%s' %(run, label_noise)] = [
                np.random.randint(n_classes) if x in noisy_rows else label
                for x, label in zip(clean.index, clean_labels)
            ]
clean.head(20)


injecting noise ...7398, run 0
injecting noise ...14796, run 0
injecting noise ...22194, run 0
injecting noise ...36991, run 0
injecting noise ...59185, run 0
injecting noise ...7398, run 1
injecting noise ...14796, run 1
injecting noise ...22194, run 1
injecting noise ...36991, run 1
injecting noise ...59185, run 1
injecting noise ...7398, run 2
injecting noise ...14796, run 2
injecting noise ...22194, run 2
injecting noise ...36991, run 2
injecting noise ...59185, run 2
injecting noise ...7398, run 3
injecting noise ...14796, run 3
injecting noise ...22194, run 3
injecting noise ...36991, run 3
injecting noise ...59185, run 3
injecting noise ...7398, run 4
injecting noise ...14796, run 4
injecting noise ...22194, run 4
injecting noise ...36991, run 4
injecting noise ...59185, run 4
injecting noise ...7398, run 5
injecting noise ...14796, run 5
injecting noise ...22194, run 5
injecting noise ...36991, run 5
injecting noise ...59185, run 5
injecting noise ...7398, run 6
injecting noise

Unnamed: 0,image_path,class,label,split_0,split_1,split_2,split_3,split_4,split_5,split_6,...,8_20,8_30,8_50,8_80,9_0,9_10,9_20,9_30,9_50,9_80
0,../../../data/testdotai/close/_e4530e1aae88750...,close,0,train,train,test,test,train,valid,test,...,17,17,17,17,0,0,0,0,88,88
1,../../../data/testdotai/close/~02a0c54fd8374b4...,close,0,test,train,test,test,train,test,train,...,0,0,0,42,0,0,0,88,38,38
2,../../../data/testdotai/close/_10099f88fd8333f...,close,0,train,train,test,test,train,train,test,...,0,0,42,13,0,88,88,38,8,8
3,../../../data/testdotai/close/_bcd740021f1a62a...,close,0,test,test,train,test,train,test,test,...,0,0,0,0,0,0,0,8,74,74
4,../../../data/testdotai/close/_047b3f69c7c53b8...,close,0,train,test,test,train,train,train,train,...,0,0,0,0,0,0,0,0,0,0
5,../../../data/testdotai/close/_530464eb7c56a08...,close,0,test,train,test,train,test,test,test,...,42,42,13,92,0,0,0,0,0,28
6,../../../data/testdotai/close/_b9fca4f2b106c1f...,close,0,train,train,test,test,train,test,test,...,0,0,0,0,0,0,0,0,0,0
7,../../../data/testdotai/close/~9414624116f8235...,close,0,test,train,test,train,train,test,test,...,13,13,92,75,0,0,0,0,0,0
8,../../../data/testdotai/close/~b8a44d79b1165ed...,close,0,test,train,train,test,test,train,test,...,0,0,0,0,0,0,0,0,0,0
9,../../../data/testdotai/close/_def90b0d68090cb...,close,0,test,train,test,valid,test,train,test,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Write the frame with all split and label-noise columns back to the team drive.
output_csv = teamdrive_root + 'icons_experiment/data_frame_50_percent_splits_with_label_noise.csv'
clean.to_csv(output_csv)