In [1]:
### Create unary train/test/val splits of shapes to train AutoEncoders.
### For publication the random seed was 2022.

In [2]:
import pandas as pd
import os.path as osp
from changeit3d.utils.basics import make_train_test_val_splits

In [6]:
# --- Input / output locations ---------------------------------------------
out_dir = '../../data/shapetalk/misc/'
all_models_used_in_st = '../../data/shapetalk/misc/all_model_uids_used.csv'

# --- Split parameters -------------------------------------------------------
# Seed handed to the splitter; it is also embedded in the output file name.
random_seed = 2022
# Fractions handed to make_train_test_val_splits (they sum to 1).
loads = [0.85, 0.05, 0.1]

# Destination of the resulting split table (one file per seed).
out_file = osp.join(out_dir, f'unary_split_rs_{random_seed}.csv')

# Safety switch: nothing is written to disk unless this is set to True.
save_result = False

In [7]:
# Read the CSV listing the model uids (path set in the configuration cell).
df = pd.read_csv(filepath_or_buffer=all_models_used_in_st)

In [8]:
# A model_uid is "<object_class>/<dataset>/<model_name>"; split every uid once
# with the vectorised string accessor instead of three separate apply() passes.
uid_parts = df['model_uid'].str.split('/')

# Fail loudly (with a readable message) if any uid is missing or has fewer
# than three '/'-separated components; extra components are ignored.
assert uid_parts.str.len().ge(3).all(), \
    'every model_uid must look like <object_class>/<dataset>/<model_name>'

for position, column in enumerate(('object_class', 'dataset', 'model_name')):
    df[column] = uid_parts.str[position]

df.head()

Unnamed: 0,model_uid,object_class,dataset,model_name
0,airplane/ModelNet/airplane_0001,airplane,ModelNet,airplane_0001
1,airplane/ModelNet/airplane_0004,airplane,ModelNet,airplane_0004
2,airplane/ModelNet/airplane_0005,airplane,ModelNet,airplane_0005
3,airplane/ModelNet/airplane_0007,airplane,ModelNet,airplane_0007
4,airplane/ModelNet/airplane_0009,airplane,ModelNet,airplane_0009


In [62]:
# Apply the requested split percentages to EVERY object class independently.
all_splitted_dfs = []
# One pass over the frame via groupby; sort=False keeps the classes in order of
# first appearance, i.e. the same order df.object_class.unique() would give.
for object_class, class_rows in df.groupby('object_class', sort=False):
    # Fresh 0..n-1 index for this class, built without mutating `df`.
    sub_df = class_rows.reset_index(drop=True)
    sub_df = make_train_test_val_splits(sub_df, loads, random_seed, split_column='model_uid', verbose=False)
    all_splitted_dfs.append(sub_df)

    # Count the rows per split once. Bracket access is used because
    # attribute-style column access can be shadowed by DataFrame attributes.
    split_counts = sub_df['split'].value_counts()
    print(f"{object_class: <10}",
          "train: {:5d}  val: {:5d}  test: {:5d}".format(
              *(int(split_counts.get(name, 0)) for name in ("train", "val", "test"))))

airplane   train:  2313  val:   137  test:   272
bag        train:   118  val:     7  test:    14
bathtub    train:   564  val:    34  test:    66
bed        train:   634  val:    38  test:    75
bench      train:  1408  val:    83  test:   166
bookshelf  train:   693  val:    41  test:    82
bottle     train:   418  val:    25  test:    49
bowl       train:   197  val:    12  test:    23
cabinet    train:   209  val:    12  test:    25
cap        train:   176  val:    11  test:    21
chair      train:  5616  val:   331  test:   661
clock      train:   492  val:    30  test:    58
display    train:   997  val:    60  test:   117
dresser    train:  1436  val:    85  test:   169
faucet     train:   543  val:    33  test:    64
flowerpot  train:   529  val:    32  test:    62
guitar     train:   640  val:    38  test:    75
helmet     train:   118  val:     8  test:    14
knife      train:   360  val:    22  test:    42
lamp       train:  1965  val:   116  test:   231
mug        train:   

In [10]:
# Stack the per-class frames into one table with a fresh 0..n-1 index.
result = pd.concat(all_splitted_dfs, ignore_index=True)

# Sanity check: a (dataset, object_class, model_name) triple identifies one row.
uid_columns = ['dataset', 'object_class', 'model_name']
n_distinct_combos = len(result.groupby(uid_columns).size())
assert n_distinct_combos == len(result), 'combo must be unique'

In [11]:
# Peek at a few random rows; seeding the draw keeps the displayed rows
# identical across Restart & Run All.
result.sample(3, random_state=random_seed)

Unnamed: 0,model_uid,object_class,dataset,model_name,split
18796,flowerpot/ShapeNet/3de068a04b96e3a3bc5996932c1...,flowerpot,ShapeNet,3de068a04b96e3a3bc5996932c1238cd,test
26696,sofa/ShapeNet/e1fe7e3d2dbdfda9bb5bd941c6665c21,sofa,ShapeNet,e1fe7e3d2dbdfda9bb5bd941c6665c21,train
35954,vase/ShapeNet/4a53c4af240c562537048583edf8ef2c,vase,ShapeNet,4a53c4af240c562537048583edf8ef2c,train


In [12]:
# Total number of shapes, then the fraction of them that landed in each split.
print(len(result))
for split_name in result.split.unique():
    fraction = (result.split == split_name).mean()
    print(split_name, fraction)

36391
train 0.8495232337665907
test 0.09994229342419829
val 0.05053447280921107


In [66]:
# Write the split table to disk only when explicitly enabled in the config cell.
if save_result:
    result.to_csv(path_or_buf=out_file, index=False)