In [16]:
import csv
import re
import os
import tqdm
import shutil

In [10]:
our_data_dir = 'our_data'
data_dir = 'data'

project_path = '/vol/main/initial-data-point-selection-backup/src/models/data'
datasets = ['matek', 'isic', 'retinopathy', 'jurkat', 'cifar10']
models_type_version = [('simclr', 'v1'), ('swav', 'v1'), ('dino', 'v2')]

In [11]:
# creating the root folder
os.makedirs(our_data_dir, exist_ok=True)

# copying features
for model_type, model_version in models_type_version:
    for dataset in datasets:
        # skips
        if (model_type, model_version) == ('dino', 'v2') and dataset == 'retinopathy':
            continue

        src_path = os.path.join(
            project_path,
            dataset,
            model_type,
            model_version,
            'features.csv'
        )

        dest_path = os.path.join(
            our_data_dir,
            f'{dataset}_{model_type}_{model_version}.csv'
        ) 

        shutil.copy(src_path, dest_path)

In [12]:
feature_names = os.listdir(our_data_dir)
feature_names

['matek_simclr_v1.csv',
 'isic_swav_v1.csv',
 'matek_swav_v1.csv',
 'cifar10_simclr_v1.csv',
 'jurkat_dino_v2.csv',
 'cifar10_swav_v1.csv',
 'matek_dino_v2.csv',
 'isic_dino_v2.csv',
 'retinopathy_swav_v1.csv',
 'cifar10_dino_v2.csv',
 'isic_simclr_v1.csv',
 'retinopathy_simclr_v1.csv',
 'jurkat_simclr_v1.csv',
 'jurkat_swav_v1.csv']

In [15]:
def process_our_features(features_name):
    features = []
    labels = []
    names = []

    with open(os.path.join(our_data_dir, features_name), 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row in csv_reader:
            features.append([float(el) for el in row['feature'][1:-1].split(', ')])
            labels.append(int(re.search(r'tensor\((\d+)\)', row['label']).group(1)))
            names.append(row['name'])

    # features
    with open(os.path.join(data_dir, f'our_features_{features_name}'), 'w') as csv_file:
        csv_writer = csv.writer(csv_file)

        csv_writer.writerow(range(len(features[0])))
        for feature in features:
            csv_writer.writerow(feature)

    # labels
    with open(os.path.join(data_dir, f'our_labels_{features_name}'), 'w') as csv_file:
        csv_writer = csv.writer(csv_file)

        csv_writer.writerow(['label'])
        for label in labels:
            csv_writer.writerow([label])

    # names
    with open(os.path.join(data_dir, f'our_names_{features_name}'), 'w') as csv_file:
        csv_writer = csv.writer(csv_file)

        csv_writer.writerow(['name'])
        for name in names:
            csv_writer.writerow([name])


In [17]:
for feature_name in tqdm.tqdm(feature_names):
    process_our_features(feature_name)

 14%|█▍        | 2/14 [00:32<03:16, 16.40s/it]


KeyboardInterrupt: 