# Data cleaning

In [1]:
import json

# Make the project root importable so that project modules (e.g. `categorize`)
# resolve regardless of where the notebook kernel was started.
import sys

try:
    from constants import ROOT
except ImportError:
    # Only a missing/broken `constants` import is tolerated here; any other
    # error (including KeyboardInterrupt) should surface instead of being hidden.
    print('Could not import ROOT. Check or add manually.')
else:
    # sys.path entries must be strings; a pathlib.Path would be ignored by the
    # import machinery. The membership check keeps the cell idempotent on re-run.
    _root = str(ROOT)
    if _root not in sys.path:
        sys.path.insert(0, _root)

In [2]:
from pathlib import Path
import shutil
from categorize import create_new_folders, copy_all_files, rename_all_files, LABELS, load_paths, check_for_missing_files, classify, _create_new_folders

%load_ext autoreload
%autoreload 2

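First, we load the image and mask paths for the chosen label type and check for missing files (the JSON file with categorizations is generated and loaded in the following cells):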

In [4]:
# Label family processed by this notebook; it is reused below to build
# folder paths under LABELS.
label_type = 'rack'
# load_paths returns two collections: image paths and mask paths.
image_paths, mask_paths = load_paths(label_type)
# Optional visual sanity check, left disabled (note: `test_show_images` is not
# among the names imported from `categorize` above):
# test_show_images(image_paths, mask_paths)
# Presumably verifies that every image has a matching mask — confirm in `categorize`.
check_for_missing_files(image_paths, mask_paths)

In [3]:
# Interactive step: per the printed instructions, images are shown on the
# desktop and labelled with key presses. The returned value is the path of the
# JSON file that was generated for this session.
user_qc_filepath = classify(image_paths, mask_paths, label_type)

Images are shown on desktop.
Press any button to go to the next (if needed click on an image window too; Q (capital) to break).
Generated /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/rack/20230109_111626_rack.json.


In [5]:
# Display the path of the QC file produced by the classification step.
user_qc_filepath

PosixPath('/Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/rack/20230109_111626_rack.json')

In [5]:
# Point at a QC file from an earlier classification session so the interactive
# step does not have to be repeated. NOTE(review): this overrides whatever
# `classify` returned above, and the timestamped file name is hard-coded.
user_qc_filepath = LABELS / label_type / '20230109_111626_rack.json'

In [6]:
# Load the QC results: a dict mapping image file name -> single-character label.
# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(user_qc_filepath, 'r', encoding='utf-8') as f:
    user_qc = json.load(f)
print(user_qc)

{'rack_20211128_row9_col78.png': 'a', 'rack_20211128_row9_col83.png': 'a', 'rack_20211128_row9_col82.png': 'a', 'samson_20211128_row9_col79.png': 'a', 'samson_20211128_row10_col83.png': 'a', 'samson_20211128_row59_col85.png': 'a', 'rack_20211128_row9_col80.png': 'a', 'rack_20211128_row9_col81.png': 'a', 'samson_20211012_row26_col48.png': 'a', 'samson_20211128_row10_col80.png': 'b', 'samson_20211128_row59_col79.png': 'a', 'samson_20211128_row14_col48.png': 'a', 'samson_20211012_row30_col50.png': 'a', 'samson_20211012_row21_col77.png': 'a', 'samson_20211128_row11_col84.png': 'a', 'samson_20211012_row22_col80.png': 'a', 'samson_20211128_row22_col82.png': 'a', 'samson_20211128_row22_col83.png': 'a', 'samson_20211012_row22_col81.png': 'a', 'samson_20211128_row11_col85.png': 'a', 'samson_20211012_row30_col51.png': 'a', 'samson_20211012_row21_col76.png': 'a', 'samson_20211012_row9_col76.png': 'a', 'samson_20211128_row17_col80.png': 'a', 'samson_20211128_row21_col76.png': 'a', 'samson_20211012

Convert the single-letter labels to longer labels:

In [7]:
# Distinct single-character labels that actually occur in the QC file;
# the mapping entered in the next step has to cover each of them.
single_char_labels = {label for label in user_qc.values()}
single_char_labels

{'a', 'b'}

**TOUCHPOINT**: enter the new mapping from single-character labels to long names:

In [8]:
# Keys: single-character labels found in the QC file.
# Values: long label names; these are passed to create_new_folders below.
mapping = {'a': 'goodrack', 'b': 'badrack'}

In [9]:
# Validate the hand-entered mapping before applying it.
# Comparing key sets (not just lengths) guarantees that every label in the QC
# file has a long name, so the lookup below cannot fail with a bare KeyError.
missing_labels = single_char_labels - set(mapping)
unused_keys = set(mapping) - single_char_labels
assert not missing_labels, f'No long name given for labels: {missing_labels}'
assert not unused_keys, f'Mapping has keys that do not occur in the QC file: {unused_keys}'
# Long names must be unique, otherwise two labels would share one folder.
assert len(mapping) == len(set(mapping.values())), 'Two labels map to the same long name'
# Same keys as `user_qc` (file names), with long label names as values.
new_user_qc = {name: mapping[single_char_label] for name, single_char_label in user_qc.items()}
print(f'Mapping is {mapping}')

Mapping is {'a': 'goodrack', 'b': 'badrack'}


Create new folders based on the labels in the QC file:

In [10]:
# Create one folder per long label; per the printed output each gets
# `imgs` and `masks` subfolders under LABELS / label_type.
create_new_folders(label_type, mapping.values())

Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/rack/goodrack.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/rack/goodrack/imgs.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/rack/goodrack/masks.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/rack/badrack.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/rack/badrack/imgs.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/rack/badrack/masks.


Copy all the images into a folder based on their label:

In [11]:
# Distribute the files according to their long label (file name -> label in
# `new_user_qc`). Presumably copies each image together with its mask into the
# label folders created above — confirm in `categorize.copy_all_files`.
copy_all_files(new_user_qc, LABELS / label_type)

Copied 2 x 63 files.


Rename all files:

In [16]:
# Add an extra key 'rack' that also maps to 'goodrack'.
# NOTE(review): this mutates `mapping` in place, so re-running the earlier
# validation cell afterwards fails (three keys, two distinct values). The
# rename loop below only reads `mapping.values()`, whose set of distinct
# values is unchanged by this update — confirm whether this cell is still needed.
mapping.update({'rack': 'goodrack'})

In [17]:
# Display the mapping after the update above.
mapping

{'a': 'goodrack', 'b': 'badrack', 'rack': 'goodrack'}

In [21]:
# Distinct long labels, i.e. the label folders the rename loop will visit.
set(mapping.values())

{'badrack', 'goodrack'}

In [22]:
# Run the rename step once per long-label folder.
# `sorted` makes the visiting order (and therefore the printed output)
# deterministic; iterating a bare set of strings can change order between
# kernel sessions.
# NOTE(review): presumably replaces 'samson' in file names with the folder's
# label — confirm in `categorize.rename_all_files`.
for new_label in sorted(set(mapping.values())):
    rename_all_files(LABELS / label_type / new_label, 'samson', new_label)

Renamed 26 files.
Renamed 228 files.


# Combine to train

In [24]:
# Destination for the combined training set.
train_folder = LABELS / 'train'
# Per the printed output this creates the folder plus its `imgs` and `masks`
# subfolders. Note that `_create_new_folders` is a private helper of `categorize`.
_create_new_folders(train_folder)

Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/train.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/train/imgs.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/train/masks.


In [42]:
# Source folders to merge into `train_folder`; the copy loop below reads the
# `imgs` and `masks` subfolders of each one.
folders_to_combine = [LABELS / 'commonpanel', LABELS / 'to_sort' / 'commonpanel']

For each folder in `folders_to_combine`, copy everything from `folder / 'imgs'` into `train_folder / 'imgs'`, and likewise for `masks`:

In [41]:
# Merge the `imgs` and `masks` contents of every source folder into the
# matching subfolder of `train_folder`.
# (subfolder, file name) -> source folder that supplied the file in this run;
# used to report files from different sources that overwrite each other.
copied_from = {}
for folder in folders_to_combine:
    for subfolder in ['imgs', 'masks']:
        src_folder = folder / subfolder
        dst_folder = train_folder / subfolder
        if not src_folder.is_dir():
            # A wrong entry in `folders_to_combine` would otherwise copy nothing, silently.
            print(f'Skipping {src_folder}: not a directory.')
            continue
        for src_filepath in sorted(src_folder.glob('*.*')):
            if not src_filepath.is_file():
                # '*.*' also matches directories with a dot in their name,
                # which shutil.copy cannot handle.
                continue
            key = (subfolder, src_filepath.name)
            if key in copied_from:
                # Same behaviour as before (the later folder wins), but no longer silent.
                print(f'Warning: {src_filepath} overwrites the copy from {copied_from[key]}.')
            copied_from[key] = folder
            shutil.copy(src_filepath, dst_folder / src_filepath.name)
print(f'Copied {len(copied_from)} distinct files into {train_folder}.')