# Data cleaning

In [1]:
import json

# Put the project root at the front of sys.path so modules under it can be imported.
try:
    from constants import ROOT
    import sys
    # sys.path entries should be strings; str() leaves an existing string unchanged
    # and converts a path object if that is what ROOT is.
    sys.path.insert(0, str(ROOT))
except ImportError:
    # Only a failed import is expected here. A bare `except:` would also swallow
    # KeyboardInterrupt, SystemExit and genuine bugs inside `constants`.
    print('Could not import ROOT. Check or add manually.')

In [2]:
from pathlib import Path
import shutil
from categorize import create_new_folders, copy_all_files, rename_all_files, LABELS, load_paths, check_for_missing_files, classify, _create_new_folders

%load_ext autoreload
%autoreload 2

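First we load the image and mask paths, check for missing files, and classify each image interactively; the resulting categorizations are saved to a JSON file: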

In [8]:
# Name of the label folder being processed; later cells use it to build paths under LABELS.
label_type = 'to_sort'
image_paths, mask_paths = load_paths(label_type)
# test_show_images(image_paths, mask_paths)
check_for_missing_files(image_paths, mask_paths)
# Interactive step: the recorded run showed images on the desktop and printed a
# "Generated ...to_sort.json" line. The return value is presumably the path of that
# JSON file (it is opened as a file in a later cell) — TODO confirm in classify.
user_qc_filepath = classify(image_paths, mask_paths, label_type)

Images are shown on desktop.
Press any button to go to the next (if needed click on an image window too; Q (capital) to break).
Generated /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/to_sort/20230106_163149_to_sort.json.


In [19]:
# Manually point at a specific QC file instead of the value returned by classify.
# NOTE(review): this hardcoded timestamp (160756) differs from the file the recorded
# classify run reported generating (163149) — confirm which file is intended.
user_qc_filepath = LABELS / label_type / '20230106_160756_to_sort.json'

In [11]:
# Read the saved QC labels back from disk and show them.
# In the recorded run the result maps image file names to single-character labels.
user_qc = json.loads(Path(user_qc_filepath).read_text())
print(user_qc)

{'samson_20211128_row62_col8.png': 'b', 'samson_20211128_row63_col8.png': 'b', 'samson_20211128_row60_col20.png': 'b', 'samson_20211128_row59_col82.png': 'r', 'samson_20211012_row20_col77.png': 'r', 'samson_20211128_row52_col0.png': 'b', 'samson_20211128_row63_col14.png': 'b', 'samson_20211128_row53_col0.png': 'b', 'samson_20211128_row85_col36.png': 'b', 'samson_20211128_row9_col81.png': 'r', 'samson_20211128_row18_col77.png': 'r', 'samson_20211128_row65_col13.png': 'b', 'samson_20211128_row56_col14.png': 'b', 'samson_20211012_row28_col52.png': 'r', 'samson_20211128_row65_col12.png': 'b', 'samson_20211128_row18_col76.png': 'r', 'samson_20211128_row9_col80.png': 'r', 'samson_20211128_row66_col26.png': 'b', 'samson_20211128_row53_col1.png': 'b', 'samson_20211128_row52_col1.png': 'b', 'samson_20211128_row63_col15.png': 'b', 'samson_20211128_row10_col85.png': 'r', 'samson_20211128_row59_col83.png': 'r', 'samson_20211012_row20_col76.png': 'r', 'samson_20211128_row60_col21.png': 'b', 'samson

Convert the single-letter labels to longer labels:

In [14]:
# Collect the distinct labels that actually occur in the QC file.
single_char_labels = {label for label in user_qc.values()}
single_char_labels

{'b', 'r'}

**TOUCHPOINT**: enter the mapping from each single-character label to its long name:

In [15]:
# Single-character QC label -> long label name (the long names are passed to
# create_new_folders below).
mapping = dict(b='commonpanel', r='rack')

In [16]:
# Validate the hand-entered mapping before applying it.
# Every label found in the QC file needs exactly one entry. Comparing the key sets
# (not just their sizes) reports a wrong or missing key here, with a readable
# message, instead of as a KeyError in the comprehension below.
assert set(mapping) == single_char_labels, (
    f'Mapping keys {set(mapping)} do not match the labels in the QC file {single_char_labels}'
)
# Two single-character labels must not map onto the same long name.
assert len(mapping) == len(set(mapping.values())), (
    f'Mapping values must be unique, got {list(mapping.values())}'
)
# Translate each file's single-character label into its long name.
new_user_qc = {name: mapping[single_char_label] for name, single_char_label in user_qc.items()}
print(f'Mapping is {mapping}')

Mapping is {'b': 'commonpanel', 'r': 'rack'}


Create new folders based on the labels in the QC file:

In [17]:
# Create one folder per long label name for this label_type. In the recorded run
# each label folder was created together with `imgs` and `masks` sub-folders.
create_new_folders(label_type, mapping.values())

Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/to_sort/commonpanel.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/to_sort/commonpanel/imgs.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/to_sort/commonpanel/masks.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/to_sort/rack.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/to_sort/rack/imgs.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/to_sort/rack/masks.


Copy all the images into a folder based on their label:

In [18]:
# Copy the files listed in new_user_qc (file name -> long label) using
# LABELS / label_type as the base folder.
# NOTE(review): the exact source/destination layout is decided inside copy_all_files — confirm there.
copy_all_files(new_user_qc, LABELS / label_type)

Copied 2 x 95 files.


Rename all files:

In [20]:
# Show the current mapping again before it drives the renaming step.
mapping

{'b': 'commonpanel', 'r': 'rack'}

In [21]:
# Run the rename step once per long label, on that label's folder.
# presumably replaces the 'samson' part of each file name with the label — verify in rename_all_files
for new_label in mapping.values():
    label_folder = LABELS / label_type / new_label
    rename_all_files(label_folder, 'samson', new_label)

Renamed 332 files.
Renamed 48 files.


# Combine to train

In [24]:
# Destination folder that the labelled data is combined into.
train_folder = LABELS / 'train'
# In the recorded run this created train/ together with its imgs/ and masks/ sub-folders.
_create_new_folders(train_folder)

Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/train.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/train/imgs.
Created /Users/nenad.bozinovic/PycharmProjects/solar_panel/data/all_labels/train/masks.


In [42]:
# Source folders whose imgs/ and masks/ contents are copied into train_folder.
folders_to_combine = [LABELS / 'commonpanel', LABELS / 'to_sort' / 'commonpanel']

For each folder in `folders_to_combine`, copy everything from `folder / 'imgs'` to `train_folder / 'imgs'`, and likewise for `masks`:

In [41]:
# Copy every file from each source folder's imgs/ and masks/ into the matching
# train sub-folder, keeping the original file names.
copied_from = {}  # destination path -> source path copied there during this run
overwritten = []  # (earlier source, later source) pairs that shared a destination name
for folder in folders_to_combine:
    for subfolder in ['imgs', 'masks']:
        src_folder = folder / subfolder
        dst_folder = train_folder / subfolder
        # sorted() only makes the copy order deterministic; names within one folder are unique.
        for src_filepath in sorted(src_folder.glob('*.*')):
            # The pattern also matches directories whose name contains a dot;
            # shutil.copy only handles regular files, so skip anything else.
            if not src_filepath.is_file():
                continue
            dst_filepath = dst_folder / src_filepath.name
            # Same file name coming from two source folders: the later copy replaces
            # the earlier one (as before), but it is now recorded and reported.
            if dst_filepath in copied_from:
                overwritten.append((copied_from[dst_filepath], src_filepath))
            shutil.copy(src_filepath, dst_filepath)
            copied_from[dst_filepath] = src_filepath

print(f'Copied {len(copied_from)} distinct files into {train_folder}.')
if overwritten:
    print(f'WARNING: {len(overwritten)} files shared a name across source folders; the later copy replaced the earlier one:')
    for earlier_src, later_src in overwritten:
        print(f'  {earlier_src} -> replaced by {later_src}')