In [2]:
from pathlib import Path
import os
import json
import csv
from collections import Counter
import shutil
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from multiprocessing import Pool
from tqdm.notebook import tqdm
%config InlineBackend.figure_format = 'svg'

# Use pathlib
current_dir = Path.cwd()
target_dir = Path('/srv/healthcare/datascience/data/part-net/data_knife/')

print(f'Current Working Directory: {current_dir}')
if current_dir != target_dir:
    print('Changing directory')
    os.chdir(target_dir)
    print(f'New Working directory: {Path.cwd()}')


Current Working Directory: /home/toy-problem/notebooks
Changing directory
New Working directory: /srv/healthcare/datascience/data/part-net/data_knife


### The Table dataset contains some small "weird" samples, such as pool table, I would like to only keep the "regular_tables"

In [7]:
# only keep "regular_table"
import shutil
def check_occurrence(data, keyword):
    """
    Checks if a keyword occurs in the nested data structure.

    :param data: List of dictionaries representing JSON data.
    :param keyword: String to search for in the data.
    :return: Boolean indicating whether the keyword was found.
    """
    def search(items):
        for item in items:
            if keyword in item.values():
                return True
            if 'children' in item:
                if search(item['children']):
                    return True
        return False

    return search(data)

def remove(results):
     """
     Samples containing these keys in their results.json we want to remove. 
     """
     if check_occurrence(results, "fingerhole"):
          return True
    #  if check_occurrence(results, "picnic_table"):
    #       return True
    #  if check_occurrence(results, "game_table"):
    #       return True
    #  if not check_occurrence(results, "regular_table"):
    #       return True
    #  if check_occurrence(results, "other_leaf"):
    #       return True
     
     return False

cnt = 0
for subdirectory in target_dir.iterdir():
    with open(subdirectory / 'result.json', 'r') as file:
                results = json.load(file)
    # we only want to keep regular tables. 
    if remove(results): 
        cnt += 1
        print("removing", subdirectory)
        shutil.rmtree(subdirectory)         
print(f'{cnt} removed')

removing /srv/healthcare/datascience/data/part-net/data_knife/1056
removing /srv/healthcare/datascience/data/part-net/data_knife/2018
removing /srv/healthcare/datascience/data/part-net/data_knife/1051
removing /srv/healthcare/datascience/data/part-net/data_knife/2009
removing /srv/healthcare/datascience/data/part-net/data_knife/212
removing /srv/healthcare/datascience/data/part-net/data_knife/697
removing /srv/healthcare/datascience/data/part-net/data_knife/2015
removing /srv/healthcare/datascience/data/part-net/data_knife/224
removing /srv/healthcare/datascience/data/part-net/data_knife/114
removing /srv/healthcare/datascience/data/part-net/data_knife/397
removing /srv/healthcare/datascience/data/part-net/data_knife/567
removing /srv/healthcare/datascience/data/part-net/data_knife/421
removing /srv/healthcare/datascience/data/part-net/data_knife/924
removing /srv/healthcare/datascience/data/part-net/data_knife/1213
14 removed


### Extract the unique lables from the tables dataset. 

In [5]:
import json
from pathlib import Path

def extract_labels(data):
    """
    Extracts unique labels from the JSON data.

    :param data: List of dictionaries representing JSON data.
    :return: Set of unique labels.
    """
    labels = set()

    def extract(items):
        for item in items:
            if 'name' in item:
                labels.add(item['name'])
            if 'children' in item:
                extract(item['children'])

    extract(data)
    return labels

unique_labels = set()
for subdirectory in target_dir.iterdir():
    if not subdirectory.is_dir():
            continue  # Skip if it's not a directory
    with open(subdirectory / 'result.json', 'r') as file:
        results_after_merging_data = json.load(file)
    unique_labels.update(extract_labels(results_after_merging_data))
    
print(f'found {len(unique_labels)} unique labels')
print(list(unique_labels))

# these are the labels we want to keep for the TABLE dataset
# to_keep_table = ['board', 'leg', 'glass', 'tabletop_connector', 'bar', 'shelf', 'vertical_side_panel', 'bar_stretcher', 'bottom_panel', 'other_leaf', 'pedestal', 'central_support', 'back_panel', 'runner', 'drawer_front', 'circle', 'vertical_front_panel', 'vertical_divider_panel', 'foot', 'drawer_bottom', 'cabinet_door_surface', 'drawer_side']
to_keep_table = ['guard', 'bolster', 'butt', 'blade', 'handle_side', 'blade_side', 'handle']


filtered_unique_labels = [label for label in unique_labels if label in to_keep_table]
filtered_unique_labels.append("miscellaneous")
print(len(filtered_unique_labels))
print(filtered_unique_labels)


found 12 unique labels
['knife', 'bolster', 'cutting_instrument', 'butt', 'handle', 'handle_side', 'blade', 'guard', 'other_leaf', 'blade_side', 'dagger', 'other']
8
['bolster', 'butt', 'handle', 'handle_side', 'blade', 'guard', 'blade_side', 'miscellaneous']


### Now create unique integer label pairs

In [6]:
unique_integer_label_pairs = {integer: label for label, integer in enumerate(filtered_unique_labels)}
# save the dictionary in the data folder, this will be usefull later
csv_file_path = target_dir / Path('all_unique_integer_label_pairs.csv')
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Written Out Label', 'Integer Label'])
    for label, integer in unique_integer_label_pairs.items():
        writer.writerow([label, integer])
# pd.DataFrame([unique_integer_label_pairs]).to_csv("all_unique_integer_label_pairs.csv")
print(unique_integer_label_pairs)

{'bolster': 0, 'butt': 1, 'handle': 2, 'handle_side': 3, 'blade': 4, 'guard': 5, 'blade_side': 6, 'miscellaneous': 7}


### Now create a new labels file for each sample, based on unique_integer_label_pair

In [17]:
def flatten_data(data, flat_dict=None):
    """
    Flattens a nested JSON structure into a dictionary for easy lookup.

    :param data: List of dictionaries representing JSON data.
    :param flat_dict: A dictionary to store the flattened data.
    :return: A dictionary with IDs as keys and corresponding names as values.
    """
    if flat_dict is None:
        flat_dict = {}

    for item in data:
        id = item.get('id')
        if id is not None:
            flat_dict[id] = item.get('name', 'Name not found')
        if 'children' in item:
            flatten_data(item['children'], flat_dict)
    
    return flat_dict

for subdirectory in target_dir.iterdir():
    if not subdirectory.is_dir():
        continue  # Skip if it's not a directory
    # Efficiently load labels
    label_path = subdirectory / 'point_sample/label-10000.txt'
    labels = label_path.read_text().splitlines()
    labels = [int(label) for label in labels if label.isdigit()]
    
    # Load results and flatten them
    result_path = subdirectory / 'result.json'
    with result_path.open() as file:
        results = json.load(file)
    flattened_results = flatten_data(results)  # Assuming this is an optimized function

    new_labels = []
    for idx, label in enumerate(labels):
        target_label = flattened_results.get(label, None)
        if target_label in unique_integer_label_pairs:
            new_labels.append([idx, unique_integer_label_pairs[target_label], target_label])
        else:
            misc_label = unique_integer_label_pairs["miscellaneous"]
            new_labels.append([idx, misc_label, "miscellaneous"])
            cnt += 1

    # Write to CSV file
    csv_file_path = subdirectory / 'point_sample/unique_labels.csv'
    with csv_file_path.open('w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Point', 'Integer Label', 'Written Out Label'])
        writer.writerows(new_labels)

