In [1]:
import json
import numpy as np
import pandas as pd
import os
import glob
# Used to randomly shuffle the data
import random
# Used to copy/move from data source to data destination
import shutil
# Used to get the image data
import base64

# Data Visualization
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/... paths used below resolve
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Confirm files in Test folder
- Confirm there are no JPG files in the folder without an associated JSON file; if there are, delete them
- Confirm there are no JSON files in the folder without an associated JPG file; if there are, copy them over from the training folder and delete the JSON file on the train folder side

In [3]:
# Folder that the cells below scan for *.jpg and *.json files (and from which orphaned jpg files are deleted)
test_dir = '/content/drive/MyDrive/test/exports/test'

In [4]:
# Collect the bare file names (directory part stripped) of every *.jpg directly inside test_dir
jpg_pattern = os.path.join(test_dir, '*.jpg')
jpg_files = list(map(os.path.basename, glob.glob(jpg_pattern)))

In [5]:
# Collect the bare file names (directory part stripped) of every *.json directly inside test_dir
json_pattern = os.path.join(test_dir, '*.json')
json_files = list(map(os.path.basename, glob.glob(json_pattern)))

In [6]:
# Names of jpg files that have no matching json file; filled by the loop over jpg_files below
# and later used to delete those files from test_dir
jpg_files_to_remove = []

In [7]:
# Names of json files that have no matching jpg file; filled by the loop over json_files below
# and only reported (nothing in this section deletes or copies them)
json_files_without_jpg = []

In [8]:
# Find every jpg file that has no json file with the same base name.
#
# The json names are put in a set once so that each membership test is O(1)
# instead of a linear scan of the json_files list for every jpg file.
json_name_set = set(json_files)

# The list is rebuilt from scratch (not appended to) so that re-running this
# cell cannot add the same file name twice; a duplicate would make the later
# os.remove() call fail on its second attempt. Order follows jpg_files.
jpg_files_to_remove = [
    jpg_file
    for jpg_file in jpg_files
    if os.path.splitext(jpg_file)[0] + '.json' not in json_name_set
]

In [9]:
# Find every json file that has no jpg file with the same base name.
#
# The jpg names are put in a set once so that each membership test is O(1)
# instead of a linear scan of the jpg_files list for every json file.
jpg_name_set = set(jpg_files)

# The list is rebuilt from scratch (not appended to) so that re-running this
# cell cannot report the same json file more than once. Order follows json_files.
json_files_without_jpg = [
    json_file
    for json_file in json_files
    if os.path.splitext(json_file)[0] + '.jpg' not in jpg_name_set
]

In [10]:
# Delete every orphaned jpg (one without a matching json) from the test directory
for jpg_file in jpg_files_to_remove:
    # Get the full path of the jpg file
    jpg_path = os.path.join(test_dir, jpg_file)
    try:
        os.remove(jpg_path)
    except FileNotFoundError:
        # Already gone (this cell was re-run, or the name was listed twice):
        # the file is absent, which is the desired end state, so keep going
        # rather than aborting the remaining deletions.
        pass

In [11]:
# Report the deleted jpg files: a header line followed by one file name per line
removed_report = ['The following jpg files were removed:', *jpg_files_to_remove]
print('\n'.join(removed_report))

The following jpg files were removed:


In [12]:
# Report the json files lacking a jpg: a header line followed by one file name per line
orphan_json_report = ['The following json files do not have corresponding jpg files:', *json_files_without_jpg]
print('\n'.join(orphan_json_report))

The following json files do not have corresponding jpg files:
6ff1187d58522da123a25c0e8354f29f.json
563f37df673fa80cfc369fbcf4a9869a.json
39598903e7ffcd5d5ce5354c3a4d2764.json


### Compare the number of classes in the Train and Test folders; it should be the same

In [13]:
# Load the train annotation file into `data`.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('/content/drive/MyDrive/test/exports/train.json', encoding='utf-8') as f:
    data = json.load(f)

In [14]:
# Show the top-level keys of the train JSON
parent_nodes = [*data.keys()]
print(parent_nodes)

['images', 'categories', 'annotations']


In [15]:
# Container for one DataFrame per top-level key of the train JSON (`data`); populated by the next cell
dataframes = {}

In [16]:
# Turn each top-level entry of the train JSON into a DataFrame stored under the same key
for node_name, node_value in data.items():
    # A list is used directly as the DataFrame's records; anything else
    # (dict or scalar) is wrapped in a one-element list to give a single-row frame
    records = node_value if isinstance(node_value, list) else [node_value]
    dataframes[node_name] = pd.DataFrame(records)

In [17]:
# Pull the three train DataFrames out of `dataframes` by their top-level JSON key
df_cat, df_ann, df_images = (
    dataframes[node] for node in ('categories', 'annotations', 'images')
)

In [18]:
# Review the train categories to determine which ones to use for custom training of the model
df_cat

Unnamed: 0,supercategory,id,name
0,"top, t-shirt, sweatshirt",1,"top, t-shirt, sweatshirt"
1,"shirt, blouse",2,"shirt, blouse"
2,sweater,3,sweater
3,cardigan,4,cardigan
4,jacket,5,jacket
5,vest,6,vest
6,pants,7,pants
7,shorts,8,shorts
8,skirt,9,skirt
9,coat,10,coat


In [19]:
# Load the test annotation file into `testdata`.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('/content/drive/MyDrive/test/exports/test.json', encoding='utf-8') as f:
    testdata = json.load(f)

In [20]:
# Show the top-level keys of the test JSON
testparent_nodes = [*testdata.keys()]
print(testparent_nodes)

['images', 'categories', 'annotations']


In [21]:
# Fresh container for one DataFrame per top-level key of the test JSON (`testdata`).
# NOTE: this rebinds `dataframes`, so the train frames are no longer reachable through it;
# df_cat / df_ann / df_images were already extracted above and are unaffected.
dataframes = {}

In [22]:
# Turn each top-level entry of the test JSON into a DataFrame stored under the same key
for node_name, node_value in testdata.items():
    # A list is used directly as the DataFrame's records; anything else
    # (dict or scalar) is wrapped in a one-element list to give a single-row frame
    records = node_value if isinstance(node_value, list) else [node_value]
    dataframes[node_name] = pd.DataFrame(records)

In [23]:
# Pull the three test DataFrames out of `dataframes` by their top-level JSON key
testdf_cat, testdf_ann, testdf_images = (
    dataframes[node] for node in ('categories', 'annotations', 'images')
)

In [24]:
# Review the test categories to determine which ones to use for custom training of the model
# NOTE(review): in the saved outputs both tables have 10 rows, but the names and ids differ
# (train lists cardigan/shorts, test lists dress/jumpsuit; 'coat' is id 10 in train and 3 in test).
# Confirm whether the class sets and id mapping are meant to match, not only the count.
testdf_cat

Unnamed: 0,supercategory,id,name
0,"shirt, blouse",1,"shirt, blouse"
1,"top, t-shirt, sweatshirt",2,"top, t-shirt, sweatshirt"
2,coat,3,coat
3,dress,4,dress
4,jumpsuit,5,jumpsuit
5,sweater,6,sweater
6,jacket,7,jacket
7,vest,8,vest
8,pants,9,pants
9,skirt,10,skirt
