In [1]:
import os
import glob
import re
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

# Pre-compiled pattern capturing every run of decimal digits; because the
# digits are in a capture group, re.split keeps them in its result.
numbers = re.compile(r'(\d+)')


def numericalSort(value):
    """Return a sort key that orders embedded numbers by numeric value.

    The string is split on digit runs, so the resulting list alternates
    between text (even positions) and digit runs (odd positions). The digit
    runs are converted to ``int`` so that, for example, ``'f2'`` sorts
    before ``'f10'``.

    Args:
        value: The string (e.g. a file path) to build a key for.

    Returns:
        A list alternating ``str`` and ``int`` pieces of ``value``.
    """
    return [
        int(piece) if position % 2 else piece
        for position, piece in enumerate(numbers.split(value))
    ]

# Root directory that is searched recursively for PNG images
directory_path = r'D:\CV_Project\datasets'

# A prefix group 'i_j' is kept only when exactly this many files share it
EXPECTED_FILES_PER_PREFIX = 12

# Find all PNG files in the directory and its subdirectories.
# Files are sorted numerically within each directory; the order in which
# directories are visited is whatever os.walk yields.
all_pngs = []
for root, _dirs, _files in os.walk(directory_path):
    # glob.escape stops characters such as '[', ']', '*' or '?' in a directory
    # name from being interpreted as wildcards.
    pattern = os.path.join(glob.escape(root), '*.png')
    all_pngs.extend(sorted(glob.glob(pattern), key=numericalSort))

# Extract image names from paths
image_names = [os.path.basename(image_path) for image_path in all_pngs]

# Prefix pattern 'i_j' of every file name: the first two '_'-separated fields
prefixes = [tuple(name.split('_', 2)[:2]) for name in image_names]

# Count occurrences of each prefix pattern.
# NOTE(review): counts are pooled over base names only, so identical 'i_j'
# prefixes located in different sub-directories are counted together --
# confirm that prefixes are unique across the whole dataset.
prefix_counts = defaultdict(int)
for prefix in prefixes:
    prefix_counts[prefix] += 1

# A file belongs to a complete group when its prefix has the expected count
is_complete = [prefix_counts[prefix] == EXPECTED_FILES_PER_PREFIX for prefix in prefixes]

# Names whose prefix group is incomplete (kept for inspection)
corrupted_strings = [name for name, ok in zip(image_names, is_complete) if not ok]

# Replace corrupted strings with None in the image_names list.
# The decision is taken from the prefix count directly instead of searching
# the corrupted_strings list for every name, which was quadratic in the
# number of files; the outcome is the same because files with equal names
# always share the same prefix.
image_names = [name if ok else None for name, ok in zip(image_names, is_complete)]

# Create a DataFrame with 'Path' and 'Filename' columns
df = pd.DataFrame({'Path': all_pngs, 'Filename': image_names})

# Display the length of the DataFrame before and after dropping rows with None values
print("Length before dropping rows with None values:", len(df))
df = df.dropna(axis=0, how='any').reset_index(drop=True)
print("Length after dropping rows with None values:", len(df))

# Persist the cleaned index, then show it as the cell output
df.to_csv('dataset.csv', index=False)
df

100%|████████████████████████████████████████████████████████████████████████| 390521/390521 [00:58<00:00, 6668.67it/s]


Length before dropping rows with None values: 390521
Length after dropping rows with None values: 385152


Unnamed: 0,Path,Filename
0,D:\CV_Project\datasets\20230912\Part1\0_0_GT_p...,0_0_GT_pose_0_thermal.png
1,D:\CV_Project\datasets\20230912\Part1\0_0_pose...,0_0_pose_0_thermal.png
2,D:\CV_Project\datasets\20230912\Part1\0_0_pose...,0_0_pose_1_thermal.png
3,D:\CV_Project\datasets\20230912\Part1\0_0_pose...,0_0_pose_2_thermal.png
4,D:\CV_Project\datasets\20230912\Part1\0_0_pose...,0_0_pose_3_thermal.png
...,...,...
385147,D:\CV_Project\datasets\20231027\Part2\2_11000_...,2_11000_pose_6_thermal.png
385148,D:\CV_Project\datasets\20231027\Part2\2_11000_...,2_11000_pose_7_thermal.png
385149,D:\CV_Project\datasets\20231027\Part2\2_11000_...,2_11000_pose_8_thermal.png
385150,D:\CV_Project\datasets\20231027\Part2\2_11000_...,2_11000_pose_9_thermal.png


In [4]:
# All image paths retained in the cleaned DataFrame
file_paths = df['Path'].tolist()

# Keep only the paths whose file name (not the directory part) contains "GT"
label_paths = list(
    filter(lambda candidate: "GT" in os.path.basename(candidate), file_paths)
)

# Number of selected paths, shown as the cell output
len(label_paths)

32096

In [6]:
# Spot-check a single entry of label_paths by position.
# NOTE(review): 482 looks like an arbitrary sample index -- confirm it has no special meaning.
label_paths[482]

'D:\\CV_Project\\datasets\\20230912\\Part1\\0_485_GT_pose_0_thermal.png'