In [1]:
import os
import pandas as pd

import shutil
from colorama import Fore, Style


In [2]:
# Folder that receives the renamed copies; it is emptied before each run and zipped at the end.
samples_destination_folder = 'data/samples'
# CSV column that holds the captcha image file name.
captcha_name = 'Captcha_Name'
# CSV column that holds the label (answer) for that captcha.
captcha_answer = 'Captcha_Answer'

In [3]:
# Start every run from an empty samples folder so that stale copies from a
# previous run never end up in the final archive.
os.makedirs(samples_destination_folder, exist_ok=True)
for entry_name in os.listdir(samples_destination_folder):
    entry_path = os.path.join(samples_destination_folder, entry_name)
    # os.remove() raises on directories (e.g. .ipynb_checkpoints), so real
    # directories are removed recursively; files and symlinks are unlinked.
    if os.path.isdir(entry_path) and not os.path.islink(entry_path):
        shutil.rmtree(entry_path)
    else:
        os.remove(entry_path)

In [4]:
def create_file_names_with_labels(csv_file_with_labels, folder_path, expected_label_length=5):
    """Copy labeled captcha images into the samples folder as ``<name>_<label>.png``.

    The CSV is expected to contain the columns named by the module-level
    ``captcha_name`` (image file name) and ``captcha_answer`` (label).
    Rows without an answer are ignored. Every ``.png`` file in ``folder_path``
    whose name does not already contain an underscore is looked up in the CSV;
    when a valid label is found the file is copied (not moved) into
    ``samples_destination_folder`` with the normalized label appended to its
    base name. A summary of skipped files is printed at the end.

    Args:
        csv_file_with_labels: Path of the CSV file holding the labels.
        folder_path: Folder containing the original ``.png`` captcha images.
        expected_label_length: Number of characters a label must have after
            surrounding whitespace is removed. Defaults to 5.

    Returns:
        None.

    Raises:
        KeyError: If the CSV lacks the ``captcha_name`` or ``captcha_answer``
            column.
    """
    # Read both columns as text: with inferred dtypes an all-digit answer
    # becomes a number, which loses leading zeros and breaks len()/strip().
    labels_df = pd.read_csv(
        csv_file_with_labels,
        dtype={captcha_name: str, captcha_answer: str},
    )

    # Keep only the rows that actually carry an answer.
    labels_df = labels_df[labels_df[captcha_answer].notnull()]

    # Build the name -> label lookup once instead of scanning the frame for
    # every file. On duplicate file names the first row wins.
    labels_by_file = (
        labels_df.drop_duplicates(subset=captcha_name, keep='first')
        .set_index(captcha_name)[captcha_answer]
        .to_dict()
    )

    file_names = [name for name in os.listdir(folder_path) if name.endswith('.png')]

    label_not_found_count = 0
    label_with_error_count = 0
    for file_name in file_names:
        # An underscore marks a file that already carries a label suffix.
        if '_' in file_name:
            continue

        raw_label = labels_by_file.get(file_name)
        if raw_label is None:
            label_not_found_count += 1
            continue

        # Strip BEFORE validating the length, so stray spaces typed into the
        # sheet neither reject a good label nor let a short one slip through.
        label = raw_label.strip()
        if len(label) != expected_label_length:
            label_with_error_count += 1
            print(Fore.RED + 'Incorrect label:', raw_label, 'File:', file_name, Style.RESET_ALL)
            continue
        label = label.lower()

        # Append the label to the base name; splitext touches only the real
        # extension, unlike str.replace which rewrites every '.png' occurrence.
        base_name, extension = os.path.splitext(file_name)
        new_file_name = base_name + '_' + label + extension

        # Copy the file to the samples folder under its new name.
        shutil.copy(
            os.path.join(folder_path, file_name),
            os.path.join(samples_destination_folder, new_file_name),
        )

    # Colored summary of everything that was skipped.
    print(Fore.RED + 'Total Labels with errors:', label_with_error_count, Style.RESET_ALL)
    print(Fore.YELLOW + 'Total Labels not found:', label_not_found_count, Style.RESET_ALL)
    


In [5]:
# (image folder, labels CSV) pairs for every labeled batch.
# The first collection has batches starting at 1000..6000, the second
# ("_2" folders / "-2" sheets) has batches starting at 1000..3000.
folder_and_csv_labels = [
    (
        'rsce_sample_captcha_batches' + name_suffix + '/_start_' + str(start),
        'google-sheets' + sheet_suffix + '/_start_' + str(start)
        + '/file_names' + name_suffix + '_start_' + str(start) + '_labels.csv',
    )
    for name_suffix, sheet_suffix, starts in (
        ('', '', range(1000, 7000, 1000)),
        ('_2', '-2', range(1000, 4000, 1000)),
    )
    for start in starts
]

# Label each batch in turn, announcing which folder/CSV pair is being processed.
for folder_path, csv_file_with_labels in folder_and_csv_labels:
    print(Fore.BLUE + 'Folder:', folder_path, 'CSV:', csv_file_with_labels, Style.RESET_ALL)
    create_file_names_with_labels(csv_file_with_labels, folder_path)

[34mFolder: rsce_sample_captcha_batches/_start_1000 CSV: google-sheets/_start_1000/file_names_start_1000_labels.csv [0m
[31mTotal Labels with errors: 0 [0m
[33mTotal Labels not found: 0 [0m
[34mFolder: rsce_sample_captcha_batches/_start_2000 CSV: google-sheets/_start_2000/file_names_start_2000_labels.csv [0m
[31mTotal Labels with errors: 0 [0m
[33mTotal Labels not found: 0 [0m
[34mFolder: rsce_sample_captcha_batches/_start_3000 CSV: google-sheets/_start_3000/file_names_start_3000_labels.csv [0m
[31mTotal Labels with errors: 0 [0m
[33mTotal Labels not found: 0 [0m
[34mFolder: rsce_sample_captcha_batches/_start_4000 CSV: google-sheets/_start_4000/file_names_start_4000_labels.csv [0m
[31mTotal Labels with errors: 0 [0m
[33mTotal Labels not found: 0 [0m
[34mFolder: rsce_sample_captcha_batches/_start_5000 CSV: google-sheets/_start_5000/file_names_start_5000_labels.csv [0m
[31mTotal Labels with errors: 0 [0m
[33mTotal Labels not found: 0 [0m
[34mFolder: rsce_sa

In [6]:
# Pack the samples folder into data-samples.zip in the current working directory.
zip_file_name = 'data-samples'
zip_file_path = zip_file_name + '.zip'

# Remove any archive left over from a previous run before writing the new one.
if os.path.exists(zip_file_path):
    os.remove(zip_file_path)

# Last expression of the cell: displays the path of the archive that was written.
shutil.make_archive(zip_file_name, 'zip', samples_destination_folder)

'd:\\Sample_Captcha_Data\\data-samples.zip'