In [1]:
import pandas as pd
import os
import shutil

In [2]:
def find_jpg_files(root_folder, extension):
    """Index files with a given extension under each subfolder of ``root_folder``.

    Every immediate subdirectory of ``root_folder`` is walked recursively and
    one line per subdirectory is printed with the shape of its matches.
    Files that sit directly in ``root_folder`` (outside any subdirectory) are
    not included.

    Args:
        root_folder: Directory whose immediate subdirectories are scanned.
        extension: Suffix to match, e.g. ``'.jpg'``, or a tuple of suffixes.
            Matching is case-insensitive.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``full_path`` (with
        backslashes replaced by forward slashes) and ``foldername`` (the
        directory containing the file, as yielded by ``os.walk``). When
        nothing matches, the frame is empty but still carries these columns.
    """
    columns = ['filename', 'full_path', 'foldername']

    # Filenames are lower-cased before comparison, so the suffix must be too;
    # otherwise an argument such as '.JPG' could never match anything.
    if isinstance(extension, str):
        suffixes = (extension.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in extension)

    # Collect every match in one list and build the frame once, instead of
    # concatenating a new frame per subfolder.
    records = []

    for foldername in os.listdir(root_folder):
        folder_path = os.path.join(root_folder, foldername)

        # Only subdirectories are scanned; loose files in the root are skipped.
        if not os.path.isdir(folder_path):
            continue

        already_collected = len(records)
        for dirpath, _, filenames in os.walk(folder_path):
            for filename in filenames:
                if filename.lower().endswith(suffixes):
                    records.append({
                        'filename': filename,
                        'full_path': os.path.join(dirpath, filename),
                        'foldername': dirpath,
                    })

        # Same "(rows, columns)" text the per-folder DataFrame shape produced.
        match_count = len(records) - already_collected
        print(f'{foldername}: {(match_count, len(columns))}')

    # Passing the column list keeps the schema even when nothing was found,
    # so the column access below cannot raise KeyError.
    df = pd.DataFrame(records, columns=columns)
    df['full_path'] = df['full_path'].str.replace('\\', '/', regex=False)
    return df

In [3]:
def data_to_csv(df, root_folder, output_dir='./data/test', batch_size=200):
    """Write a file listing to numbered CSV batches.

    Rows are sorted case-insensitively by ``foldername`` and then
    ``filename``, split into consecutive chunks of ``batch_size`` rows, and
    each chunk is saved as ``<name>_<n>.csv`` inside ``output_dir``, where
    ``<name>`` is the last path component of ``root_folder`` and ``<n>``
    starts at 1. One ``path:row_count`` line is printed per file written.

    Args:
        df: DataFrame with at least ``foldername`` and ``filename`` columns.
        root_folder: Path whose final component names the output files.
        output_dir: Directory receiving the CSV files; created if missing.
        batch_size: Maximum number of rows per CSV file.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # Normalise separators and drop trailing slashes first, so that
    # 'a/b/' and 'a\\b' both resolve to 'b' rather than an empty name.
    folder = os.path.basename(root_folder.replace('\\', '/').rstrip('/'))

    # Make sure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    # Nothing to write; also avoids a KeyError when the frame has no columns.
    if df.empty:
        return

    # Sort by folder, then by file name, ignoring case.
    ordered = df.sort_values(
        ['foldername', 'filename'], key=lambda column: column.str.lower()
    ).reset_index(drop=True)

    batch_starts = range(0, len(ordered), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch_df = ordered.iloc[start:start + batch_size]

        # File name pattern: folder_1.csv, folder_2.csv, ...
        output_file = os.path.join(output_dir, f'{folder}_{batch_number}.csv')
        batch_df.to_csv(output_file, index=False)
        print(f'{output_file}:{batch_df.shape[0]}')

In [4]:
# Index the .jpg files under each subfolder of the 1130-1 activity folder.
root_folder = './images/check/photo_activity_1130-1'
df = find_jpg_files(root_folder, '.jpg')
# Total (rows, columns) across all subfolders.
print(f'---df shape：{df.shape}')
# Export the sorted listing as CSV batches named after the folder.
data_to_csv(df, root_folder)

其他: (1, 3)
LingOrm: (5, 3)
雜誌: (13, 3)
活動: (225, 3)
拍攝: (1, 3)
ormstagram: (4, 3)
---df shape：(249, 3)
./data/test/photo_activity_1130-1_1.csv:200
./data/test/photo_activity_1130-1_2.csv:49


In [5]:
# Same indexing and export steps for the 1201-1 activity folder.
# Note: this rebinds the notebook-level `root_folder` and `df`.
root_folder = './images/check/photo_activity_1201-1'
df = find_jpg_files(root_folder, '.jpg')
# Total (rows, columns) across all subfolders.
print(f'---df shape：{df.shape}')
# Export the sorted listing as CSV batches named after the folder.
data_to_csv(df, root_folder)

其他: (0, 3)
LingOrm: (2, 3)
雜誌: (23, 3)
活動: (76, 3)
拍攝: (1, 3)
ormstagram: (2, 3)
桌布: (1, 3)
---df shape：(105, 3)
./data/test/photo_activity_1201-1_1.csv:105
