In [1]:
import os
import requests
import time
from tqdm.notebook import tqdm
import uuid

def process_files_in_batches(root_dir, batch_size=6, delay_seconds=8,
                             url='http://127.0.0.1:4000/mdp/ai-safe/anonymize-file',
                             seed='xyz', timeout=60):
    """POST every file found in train/dev folders under ``root_dir`` to ``url``.

    The directory tree is walked once and every file whose directory path
    contains the substring ``'train'`` or ``'dev'`` is collected. The files are
    then uploaded one at a time as multipart form data, in groups of
    ``batch_size``, sleeping ``delay_seconds`` between groups (but not after
    the last one). A failed upload is reported on stdout and does not stop
    the run.

    Args:
        root_dir: Directory to walk recursively.
        batch_size: Number of files sent before pausing. Must be >= 1.
        delay_seconds: Pause between consecutive batches, in seconds.
        url: Endpoint that receives each file.
        seed: Value sent in the ``seed`` form field of every request.
        timeout: Passed to ``requests.post`` so an unresponsive server cannot
            block the run indefinitely. ``None`` disables the timeout.

    Returns:
        None. Progress, per-file errors and a failure count are printed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    filepaths = []
    for subdir, _dirs, filenames in os.walk(root_dir):
        # NOTE(review): this is a substring test on the full directory path, so
        # a root_dir whose own path contains 'train' or 'dev' selects every
        # file beneath it - confirm that this is intended.
        if 'train' in subdir or 'dev' in subdir:
            filepaths.extend(os.path.join(subdir, name) for name in filenames)

    if not filepaths:
        print(f"No files found in {root_dir}")
        return

    # normpath drops a trailing separator, which would make basename() return ''.
    label = os.path.basename(os.path.normpath(root_dir))
    failed = []

    for start in tqdm(range(0, len(filepaths), batch_size), desc=f"Processing {label} files"):
        for file_path in filepaths[start:start + batch_size]:
            print(f"Sending file: {file_path}")
            try:
                # A fresh request id is generated for every upload.
                data = {'requestId': str(uuid.uuid4()), 'seed': seed}
                with open(file_path, 'rb') as f:
                    response = requests.post(url, files={'file': f}, data=data, timeout=timeout)
                # Turn HTTP 4xx/5xx replies into exceptions so they are reported below
                # instead of being silently treated as success.
                response.raise_for_status()
            except Exception as e:
                # Best effort: record the failure and continue with the next file.
                failed.append(file_path)
                print(f"Error processing {file_path}: {e}")

        # Pause between batches only; there is nothing to wait for after the last one.
        if start + batch_size < len(filepaths):
            time.sleep(delay_seconds)

    if failed:
        print(f"{len(failed)} of {len(filepaths)} files failed.")
    print(f"All {label} files processed.")

In [2]:
# Upload the files under the docs output directory (train/dev folders only)
# to the anonymization endpoint.

docs_dir = '/home/stark007/MayaDataPrivacy/TESTAPI/DatasetTransformation/output/docs'
process_files_in_batches(root_dir=docs_dir)

Processing docs files:   0%|          | 0/1069 [00:00<?, ?it/s]

Sending file: /home/stark007/MayaDataPrivacy/TESTAPI/DatasetTransformation/output/docs/hipe2020/fr/train/hipe2020_fr_train_chunk323.docx
Sending file: /home/stark007/MayaDataPrivacy/TESTAPI/DatasetTransformation/output/docs/hipe2020/fr/train/hipe2020_fr_train_chunk461.docx
Sending file: /home/stark007/MayaDataPrivacy/TESTAPI/DatasetTransformation/output/docs/hipe2020/fr/train/hipe2020_fr_train_chunk140.docx
Sending file: /home/stark007/MayaDataPrivacy/TESTAPI/DatasetTransformation/output/docs/hipe2020/fr/train/hipe2020_fr_train_chunk124.docx
Sending file: /home/stark007/MayaDataPrivacy/TESTAPI/DatasetTransformation/output/docs/hipe2020/fr/train/hipe2020_fr_train_chunk438.docx
Sending file: /home/stark007/MayaDataPrivacy/TESTAPI/DatasetTransformation/output/docs/hipe2020/fr/train/hipe2020_fr_train_chunk283.docx
Sending file: /home/stark007/MayaDataPrivacy/TESTAPI/DatasetTransformation/output/docs/hipe2020/fr/train/hipe2020_fr_train_chunk562.docx
Sending file: /home/stark007/MayaDataPriv

In [None]:
# Upload the files under the srt output directory (train/dev folders only)
# to the anonymization endpoint.

srt_dir = '/home/stark007/MayaDataPrivacy/TESTAPI/DatasetTransformation/output/srt'
process_files_in_batches(root_dir=srt_dir)

In [None]:
# Upload the files under the log output directory (train/dev folders only)
# to the anonymization endpoint.

log_dir = '/home/stark007/MayaDataPrivacy/TESTAPI/DatasetTransformation/output/log'
process_files_in_batches(root_dir=log_dir)