In [2]:
import time
import re
from google.cloud import storage
from collections import deque
# Google Cloud Storage bucket details.
# SRC_FOLDER and DEST_FOLDER are object-name prefixes inside BUCKET_NAME; the
# trailing "/" is part of each prefix.
# NOTE(review): "partittions" is spelled this way in the object names printed
# by the copy run, so do not "fix" the spelling here without renaming the
# objects in the bucket as well.
BUCKET_NAME = "tpch-source"
SRC_FOLDER = "orders-partittions/"
DEST_FOLDER = "orders-partittions2/"

# Initialize the GCS client; the listing and copy helpers in this notebook use it.
client = storage.Client()



In [4]:


def extract_numeric_values(path):
    """
    Build the sort key for a partitioned GCS object path such as
    'orders_partitioned_csv/year=2023/month=1/day=2/part-00000.csv'.

    The year=, month=, day= and part- components are each located with a
    regex search and converted to int; a component that is absent counts as 0.

    Returns:
        The tuple (year, month, day, part, path). Keys therefore compare by
        year first, then month, day and part number, with the full path as
        the final tie-breaker.
    """
    # One pattern per key component, in the order they appear in the key.
    patterns = (r'year=(\d+)', r'month=(\d+)', r'day=(\d+)', r'part-(\d+)')

    numbers = []
    for pattern in patterns:
        found = re.search(pattern, path)
        numbers.append(int(found.group(1)) if found else 0)

    return (*numbers, path)

def list_files_sorted(prefix):
    """
    Return the CSV object names under `prefix` in BUCKET_NAME as a deque,
    ordered by the key from extract_numeric_values
    (year, month, day, part number, then path).

    The bucket is listed exactly once. Only names ending in ".csv" are kept.
    The deque lets the caller consume names from the front in order.
    """
    csv_names = [
        entry.name
        for entry in client.bucket(BUCKET_NAME).list_blobs(prefix=prefix)
        if entry.name.endswith(".csv")
    ]

    # Order by the numeric partition values rather than by raw string.
    csv_names.sort(key=extract_numeric_values)

    return deque(csv_names)

def copy_file(src_blob_name, dest_blob_name):
    """
    Copy one object to a new name within the BUCKET_NAME bucket.

    Args:
        src_blob_name: Name of the existing object to copy.
        dest_blob_name: Name to give the copy in the same bucket.

    Prints a "Copied <src> -> <dest>" line once copy_blob has returned.
    """
    bucket = client.bucket(BUCKET_NAME)
    # Source and destination bucket are the same object; only the name changes.
    bucket.copy_blob(bucket.blob(src_blob_name), bucket, dest_blob_name)
    print(f"Copied {src_blob_name} -> {dest_blob_name}")

def batch_copy(interval_seconds=5):
    """
    Copy every CSV under SRC_FOLDER to the same relative path under
    DEST_FOLDER, one file at a time, in sorted order.

    The source listing is fetched and sorted once up front; files added to
    the source folder after that point are not picked up by this run.

    Args:
        interval_seconds: Pause between consecutive copies, in seconds.
            The default of 5 is the delay this function has always used;
            pass 300 for a 5-minute cadence. No pause follows the last file.
    """
    files_queue = list_files_sorted(SRC_FOLDER)  # Fetch & sort files ONCE

    while files_queue:
        file_to_copy = files_queue.popleft()  # Take first file from the queue
        print(file_to_copy)

        # Swap only the leading folder prefix. Every listed name starts with
        # SRC_FOLDER, so limiting the replacement to the first occurrence
        # leaves the rest of the path intact even if the folder string
        # happens to recur deeper in the object name.
        dest_file_name = file_to_copy.replace(SRC_FOLDER, DEST_FOLDER, 1)

        copy_file(file_to_copy, dest_file_name)

        if not files_queue:
            # Nothing left: exit now rather than sleeping after the last copy.
            print("All files copied. Exiting...")
            break

        # Throttle: wait before copying the next file.
        time.sleep(interval_seconds)


In [5]:

# Start the copy run; the call returns only after every queued file has been copied.
batch_copy()


orders-partittions/year=1992/month=1/day=1/part-00000-34ba453c-b34c-4d0b-9ee4-ea56f71e2a10.c000.csv
Copied orders-partittions/year=1992/month=1/day=1/part-00000-34ba453c-b34c-4d0b-9ee4-ea56f71e2a10.c000.csv -> orders-partittions2/year=1992/month=1/day=1/part-00000-34ba453c-b34c-4d0b-9ee4-ea56f71e2a10.c000.csv
orders-partittions/year=1992/month=1/day=2/part-00000-34ba453c-b34c-4d0b-9ee4-ea56f71e2a10.c000.csv
Copied orders-partittions/year=1992/month=1/day=2/part-00000-34ba453c-b34c-4d0b-9ee4-ea56f71e2a10.c000.csv -> orders-partittions2/year=1992/month=1/day=2/part-00000-34ba453c-b34c-4d0b-9ee4-ea56f71e2a10.c000.csv
orders-partittions/year=1992/month=1/day=3/part-00000-34ba453c-b34c-4d0b-9ee4-ea56f71e2a10.c000.csv
Copied orders-partittions/year=1992/month=1/day=3/part-00000-34ba453c-b34c-4d0b-9ee4-ea56f71e2a10.c000.csv -> orders-partittions2/year=1992/month=1/day=3/part-00000-34ba453c-b34c-4d0b-9ee4-ea56f71e2a10.c000.csv
orders-partittions/year=1992/month=1/day=4/part-00000-34ba453c-b34c