In [11]:
import os
import re
from pathlib import Path
from google.cloud import storage
import json
from google.oauth2 import service_account

# Initialize GCS client.
# The service-account key is read as a JSON string from the environment and
# parsed into a dict before being handed to the credentials factory.
gcp_credentials_info = os.getenv("GCP_SERVICE_ACCOUNT_CREDENTIALS")
if not gcp_credentials_info:
    # Fail early with an actionable message: json.loads(None) would otherwise
    # raise an opaque TypeError that does not mention the missing variable.
    raise RuntimeError(
        "Environment variable GCP_SERVICE_ACCOUNT_CREDENTIALS is not set or is "
        "empty; it must contain the service-account key as a JSON string."
    )
gcp_credentials_info = json.loads(gcp_credentials_info)
gcp_service_account_credentials = service_account.Credentials.from_service_account_info(gcp_credentials_info)
storage_client = storage.Client(credentials=gcp_service_account_credentials)
bucket = storage_client.bucket("loomy-public-documents")

# Base paths
# Local root that is walked for .txt files, and the folder prefix used for the
# corresponding object names in the bucket.
local_base_path = Path(r"c:\Users\leoac\Work\Companies\Loomy (personal)\loomy\kb\Leggi\parlamento")
gcs_base_folder = "parlamento"

# Real Italian verb forms (eleggere, rieleggere, rileggere) that legitimately
# end in "legge"; splitting them would corrupt valid text ("elegge" -> "e legge").
_WORDS_ENDING_IN_LEGGE = frozenset({"elegge", "rielegge", "rilegge"})

# A run of word characters glued in front of "legge"/"decreto", where the
# keyword ends the word run (so a following hyphen, space or punctuation is fine).
_GLUED_LEGGE_RE = re.compile(r'(\w+)(legge)\b', re.IGNORECASE)
_GLUED_DECRETO_RE = re.compile(r'(\w+)(decreto)\b', re.IGNORECASE)


def _split_glued_legge(match):
    """Return the match with a space before 'legge', unless it is a real verb form."""
    word = match.group(0)
    if word.lower() in _WORDS_ENDING_IN_LEGGE:
        return word
    return f"{match.group(1)} {match.group(2)}"


def clean_text(text):
    """
    Re-insert the missing space in words that were glued to 'legge' or 'decreto'.

    Matching is case-insensitive. A standalone 'legge' or 'decreto' (including
    the 'legge' in 'decreto-legge') is left alone because the pattern requires
    at least one word character in front of the keyword. Known verb forms that
    genuinely end in 'legge' (elegge, rielegge, rilegge) are preserved.

    Examples:
    - dellalegge -> della legge
    - lalegge -> la legge
    - deldecreto -> del decreto
    - aldecreto -> al decreto
    - ildecreto -> il decreto
    - deldecreto-legge -> del decreto-legge
    - elegge -> elegge (unchanged)

    Args:
        text: The text to clean.

    Returns:
        The text with a space inserted before each glued 'legge' / 'decreto'.
    """
    # 'legge' pass first, then 'decreto': two separate passes so that a word
    # containing both keywords can be split at both places.
    text = _GLUED_LEGGE_RE.sub(_split_glued_legge, text)
    text = _GLUED_DECRETO_RE.sub(r'\1 \2', text)

    return text

# Counters for tracking progress
uploaded_count = 0
error_count = 0

# Walk through all subdirectories in parlamento; every .txt file is read,
# cleaned with clean_text and uploaded to the bucket under the same relative path.
for root, dirs, files in os.walk(local_base_path):
    for filename in files:
        # Only .txt files are processed; everything else is skipped
        if not filename.endswith('.txt'):
            continue

        local_file_path = Path(root) / filename

        # Object name = base folder + path relative to the local root.
        # as_posix() yields forward slashes so Windows separators never end up
        # in the GCS object name.
        relative_path = local_file_path.relative_to(local_base_path)
        gcs_path = f"{gcs_base_folder}/{relative_path.as_posix()}"

        try:
            # Read the file
            content = local_file_path.read_text(encoding='utf-8')

            # Clean the content
            cleaned_content = clean_text(content)

            # Upload to GCS. A str payload is encoded as UTF-8 by the client,
            # so the charset is declared explicitly; without it, consumers of
            # the object may guess a different encoding and garble accented
            # characters.
            blob = bucket.blob(gcs_path)
            blob.upload_from_string(
                cleaned_content,
                content_type='text/plain; charset=utf-8',
            )
        except Exception as e:
            # Best-effort batch: report the failing file and keep going
            print(f"Error processing {local_file_path}: {e}")
            error_count += 1
        else:
            uploaded_count += 1

            # Progress report every 10 successful uploads
            if uploaded_count % 10 == 0:
                print(f"Uploaded {uploaded_count} files...")

print("\n✓ Upload complete!")
print(f"  Successfully uploaded: {uploaded_count} files")
print(f"  Errors: {error_count} files")

Uploaded 10 files...
Uploaded 20 files...
Uploaded 20 files...
Uploaded 30 files...
Uploaded 30 files...
Uploaded 40 files...
Uploaded 40 files...
Uploaded 50 files...
Uploaded 50 files...
Uploaded 60 files...
Uploaded 60 files...
Uploaded 70 files...
Uploaded 70 files...
Uploaded 80 files...
Uploaded 80 files...
Uploaded 90 files...
Uploaded 90 files...
Uploaded 100 files...
Uploaded 100 files...
Uploaded 110 files...
Uploaded 110 files...
Uploaded 120 files...
Uploaded 120 files...
Uploaded 130 files...
Uploaded 130 files...
Uploaded 140 files...
Uploaded 140 files...
Uploaded 150 files...
Uploaded 150 files...
Uploaded 160 files...
Uploaded 160 files...
Uploaded 170 files...
Uploaded 170 files...
Uploaded 180 files...
Uploaded 180 files...
Uploaded 190 files...
Uploaded 190 files...
Uploaded 200 files...
Uploaded 200 files...
Uploaded 210 files...
Uploaded 210 files...
Uploaded 220 files...
Uploaded 220 files...
Uploaded 230 files...
Uploaded 230 files...
Uploaded 240 files...
Uploa