## Step 1: Install Required Libraries

In [1]:
%pip install google-cloud-storage

Collecting google-cloud-storage
  Downloading google_cloud_storage-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting google-api-core<3.0.0,>=2.27.0 (from google-cloud-storage)
  Downloading google_api_core-2.28.1-py3-none-any.whl.metadata (3.3 kB)
Collecting google-cloud-core<3.0.0,>=2.4.2 (from google-cloud-storage)
  Downloading google_cloud_core-2.5.0-py3-none-any.whl.metadata (3.1 kB)
Collecting google-resumable-media<3.0.0,>=2.7.2 (from google-cloud-storage)
  Downloading google_resumable_media-2.8.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-crc32c<2.0.0,>=1.1.3 (from google-cloud-storage)
  Downloading google_crc32c-1.7.1-cp310-cp310-win_amd64.whl.metadata (2.4 kB)
Collecting googleapis-common-protos<2.0.0,>=1.56.2 (from google-api-core<3.0.0,>=2.27.0->google-cloud-storage)
  Downloading googleapis_common_protos-1.72.0-py3-none-any.whl.metadata (9.4 kB)
Collecting proto-plus<2.0.0,>=1.22.3 (from google-api-core<3.0.0,>=2.27.0->google-cloud-storage)
  Downloading proto

## Step 2: Configure GCS Settings

In [2]:
import os
from pathlib import Path

# GCS Configuration
PROJECT_ID = "kyc-aml-automation"  # Replace with your GCP project ID
BUCKET_NAME = "kyc-aml-dataset"  # Single bucket for all KYC/AML datasets
SERVICE_ACCOUNT_KEY = "C:\\Users\\Lenovo\\.ssh\\gcp\\service_account.json"  # Replace with path to your key file

# Local dataset path
DATASET_DIR = "dataset"
DATASET_ZIP = "kyc_aml_dataset_identity.zip"

# GCS paths - organized by dataset type
GCS_DATASET_PATH = "identity/v1/kyc_aml_dataset_identity.zip"  # identity/version/filename structure

print(f"Project ID: {PROJECT_ID}")
print(f"Bucket: {BUCKET_NAME}")
print(f"Local dataset: {DATASET_DIR}")
print(f"Will upload to: gs://{BUCKET_NAME}/{GCS_DATASET_PATH}")

Project ID: kyc-aml-automation
Bucket: kyc-aml-dataset
Local dataset: dataset
Will upload to: gs://kyc-aml-dataset/identity/v1/kyc_aml_dataset_identity.zip


## Step 3: Verify Dataset Structure

In [3]:
def verify_dataset(base_dir):
    """Verify dataset structure and count images"""
    if not os.path.exists(base_dir):
        print(f"‚ùå Dataset directory not found: {base_dir}")
        return False
    
    print("\n" + "="*60)
    print("üìä DATASET VERIFICATION")
    print("="*60)
    
    splits = ['train', 'valid']
    expected_classes = ['aadhar', 'driving', 'pan', 'passport', 'voter']
    
    total_images = 0
    for split in splits:
        split_dir = os.path.join(base_dir, split)
        if not os.path.exists(split_dir):
            print(f"‚ùå Missing split: {split}")
            return False
        
        print(f"\n{split.upper()} SET:")
        split_total = 0
        
        for class_name in expected_classes:
            class_dir = os.path.join(split_dir, class_name)
            if not os.path.exists(class_dir):
                print(f"  ‚ùå Missing class: {class_name}")
                continue
            
            images = [f for f in os.listdir(class_dir) 
                     if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            count = len(images)
            split_total += count
            print(f"  ‚úì {class_name}: {count} images")
        
        print(f"  TOTAL: {split_total} images")
        total_images += split_total
    
    print(f"\nüéØ GRAND TOTAL: {total_images} images")
    print("="*60)
    
    return total_images > 0

# Verify dataset
if verify_dataset(DATASET_DIR):
    print("\n‚úÖ Dataset verification passed!")
else:
    print("\n‚ùå Dataset verification failed!")


üìä DATASET VERIFICATION

TRAIN SET:
  ‚úì aadhar: 1324 images
  ‚úì driving: 1410 images
  ‚úì pan: 1194 images
  ‚úì passport: 540 images
  ‚úì voter: 1458 images
  TOTAL: 5926 images

VALID SET:
  ‚úì aadhar: 35 images
  ‚úì driving: 37 images
  ‚úì pan: 90 images
  ‚úì passport: 60 images
  ‚úì voter: 38 images
  TOTAL: 260 images

üéØ GRAND TOTAL: 6186 images

‚úÖ Dataset verification passed!


## Step 4: Create ZIP Archive

In [4]:
import zipfile
from datetime import datetime

def create_zip_archive(source_dir, output_zip):
    """Create a ZIP archive of the dataset"""
    print(f"\nüì¶ Creating ZIP archive: {output_zip}")
    
    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(source_dir):
            for file in files:
                if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, os.path.dirname(source_dir))
                    zipf.write(file_path, arcname)
                    
    # Get file size
    size_mb = os.path.getsize(output_zip) / (1024 * 1024)
    print(f"‚úÖ ZIP created: {output_zip} ({size_mb:.2f} MB)")
    return output_zip

# Create ZIP archive
zip_path = create_zip_archive(DATASET_DIR, DATASET_ZIP)


üì¶ Creating ZIP archive: kyc_aml_dataset_identity.zip
‚úÖ ZIP created: kyc_aml_dataset_identity.zip (399.68 MB)
‚úÖ ZIP created: kyc_aml_dataset_identity.zip (399.68 MB)


## Step 5: Authenticate with Google Cloud

In [6]:
from google.cloud import storage

# Set credentials
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SERVICE_ACCOUNT_KEY

# Initialize GCS client
try:
    client = storage.Client(project=PROJECT_ID)
    print(f"‚úÖ Authenticated with GCP project: {PROJECT_ID}")
    
    # List buckets to verify access
    buckets = list(client.list_buckets())
    print(f"‚úÖ Found {len(buckets)} bucket(s)")
    
except Exception as e:
    print(f"‚ùå Authentication failed: {e}")
    print("\nTroubleshooting:")
    print("1. Verify SERVICE_ACCOUNT_KEY path is correct")
    print("2. Ensure service account has 'Storage Admin' role")
    print("3. Check PROJECT_ID is correct")

‚úÖ Authenticated with GCP project: kyc-aml-automation
‚úÖ Found 0 bucket(s)
‚úÖ Found 0 bucket(s)


## Step 6: Create or Verify Bucket

In [7]:
def get_or_create_bucket(client, bucket_name, location='us-central1'):
    """Get existing bucket or create new one"""
    try:
        bucket = client.get_bucket(bucket_name)
        print(f"‚úÖ Found existing bucket: {bucket_name}")
        return bucket
    except Exception:
        print(f"üì¶ Creating new bucket: {bucket_name}")
        bucket = client.create_bucket(bucket_name, location=location)
        print(f"‚úÖ Bucket created: {bucket_name}")
        return bucket

# Get or create bucket
bucket = get_or_create_bucket(client, BUCKET_NAME)
print(f"\nBucket info:")
print(f"  Name: {bucket.name}")
print(f"  Location: {bucket.location}")
print(f"  Storage class: {bucket.storage_class}")

üì¶ Creating new bucket: kyc-aml-dataset
‚úÖ Bucket created: kyc-aml-dataset

Bucket info:
  Name: kyc-aml-dataset
  Location: US-CENTRAL1
  Storage class: STANDARD
‚úÖ Bucket created: kyc-aml-dataset

Bucket info:
  Name: kyc-aml-dataset
  Location: US-CENTRAL1
  Storage class: STANDARD


## Step 7: Upload Dataset to GCS

In [8]:
def upload_to_gcs(bucket, source_file, destination_blob_name):
    """Upload file to GCS with progress tracking"""
    blob = bucket.blob(destination_blob_name)
    
    print(f"\n‚òÅÔ∏è  Uploading to GCS...")
    print(f"   Source: {source_file}")
    print(f"   Destination: gs://{bucket.name}/{destination_blob_name}")
    
    # Upload file
    blob.upload_from_filename(source_file)
    
    # Get uploaded file info
    blob.reload()
    size_mb = blob.size / (1024 * 1024)
    
    print(f"‚úÖ Upload complete!")
    print(f"   Size: {size_mb:.2f} MB")
    print(f"   MD5: {blob.md5_hash}")
    print(f"   Created: {blob.time_created}")
    
    # Make file publicly accessible (optional)
    # blob.make_public()
    # print(f"   Public URL: {blob.public_url}")
    
    return blob

# Upload dataset
blob = upload_to_gcs(bucket, zip_path, GCS_DATASET_PATH)


‚òÅÔ∏è  Uploading to GCS...
   Source: kyc_aml_dataset_identity.zip
   Destination: gs://kyc-aml-dataset/identity/v1/kyc_aml_dataset_identity.zip
‚úÖ Upload complete!
   Size: 399.68 MB
   MD5: C7SYzAx56tqJCODxpKLb7A==
   Created: 2025-12-09 03:02:39.730000+00:00
‚úÖ Upload complete!
   Size: 399.68 MB
   MD5: C7SYzAx56tqJCODxpKLb7A==
   Created: 2025-12-09 03:02:39.730000+00:00


## Step 8: Generate Download Instructions

In [9]:
def generate_download_code(bucket_name, blob_path):
    """Generate code snippet for downloading in Colab"""
    
    print("\n" + "="*60)
    print("üìã DOWNLOAD INSTRUCTIONS FOR GOOGLE COLAB")
    print("="*60)
    
    code = f'''
# Install gsutil (if not already installed)
!pip install google-cloud-storage

# Method 1: Download using gsutil (recommended for Colab)
!gsutil -m cp gs://{bucket_name}/{blob_path} .

# Method 2: Download using Python API
from google.cloud import storage
import zipfile

# Authenticate (in Colab, use Google Drive or service account)
from google.colab import auth
auth.authenticate_user()

# Download dataset
client = storage.Client(project='{PROJECT_ID}')
bucket = client.bucket('{bucket_name}')
blob = bucket.blob('{blob_path}')
blob.download_to_filename('kyc_aml_dataset_identity.zip')

# Extract dataset
with zipfile.ZipFile('kyc_aml_dataset_identity.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

print("‚úÖ Dataset ready!")
print("Dataset structure:")
!ls -lh dataset/train dataset/valid
'''
    
    print(code)
    print("="*60)
    
    # Save to file
    instructions_file = "colab_download_instructions.txt"
    with open(instructions_file, 'w') as f:
        f.write(code)
    print(f"\n‚úÖ Instructions saved to: {instructions_file}")

generate_download_code(BUCKET_NAME, GCS_DATASET_PATH)


üìã DOWNLOAD INSTRUCTIONS FOR GOOGLE COLAB

# Install gsutil (if not already installed)
!pip install google-cloud-storage

# Method 1: Download using gsutil (recommended for Colab)
!gsutil -m cp gs://kyc-aml-dataset/identity/v1/kyc_aml_dataset_identity.zip .

# Method 2: Download using Python API
from google.cloud import storage
import zipfile

# Authenticate (in Colab, use Google Drive or service account)
from google.colab import auth
auth.authenticate_user()

# Download dataset
client = storage.Client(project='kyc-aml-automation')
bucket = client.bucket('kyc-aml-dataset')
blob = bucket.blob('identity/v1/kyc_aml_dataset_identity.zip')
blob.download_to_filename('kyc_aml_dataset_identity.zip')

# Extract dataset
with zipfile.ZipFile('kyc_aml_dataset_identity.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

print("‚úÖ Dataset ready!")
print("Dataset structure:")
!ls -lh dataset/train dataset/valid


‚úÖ Instructions saved to: colab_download_instructions.txt


## Step 9: Verify Upload

In [10]:
def verify_gcs_upload(bucket, blob_path):
    """Verify the uploaded file exists and is accessible"""
    print("\nüîç Verifying upload...")
    
    try:
        blob = bucket.blob(blob_path)
        blob.reload()
        
        print("‚úÖ File verified on GCS:")
        print(f"   Name: {blob.name}")
        print(f"   Size: {blob.size / (1024*1024):.2f} MB")
        print(f"   Content-Type: {blob.content_type}")
        print(f"   MD5: {blob.md5_hash}")
        print(f"   Created: {blob.time_created}")
        print(f"   Updated: {blob.updated}")
        print(f"\n   GCS URI: gs://{bucket.name}/{blob.name}")
        
        return True
    except Exception as e:
        print(f"‚ùå Verification failed: {e}")
        return False

verify_gcs_upload(bucket, GCS_DATASET_PATH)


üîç Verifying upload...
‚úÖ File verified on GCS:
   Name: identity/v1/kyc_aml_dataset_identity.zip
   Size: 399.68 MB
   Content-Type: application/x-zip-compressed
   MD5: C7SYzAx56tqJCODxpKLb7A==
   Created: 2025-12-09 03:02:39.730000+00:00
   Updated: 2025-12-09 03:02:39.730000+00:00

   GCS URI: gs://kyc-aml-dataset/identity/v1/kyc_aml_dataset_identity.zip
‚úÖ File verified on GCS:
   Name: identity/v1/kyc_aml_dataset_identity.zip
   Size: 399.68 MB
   Content-Type: application/x-zip-compressed
   MD5: C7SYzAx56tqJCODxpKLb7A==
   Created: 2025-12-09 03:02:39.730000+00:00
   Updated: 2025-12-09 03:02:39.730000+00:00

   GCS URI: gs://kyc-aml-dataset/identity/v1/kyc_aml_dataset_identity.zip


True

## Step 10: List All Files in Bucket

In [11]:
def list_bucket_contents(bucket, prefix=None):
    """List all files in the bucket"""
    print("\nüìÅ Bucket contents:")
    print("="*60)
    
    blobs = bucket.list_blobs(prefix=prefix)
    total_size = 0
    count = 0
    
    for blob in blobs:
        size_mb = blob.size / (1024 * 1024)
        total_size += blob.size
        count += 1
        print(f"  {blob.name} ({size_mb:.2f} MB)")
    
    print("="*60)
    print(f"Total: {count} file(s), {total_size / (1024*1024):.2f} MB")

list_bucket_contents(bucket)


üìÅ Bucket contents:
  identity/v1/kyc_aml_dataset_identity.zip (399.68 MB)
Total: 1 file(s), 399.68 MB
  identity/v1/kyc_aml_dataset_identity.zip (399.68 MB)
Total: 1 file(s), 399.68 MB


## Step 11: Clean Up (Optional)

In [12]:
# Optional: Remove local ZIP file after upload
import os

if os.path.exists(DATASET_ZIP):
    response = input(f"Delete local ZIP file ({DATASET_ZIP})? (y/n): ")
    if response.lower() == 'y':
        os.remove(DATASET_ZIP)
        print(f"‚úÖ Deleted: {DATASET_ZIP}")
    else:
        print(f"‚ÑπÔ∏è  Keeping local file: {DATASET_ZIP}")

‚ÑπÔ∏è  Keeping local file: kyc_aml_dataset_identity.zip


## Step 12: Generate Signed URL for Public Access

Generate a time-limited signed URL that allows anyone to download the dataset without authentication.

In [18]:
from datetime import timedelta

def generate_signed_url(bucket, blob_path, expiration_days=30):
    """
    Generate a signed URL that allows public download without authentication.
    The URL expires after the specified number of days.
    """
    blob = bucket.blob(blob_path)
    
    # Generate signed URL (valid for specified days)
    url = blob.generate_signed_url(
        version="v4",
        expiration=timedelta(days=expiration_days),
        method="GET"
    )
    
    return url

# Generate signed URL for the dataset
print("\n" + "="*60)
print("üîó GENERATING SIGNED URL FOR PUBLIC ACCESS")
print("="*60)

try:
    signed_url = generate_signed_url(bucket, GCS_DATASET_PATH, expiration_days=7)
    
    print(f"\n‚úÖ Signed URL generated successfully!")
    print(f"   Valid for: 365 days")
    print(f"   GCS Path: gs://{BUCKET_NAME}/{GCS_DATASET_PATH}")
    print(f"\nüìã Signed URL:")
    print(f"   {signed_url}")
    
    # Save URL to file
    url_file = "dataset_download_url.txt"
    with open(url_file, 'w') as f:
        f.write(f"# KYC/AML Identity Dataset - Download URL\n")
        f.write(f"# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"# Valid for: 365 days\n")
        f.write(f"# GCS Path: gs://{BUCKET_NAME}/{GCS_DATASET_PATH}\n\n")
        f.write(f"# Direct download URL:\n")
        f.write(f"{signed_url}\n\n")
        f.write(f"# Usage in Colab (without authentication):\n")
        f.write(f"!wget -O kyc_aml_dataset_identity.zip '{signed_url}'\n")
        f.write(f"!unzip -q kyc_aml_dataset_identity.zip\n")
    
    print(f"\n‚úÖ URL saved to: {url_file}")
    
    print("\nüìã Usage in Google Colab (no authentication needed):")
    print("```python")
    print(f"# Download dataset using signed URL")
    print(f"!wget -O kyc_aml_dataset_identity.zip '{signed_url}'")
    print(f"")
    print(f"# Extract dataset")
    print(f"!unzip -q kyc_aml_dataset_identity.zip")
    print(f"print('‚úÖ Dataset ready!')")
    print("```")
    
except Exception as e:
    print(f"\n‚ùå Failed to generate signed URL: {e}")
    print("\nNote: Make sure your service account has 'Service Account Token Creator' role")
    print("or use 'iam.serviceAccounts.signBlob' permission.")


üîó GENERATING SIGNED URL FOR PUBLIC ACCESS

‚úÖ Signed URL generated successfully!
   Valid for: 365 days
   GCS Path: gs://kyc-aml-dataset/identity/v1/kyc_aml_dataset_identity.zip

üìã Signed URL:
   https://storage.googleapis.com/kyc-aml-dataset/identity/v1/kyc_aml_dataset_identity.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=github-iac%40kyc-aml-automation.iam.gserviceaccount.com%2F20251209%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20251209T033127Z&X-Goog-Expires=604800&X-Goog-SignedHeaders=host&X-Goog-Signature=3dad62c9de341797e1eaf682b391357e18253c030e9a683ae7512afc9f881fbbf2be97e69e774af60ebc46a150291c21c1945aaad9ca998f96d43d0159f42fbd1665cf99cafca12c6973cb4990df0965611653cad0c291aed6462ee08f89ed8fa924a9ba6c52ebe31f4e7bf854191a06797bca8e21c4ecdb9bbe11e6c91d73b12b1bacbfe76a15cf35f5beb4b7a1484e7795b9fff9ced50463a22ec7d568f4c386fb005d11f6123e34fde84538c7ae014e2e5e77b61c6374acd4348b72d49fc70d84918c7e8dcaf968e5156c5fd5b1012c7f9da5809fab8d61388886873f9a3ef4078d226b5d8

## Summary

‚úÖ **Dataset uploaded to Google Cloud Storage!**

**What we did:**
1. ‚úì Verified local dataset structure
2. ‚úì Created ZIP archive of dataset
3. ‚úì Authenticated with Google Cloud
4. ‚úì Uploaded dataset to GCS bucket
5. ‚úì Generated download instructions for Colab

**Next Steps:**

### For Google Colab Training:

```python
# Add this to the beginning of your training notebook
!gsutil -m cp gs://{BUCKET_NAME}/{GCS_DATASET_PATH} .
!unzip -q kyc_aml_dataset_identity.zip
```

### Alternative: Using Google Colab Authentication

```python
from google.colab import auth
auth.authenticate_user()

from google.cloud import storage
import zipfile

client = storage.Client(project='your-project-id')
bucket = client.bucket('kyc-aml-dataset')
blob = bucket.blob('identity/v1/kyc_aml_dataset_identity.zip')
blob.download_to_filename('kyc_aml_dataset_identity.zip')

with zipfile.ZipFile('kyc_aml_dataset_identity.zip', 'r') as zip_ref:
    zip_ref.extractall('.')
```

### Managing Costs:

- Standard storage: ~$0.02 per GB/month
- Network egress (download): ~$0.12 per GB
- For this dataset (~100-500 MB): Less than $1/month

### Useful Commands:

```bash
# List all datasets
!gsutil ls gs://kyc-aml-dataset/

# List identity datasets
!gsutil ls gs://kyc-aml-dataset/identity/

# Download specific version
!gsutil cp gs://kyc-aml-dataset/identity/v1/kyc_aml_dataset_identity.zip .


# Delete from GCS (if needed)```

!gsutil rm gs://kyc-aml-dataset/identity/v1/kyc_aml_dataset_identity.zip# gs://kyc-aml-dataset/faces/v1/kyc_aml_dataset_faces.zip

# gs://kyc-aml-dataset/documents/v1/kyc_aml_dataset_documents.zip

# Future datasets can be organized like:# gs://kyc-aml-dataset/identity/v2/kyc_aml_dataset_identity_v2.zip