In [None]:
!pip install papermill tqdm google-cloud-storage

import papermill as pm
from tqdm.auto import tqdm
import time
from google.cloud import storage
from google.colab import auth
import resource

# --- Pipeline configuration -------------------------------------------------
# Bucket that holds the notebooks, plus the folders (object-name prefixes)
# the notebooks are read from and the executed copies are written to.
bucket_name = 'prd-marketshare'
input_folder = '07_notebooks/01_notebooks_input'
output_folder = '07_notebooks/02_notebooks_output'

# Notebooks to execute, listed in the order they are run.
notebooks_to_run = [
    "01 Cleaning.ipynb",
    "02 PIHS VL.ipynb",
    "03 PIHS VL PS.ipynb",
    "04.1 PIHS VL PS SD - EMOT.ipynb",
    "04.2 PIHS VL PS SD - OBC.ipynb",
    "05 Transformation.ipynb",
    "06 MELTED LV PRICES.ipynb",
    "07 MELTED HV PRICES.ipynb",
    "08 MARKETSHARE.ipynb",
]

# --- Google Cloud Storage access --------------------------------------------
# Authenticate the Colab user first, then create the client and fetch the
# bucket handle used by the download/upload helpers below.
auth.authenticate_user()
gcs_client = storage.Client()
bucket = gcs_client.get_bucket(bucket_name)

# Function to download a file from GCS
def download_from_gcs(bucket, source_blob_name: str, destination_file_name: str) -> None:
    """Copy one object from a GCS bucket to a local file.

    Args:
        bucket: Bucket handle exposing ``blob(name)``.
        source_blob_name: Name (path) of the object inside the bucket.
        destination_file_name: Local path the object's contents are written to.
    """
    bucket.blob(source_blob_name).download_to_filename(destination_file_name)

# Function to upload a file to GCS
def upload_to_gcs(bucket, source_file_name: str, destination_blob_name: str) -> None:
    """Copy one local file into a GCS bucket.

    Args:
        bucket: Bucket handle exposing ``blob(name)``.
        source_file_name: Local path of the file to upload.
        destination_blob_name: Name (path) the object gets inside the bucket.
    """
    bucket.blob(destination_blob_name).upload_from_filename(source_file_name)

# Run every notebook in order, with a tqdm progress bar: download it from GCS,
# execute it in place with papermill, report the memory figure, and upload the
# executed copy to the output folder.
total_notebooks = len(notebooks_to_run)
progress = tqdm(notebooks_to_run, desc='Executing Notebooks', unit='notebook')
for position, notebook in enumerate(progress, start=1):
    input_path = f"{input_folder}/{notebook}"
    output_path = f"{output_folder}/{notebook}"

    # Download the notebook from GCS into the current working directory
    download_from_gcs(bucket, input_path, notebook)

    # Print which notebook is being executed
    print(f"Starting execution of {notebook}...")

    # papermill executes the notebook in a separate kernel process, so the
    # memory has to be read from the child-process statistics; RUSAGE_SELF
    # would only describe this driver process.
    # NOTE: ru_maxrss is a high-water mark (for RUSAGE_CHILDREN: the largest
    # child reaped so far), so the reported increase is 0 whenever this
    # notebook peaks below an earlier child process.
    peak_memory_before = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    try:
        pm.execute_notebook(
            input_path=notebook,
            output_path=notebook,  # Overwrite the local copy of the notebook
            parameters=dict(),  # Pass additional parameters if needed
        )
    except pm.PapermillExecutionError:
        # papermill saves the output notebook (including the traceback) before
        # raising; upload it so the failure can be inspected, then abort the
        # run because the remaining notebooks are not executed after an error.
        upload_to_gcs(bucket, notebook, output_path)
        print(f"Execution of {notebook} failed; uploaded the partial notebook to {output_path}.")
        raise
    peak_memory_after = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    memory_difference = (peak_memory_after - peak_memory_before) / 1024  # Convert kilobytes to megabytes

    # Print the memory usage in megabytes
    print(f"Peak memory increase for {notebook}: {memory_difference:.2f} megabytes")

    # Upload the executed notebook back to GCS in the output folder
    upload_to_gcs(bucket, notebook, output_path)

    # Print confirmation that the notebook has been processed and uploaded
    print(f"Finished execution and uploaded {notebook} to {output_path}.")

    # Optional: wait briefly before the next notebook; there is nothing to
    # wait for after the last one.
    if position < total_notebooks:
        time.sleep(10)
