This notebook:
- Downloads the latest version of the glue notebooks from s3 bucket
- Changes `%connections target_env_prod, stuff` to `%connections {target_env}, stuff`
- Uploads the modified version back to s3 bucket

In [None]:
# list of etls that we care about (taken from the step function)
step_etls=[
    # 'benefits.benefits_and_enrollment',
    'benefits.company',
    'benefits.company_address',
    'benefits.company_onboarding_status',
    'benefits.hra_program_class',
    'benefits.reimbursement_rate',
    'benefits.rmb_class_zip_county',
    
    'billing.company',
    
    'election-1-etp',
    'election-2-rp and hbe',
    
    'funding',
    
    'people.address',
    'people.company_admin',
    'people.employment',
    'people.person',
    
    'ledger.recurring_premiums',
    'ledger.ledger',
    'ledger.account',
    'ledger.journal_entry - accrue allowance'
    
]
target_env='target_env_migrate' # target_env_migrate, target_env_staging, target_env_prod, target_env_test

source_folder = '0_inputs'
results_folder = '1_results'

bucket_name = 's3_bucket'
prefix = 'notebooks/'  # The prefix or "folder" path in the S3 bucket


### 1- Download the selected the notebooks from S3

In [2]:
import os
import boto3
import re

# Initialize the S3 client
s3 = boto3.client('s3')

# Define the local directory where you want to save the files
local_dir = source_folder
if not os.path.exists(local_dir):
    os.makedirs(local_dir)

# List objects in the S3 bucket with the specified prefix
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
counter = 0
total = len(step_etls)
# Check if there are contents in the response
if 'Contents' in response:
    for obj in response['Contents']:
        key = obj['Key']
        # Skip if it's just the folder name
        if key.endswith('/'):
            continue
        
        # Only download if the file has a .ipynb extension
        if key.endswith('.ipynb'):
            k='.'.join(key.split('/')[-1].split('.')[:-1]) # etl name without the path nor the extension
            # print(k)
            if k in step_etls:
                counter += 1
                # Define the local file path
                local_file_path = os.path.join(local_dir, key.split('/')[-1])
                
                # Download the file
                print(f'{counter}/{total} Downloading {key} to {local_file_path}...')
                s3.download_file(bucket_name, key, local_file_path)

print("Download complete!")


1/18 Downloading notebooks/benefits.company.ipynb to 0_inputs\benefits.company.ipynb...
2/18 Downloading notebooks/benefits.company_address.ipynb to 0_inputs\benefits.company_address.ipynb...
3/18 Downloading notebooks/benefits.company_onboarding_status.ipynb to 0_inputs\benefits.company_onboarding_status.ipynb...
4/18 Downloading notebooks/benefits.hra_program_class.ipynb to 0_inputs\benefits.hra_program_class.ipynb...
5/18 Downloading notebooks/benefits.reimbursement_rate.ipynb to 0_inputs\benefits.reimbursement_rate.ipynb...
6/18 Downloading notebooks/benefits.rmb_class_zip_county.ipynb to 0_inputs\benefits.rmb_class_zip_county.ipynb...
7/18 Downloading notebooks/billing.company.ipynb to 0_inputs\billing.company.ipynb...
8/18 Downloading notebooks/election-1-etp.ipynb to 0_inputs\election-1-etp.ipynb...
9/18 Downloading notebooks/election-2-rp and hbe.ipynb to 0_inputs\election-2-rp and hbe.ipynb...
10/18 Downloading notebooks/funding.ipynb to 0_inputs\funding.ipynb...
11/18 Downloa

### 2- Modify selected notebooks (same notebooks from the step function)

In [None]:

# Create results folder if it does not exist
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

success=True
# Iterate over all files in the source folder
# for filename in os.listdir(source_folder):

for i,filename in enumerate(step_etls):
    file_path =  f'{source_folder}//{filename}.ipynb'
    
    # Check if it is a file (not a directory)
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            notebook_content = f.read()
        
        # remove all the commented lines 
        nc=re.sub(r'#.*?\\n','',notebook_content) # uncommented notebook_contents
        # above line might cause an issue if there are markdown cell before the frist code cell
        
        # Find connections line
        connection_lines = re.findall(r'%connections.*?\\n', nc)
        
        if len(connection_lines) == 1:
            # there is a %connections line
            b_query = re.findall('bigquery', connection_lines[0])
            if len(b_query) > 0:
                # bigquery exists
                print(f'{i+1}/{total} {filename} -> %connections {target_env}, bigquery')
                result = re.sub(r'%connections.*?\\n', rf'%connections {target_env}, bigquery\\n', notebook_content)
                # print(filename)
            else:
                # no bigquery, only target_envs
                print(f'{i+1}/{total} {filename} -> %connections {target_env}')
                # print(filename)
                result = re.sub(r'%connections.*?\\n', rf'%connections {target_env}\\n', notebook_content)
        elif len(connection_lines) >1:
            print(rf'more than 1 %connections line found in {filename}')
            success=False
            break
        else:
            print(rf'no %connections line found in {filename}')
            result = notebook_content  # No changes made
            success=False
            break

        # Save the modified content to the results folder
        result_path = f'{results_folder}//{filename}.ipynb'
        with open(result_path, 'w', encoding='utf-8') as f:
            f.write(result)
    else:
        print(f'!!!!!!!!!!!!!!!!!!!! {file_path} does not exist!')
        success=False
        break
if success :
    print('Success!!✅')
else: 
    print('FAILED!!!❌')


1/19 benefits.company -> %connections target_env_migrate, bigquery
2/19 benefits.company_address -> %connections target_env_migrate
3/19 benefits.company_onboarding_status -> %connections target_env_migrate
4/19 benefits.hra_program_class -> %connections target_env_migrate
5/19 benefits.reimbursement_rate -> %connections target_env_migrate
6/19 benefits.rmb_class_zip_county -> %connections target_env_migrate
7/19 billing.company -> %connections target_env_migrate
8/19 election-1-etp -> %connections target_env_migrate
9/19 election-2-ss and hbe -> %connections target_env_migrate
10/19 election-2-rp and hbe -> %connections target_env_migrate
11/19 funding -> %connections target_env_migrate
12/19 people.address -> %connections target_env_migrate
13/19 people.company_admin -> %connections target_env_migrate
14/19 people.employment -> %connections target_env_migrate
15/19 people.person -> %connections target_env_migrate
16/19 ledger.recurring_premiums -> %connections target_env_migrate
17/1

### 2- Upload Notebooks

In [37]:
# Define the local directory containing the files to upload
local_dir = results_folder  # Change 'source' to your local folder name
if success:
    # Iterate through the files in the local directory
    for root, dirs, files in os.walk(local_dir):
        for i,file in enumerate(files):
            local_file_path = os.path.join(root, file)
            s3_key = prefix + file  # Define the S3 key (path in the bucket)
            
            # Upload the file
            print(f'{i+1}/{len(files)} Uploading {local_file_path} to s3://{bucket_name}/{s3_key}...')
            s3.upload_file(local_file_path, bucket_name, s3_key)

    print("Upload complete!")
else: 
    print('CHANGE FAILED check above cell!!')

1/19 Uploading 1_results\benefits.company.ipynb to s3://s3_bucket/notebooks/benefits.company.ipynb...
2/19 Uploading 1_results\benefits.company_address.ipynb to s3://s3_bucket/notebooks/benefits.company_address.ipynb...
3/19 Uploading 1_results\benefits.company_onboarding_status.ipynb to s3://s3_bucket/notebooks/benefits.company_onboarding_status.ipynb...
4/19 Uploading 1_results\benefits.hra_program_class.ipynb to s3://s3_bucket/notebooks/benefits.hra_program_class.ipynb...
5/19 Uploading 1_results\benefits.reimbursement_rate.ipynb to s3://s3_bucket/notebooks/benefits.reimbursement_rate.ipynb...
6/19 Uploading 1_results\benefits.rmb_class_zip_county.ipynb to s3://s3_bucket/notebooks/benefits.rmb_class_zip_county.ipynb...
7/19 Uploading 1_results\billing.company.ipynb to s3://s3_bucket/notebooks/billing.company.ipynb...
8/19 Uploading 1_results\election-1-etp.ipynb to s3://s3_bucket/notebooks/election-1-etp.ipynb...
9/19 Uploading 1_results\election-2-rp and hbe.ipynb to s3://s3_bucket