# Setup experiments 
This notebook helps users create a dedicated space on the S3 bucket, populated with a copy of the source data, so that they can run their own instance of the demo.

In [1]:
# Author: ALLIANZ NLP esg data pipeline
import os
import pathlib
from dotenv import load_dotenv
from src.data.s3_communication import S3Communication
import tempfile
from enum import Enum
import config

In [2]:
# Locate credentials.env: prefer the CREDENTIAL_DOTENV_DIR environment variable,
# then the PWD environment variable, then the default app root.
fallback_dotenv_dir = os.environ.get("PWD", "/opt/app-root/src")
dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", fallback_dotenv_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")

# A missing credentials file is skipped silently. When the file exists it is
# loaded with override=True, so its values replace variables that are already set.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

# S3 connector for the bucket with source data, configured from the environment
s3_connection_settings = dict(
    s3_endpoint_url=os.getenv("S3_ENDPOINT"),
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    s3_bucket=os.getenv("S3_BUCKET"),
)
s3c = S3Communication(**s3_connection_settings)

# Select experiment name and sample size

In [3]:
class ExperimentSampleSize(Enum):
    """Sample sizes available for the demo experiment.

    Each member's value is the number of PDFs used for the experiment:

    * ``small``  -- 1 PDF (few seconds)
    * ``medium`` -- 10 PDFs (few seconds)
    * ``large``  -- 145 PDFs (few minutes)
    """

    small = 1
    medium = 10
    large = 145


# Set the experiment name here; this will be the prefix for your experiment files on s3
EXPERIMENT_NAME = "test-demo-4"

# Set the sample size of the data you want to run the demo for:
SAMPLE_SIZE = ExperimentSampleSize.medium

# Fail fast, with an actionable message, when config.py is set to a different
# sample size than the one selected in this notebook.
assert SAMPLE_SIZE.name == config.SAMPLE_SIZE, (
    f"SAMPLE_SIZE is {SAMPLE_SIZE.name!r} in this notebook but "
    f"config.SAMPLE_SIZE is {config.SAMPLE_SIZE!r}; update one of them so they match."
)

# Data Transfer

In [4]:
# Prefixes on the bucket: the shared demo data and this experiment's own copy.
SOURCE_DATA_PREFIX = "aicoe-osc-demo/pipeline_run"
DESTINATION_DATA_PREFIX = f"{EXPERIMENT_NAME}/pipeline_run"

# Source folders are keyed by the sample count (the enum value), whereas
# destination folders are keyed by the size label (the enum name).
source_sample_root = f"{SOURCE_DATA_PREFIX}/samples_{SAMPLE_SIZE.value}"
destination_sample_root = f"{DESTINATION_DATA_PREFIX}/{SAMPLE_SIZE.name}"

SOURCE_DATA = f"{source_sample_root}/pdfs"
SOURCE_ANNOTATIONS = f"{source_sample_root}/annotations"
DESTINATION_DATA = f"{destination_sample_root}/pdfs"
DESTINATION_ANNOTATIONS = f"{destination_sample_root}/annotations"

In [5]:
# Copy files from source to user directory.
# Every (source prefix, destination prefix) pair goes through the same
# download-then-upload steps, so the pairs are listed once and handled in a loop
# instead of repeating the stanza for each one.
TRANSFER_PAIRS = (
    (SOURCE_DATA, DESTINATION_DATA),
    (SOURCE_ANNOTATIONS, DESTINATION_ANNOTATIONS),
    ("aicoe-osc-demo/kpi_mapping", f"{EXPERIMENT_NAME}/kpi_mapping"),
)

for source_prefix, destination_prefix in TRANSFER_PAIRS:
    # A fresh temporary directory per pair keeps the transfers separate;
    # it is removed automatically when the `with` block exits.
    with tempfile.TemporaryDirectory() as tmpdirname:
        s3c.download_files_in_prefix_to_dir(source_prefix, tmpdirname)
        s3c.upload_files_in_dir_to_prefix(tmpdirname, destination_prefix)

We created a space for storing outputs based on the experiment name selected by the user. **The next important step is to edit the `config.py` file so that it points to the location created in this notebook: set `DATA_S3_PREFIX` in the config to `"{EXPERIMENT_NAME}/pipeline_run/{SAMPLE_SIZE}"`, where `{SAMPLE_SIZE}` is the name of the selected sample size (for example, `"test-demo-4/pipeline_run/medium"` for the values used above).**