# S3 Connection Notebook

This notebook is intended to connect to S3 via JupyterHub and to upload/download the inputs and outcomes of the data extraction process.

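Assumption: You have a `credentials.env` file containing the S3 credentials (endpoint, access key, secret key and bucket name for the landing and interim buckets) in the directory given by the `CREDENTIAL_DOTENV_DIR` environment variable (falling back to `PWD`, then `/opt/app-root/src`).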

### Import needed packages and credentials

In [2]:
import os
from s3_communication import S3Communication
import pathlib
import zipfile
from dotenv import load_dotenv
from io import BytesIO
import shutil

In [3]:
# Load credentials
# The folder holding credentials.env is taken from $CREDENTIAL_DOTENV_DIR, then $PWD,
# then the default application root.
dotenv_dir = os.environ.get(
    "CREDENTIAL_DOTENV_DIR",
    os.environ.get("PWD", "/opt/app-root/src"),
)
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")
# A missing file is skipped silently; the connectors below then only see variables
# that are already present in the environment.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

### New Project

* Create a folder that this notebook can access and name it after your project (specify the project name and the folder path below).
* A certain folder structure is required (compare setup_project.py in the corporate_data_extraction repository). If such a structure already exists, you can upload it directly. If not, execute the cell below that starts with "# Create input folder tree". Afterwards, fill the input subfolders. In addition, copy settings_default.yaml from corporate_data_extraction into the project folder and name it settings.yaml.


In [4]:
# Project name and its local working folder (a sub-folder of the current working directory)
project_name = "ABC"
folder_path = pathlib.Path.cwd() / project_name
print(folder_path)

/opt/app-root/src/ABC


In [5]:
# If the new project was uploaded as a zip file with an existing structure:
# unpack the archive (expected inside the project folder) into the project folder.
zip_name = "Downloads.zip"
with zipfile.ZipFile(str(folder_path / zip_name), mode="r") as archive:
    archive.extractall(path=str(folder_path))

In [5]:
# Create input folder tree
# Folders that already exist are left untouched (exist_ok=True).
os.makedirs(folder_path, exist_ok=True)
for sub_dir in (
    "input",
    "interim",
    "output",
    "models",
    "input/pdfs",
    "input/kpi_mapping",
    "input/annotations",
    "input/pdfs/training",
    "input/pdfs/inference",
):
    os.makedirs(folder_path / sub_dir, exist_ok=True)

### Create S3 connectors

In [5]:
# init s3 connector
# Connection settings come from the LANDING_* environment variables.
s3c = S3Communication(
    s3_endpoint_url=os.environ.get("LANDING_AWS_ENDPOINT"),
    aws_access_key_id=os.environ.get("LANDING_AWS_ACCESS_KEY"),
    aws_secret_access_key=os.environ.get("LANDING_AWS_SECRET_KEY"),
    s3_bucket=os.environ.get("LANDING_AWS_BUCKET_NAME"),
)

In [6]:
# Second connector; connection settings come from the INTERIM_* environment variables.
s3c_interim = S3Communication(
    s3_endpoint_url=os.environ.get("INTERIM_AWS_ENDPOINT"),
    aws_access_key_id=os.environ.get("INTERIM_AWS_ACCESS_KEY"),
    aws_secret_access_key=os.environ.get("INTERIM_AWS_SECRET_KEY"),
    s3_bucket=os.environ.get("INTERIM_AWS_BUCKET_NAME"),
)

### Existing content on S3

In [29]:
# Print the key of every object in the s3c bucket that starts with the project prefix
project_prefix = f"corporate_data_extraction_projects/{project_name}"
my_bucket = s3c.s3_resource.Bucket(name=s3c.bucket)
matching_objects = my_bucket.objects.filter(Prefix=project_prefix)
for s3_object in matching_objects:
    print(s3_object.key)

corporate_data_extraction_projects/ABC/data/input/annotations/test_annotations.xlsx
corporate_data_extraction_projects/ABC/data/input/kpi_mapping/kpi_mapping.csv
corporate_data_extraction_projects/ABC/data/input/pdfs/inference/AstraZeneca_2019_SustainabilityDataSummary.pdf
corporate_data_extraction_projects/ABC/data/input/pdfs/inference/AstraZeneca_2020_SustainabilityDataSummary.pdf
corporate_data_extraction_projects/ABC/data/input/pdfs/inference/AstraZeneca_2021_SustainabilityDataSummary.pdf
corporate_data_extraction_projects/ABC/data/input/pdfs/inference/Atlassian_2022_SustainabilityReport.pdf
corporate_data_extraction_projects/ABC/data/input/pdfs/inference/BayerAG_2019_SustainabilityReport.pdf
corporate_data_extraction_projects/ABC/data/input/pdfs/inference/BayerAG_2020_SustainabilityReport.pdf
corporate_data_extraction_projects/ABC/data/input/pdfs/inference/BayerAG_2021_SustainabilityReport.pdf
corporate_data_extraction_projects/ABC/data/input/pdfs/inference/Casio_2021_Sustainabili

In [22]:
# Delete ALL objects stored under a key prefix (not just a single file).
# WARNING: destructive - every matching object is removed from the bucket.
# The prefix follows the active project and ends with "/" so that sibling prefixes
# sharing the same beginning (e.g. ".../data/output_old") are not matched by accident.
delete_prefix = f"corporate_data_extraction_projects/{project_name}/data/output/"
my_bucket = s3c.s3_resource.Bucket(name=s3c.bucket)
for s3_object in my_bucket.objects.filter(Prefix=delete_prefix):
    # Log each key so the deletion can be reviewed afterwards
    print(f"Deleting {s3_object.key}")
    s3_object.delete()

In [27]:
# Print the key of every object in the s3c_interim bucket that starts with the project prefix
project_prefix = f"corporate_data_extraction_projects/{project_name}"
my_bucket = s3c_interim.s3_resource.Bucket(name=s3c_interim.bucket)
matching_objects = my_bucket.objects.filter(Prefix=project_prefix)
for s3_object in matching_objects:
    print(s3_object.key)

corporate_data_extraction_projects/ABC/data/interim/ml/annotations/aggregated_annotation.csv
corporate_data_extraction_projects/ABC/data/interim/ml/extraction/AstraZeneca_2019_SustainabilityDataSummary.json
corporate_data_extraction_projects/ABC/data/interim/ml/extraction/AstraZeneca_2020_SustainabilityDataSummary.json
corporate_data_extraction_projects/ABC/data/interim/ml/extraction/AstraZeneca_2021_SustainabilityDataSummary.json
corporate_data_extraction_projects/ABC/data/interim/ml/extraction/Atlassian_2022_SustainabilityReport.json
corporate_data_extraction_projects/ABC/data/interim/ml/extraction/BayerAG_2019_SustainabilityReport.json
corporate_data_extraction_projects/ABC/data/interim/ml/extraction/BayerAG_2020_SustainabilityReport.json
corporate_data_extraction_projects/ABC/data/interim/ml/extraction/BayerAG_2021_SustainabilityReport.json
corporate_data_extraction_projects/ABC/data/interim/ml/extraction/Casio_2021_SustainabilityReport.json
corporate_data_extraction_projects/ABC/d

In [30]:
# Download a folder
# Source: the S3 prefix below; destination: the local output folder of the project.
prefix = "corporate_data_extraction_projects/ABC/data/output/KPI_EXTRACTION/joined_ml_rb"
dest_folder = f"/opt/app-root/src/{project_name}/output"
s3c.download_files_in_prefix_to_dir(prefix, dest_folder)

### Download content from S3

##### Get some model from S3

In [8]:
# Local destination for downloaded models
model_dest_path = f"/opt/app-root/src/{project_name}/models"
# Both model archives are stored under the same S3 prefix
model_kpi_prefix = model_rel_prefix = "aicoe-osc-demo/saved_models"
# File names of the KPI extraction and relevance model archives
model_kpi_name, model_rel_name = "KPI_EXTRACTION.zip", "RELEVANCE.zip"

In [12]:
# Show only objects which satisfy our prefix (here: the saved-models prefix)
my_bucket = s3c.s3_resource.Bucket(name=s3c.bucket)
model_objects = my_bucket.objects.filter(Prefix=model_kpi_prefix)
for s3_object in model_objects:
    print(s3_object.key)

aicoe-osc-demo/saved_models/
aicoe-osc-demo/saved_models/KPI_EXTRACTION.zip
aicoe-osc-demo/saved_models/RELEVANCE.zip
aicoe-osc-demo/saved_models/demo_train_kpi_infer/KPI_EXTRACTION/language_model.bin
aicoe-osc-demo/saved_models/demo_train_kpi_infer/KPI_EXTRACTION/language_model_config.json
aicoe-osc-demo/saved_models/demo_train_kpi_infer/KPI_EXTRACTION/merges.txt
aicoe-osc-demo/saved_models/demo_train_kpi_infer/KPI_EXTRACTION/prediction_head_0.bin
aicoe-osc-demo/saved_models/demo_train_kpi_infer/KPI_EXTRACTION/prediction_head_0_config.json
aicoe-osc-demo/saved_models/demo_train_kpi_infer/KPI_EXTRACTION/processor_config.json
aicoe-osc-demo/saved_models/demo_train_kpi_infer/KPI_EXTRACTION/special_tokens_map.json
aicoe-osc-demo/saved_models/demo_train_kpi_infer/KPI_EXTRACTION/tokenizer_config.json
aicoe-osc-demo/saved_models/demo_train_kpi_infer/KPI_EXTRACTION/vocab.json
aicoe-osc-demo/saved_models/demo_train_kpi_infer/relevance_fine_tune_demo_scores.csv
aicoe-osc-demo/saved_models/icdar

In [11]:
# Separate local folders for the relevance (REL) and KPI extraction (KPI) model archives
model_dest_rel_folder = f"{model_dest_path}/REL"
model_dest_kpi_folder = f"{model_dest_path}/KPI"
for model_folder in (model_dest_rel_folder, model_dest_kpi_folder):
    os.makedirs(model_folder, exist_ok=True)
# Full local file paths of the two archives
model_dest_rel_path = f"{model_dest_rel_folder}/{model_rel_name}"
model_dest_kpi_path = f"{model_dest_kpi_folder}/{model_kpi_name}"

In [10]:
# Download relevance model
s3c.download_file_from_s3(
    model_dest_rel_path,
    model_rel_prefix,
    model_rel_name,
)

In [12]:
# Download inference model
s3c.download_file_from_s3(
    model_dest_kpi_path,
    model_kpi_prefix,
    model_kpi_name,
)

##### Get additional files like kpi_mapping or annotations

In [15]:
# S3 prefix and file name of the KPI mapping, and the local path it is stored at
kpi_prefix = "aicoe-osc-demo/kpi_mapping"
kpi_name = "kpi_mapping.csv"
kpi_dest_path = f"/opt/app-root/src/{project_name}/input/kpi_mapping/{kpi_name}"

In [16]:
# Download kpi file
s3c.download_file_from_s3(
    kpi_dest_path,
    kpi_prefix,
    kpi_name,
)

### Next we upload the new project to S3

In [9]:
# Variables needed in the following:
# S3 key prefixes for the data and the models of the project
prefix_data = f"corporate_data_extraction_projects/{project_name}/data"
prefix_models = f"corporate_data_extraction_projects/{project_name}/models"

##### Upload settings 

In [19]:
# Uploading settings file first
response = s3c.upload_file_to_s3(
    filepath=str(folder_path.joinpath("settings.yaml")),
    s3_prefix=prefix_data,
    s3_key="settings.yaml",
)
# Anything other than HTTP 200 in the response metadata is reported as a failure
if not response["ResponseMetadata"]["HTTPStatusCode"] == 200:
    print("Error while uploading")

##### Upload folder

In [37]:
# Upload local folders to their S3 prefixes, in this order:
# annotations, kpi_mapping, pdfs for training, inference model, relevance model
upload_plan = [
    (folder_path / "input/annotations", prefix_data + "/input/annotations"),
    (folder_path / "input/kpi_mapping", prefix_data + "/input/kpi_mapping"),
    (folder_path / "input/pdfs/training", prefix_data + "/input/pdfs/training"),
    (folder_path / "models" / "KPI", prefix_models + "/KPI_EXTRACTION/Text"),
    (folder_path / "models" / "REL", prefix_models + "/RELEVANCE/Text"),
]
for local_dir, remote_prefix in upload_plan:
    s3c.upload_files_in_dir_to_prefix(source_dir=str(local_dir), s3_prefix=remote_prefix)

In [14]:
# Upload pdfs for inference
inference_dir = folder_path / "input/pdfs/inference"
s3c.upload_files_in_dir_to_prefix(
    source_dir=str(inference_dir),
    s3_prefix=prefix_data + "/input/pdfs/inference",
)

Create a zip file from the folder

In [48]:
# Zip the content of the project folder into /opt/app-root/src/<project_name>.zip;
# the path of the created archive is shown as the cell output.
shutil.make_archive(
    base_name=f"/opt/app-root/src/{project_name}",
    format="zip",
    root_dir=folder_path,
)

'/opt/app-root/src/ESG.zip'

### Check uploads

In [49]:
# Show only objects which satisfy our prefix (s3c bucket, current project)
my_bucket = s3c.s3_resource.Bucket(name=s3c.bucket)
uploaded_objects = my_bucket.objects.filter(Prefix=f"corporate_data_extraction_projects/{project_name}")
for s3_object in uploaded_objects:
    print(s3_object.key)

corporate_data_extraction_projects/ESG/data/input/annotations/20201030 1Qbit aggregated_annotations_needs_correction.xlsx
corporate_data_extraction_projects/ESG/data/input/kpi_mapping/kpi_mapping.csv
corporate_data_extraction_projects/ESG/data/input/pdfs/inference/Orsted-ESG-performance-report-2021.pdf
corporate_data_extraction_projects/ESG/data/input/pdfs/inference/Orsted-sustainability-report-2022.pdf
corporate_data_extraction_projects/ESG/data/output/KPI_EXTRACTION/joined_ml_rb/1691745425_Orsted-ESG-performance-report-2021.pdf.csv
corporate_data_extraction_projects/ESG/data/output/KPI_EXTRACTION/joined_ml_rb/1691745425_Orsted-sustainability-report-2022.pdf.csv
corporate_data_extraction_projects/ESG/data/output/KPI_EXTRACTION/ml/Text/Orsted-ESG-performance-report-2021_predictions_kpi.csv
corporate_data_extraction_projects/ESG/data/output/KPI_EXTRACTION/ml/Text/Orsted-sustainability-report-2022_predictions_kpi.csv
corporate_data_extraction_projects/ESG/data/output/KPI_EXTRACTION/rb/Or

In [41]:
# Show only objects which satisfy our prefix (s3c_interim bucket, current project)
my_bucket = s3c_interim.s3_resource.Bucket(name=s3c_interim.bucket)
interim_objects = my_bucket.objects.filter(Prefix=f"corporate_data_extraction_projects/{project_name}")
for s3_object in interim_objects:
    print(s3_object.key)

corporate_data_extraction_projects/ESG/data/interim/ml/annotations/aggregated_annotation.csv
corporate_data_extraction_projects/ESG/data/interim/ml/extraction/Orsted-ESG-performance-report-2021.json
corporate_data_extraction_projects/ESG/data/interim/ml/extraction/Orsted-sustainability-report-2022.json


In [150]:
# Delete specific files
# file_prefix = 'corporate_data_extraction_projects/data'
# my_bucket = s3c.s3_resource.Bucket(name=s3c.bucket)
# for objects in my_bucket.objects.filter(Prefix=file_prefix):
#    print(objects.key)
#    s3c.s3_resource.Object(s3c.bucket, objects.key).delete()

### Download results

In [None]:
# Download one file
# The prefix follows the active project so that source and destination always belong
# to the same project. Set file_name to the name of the result file to download.
prefix = f"corporate_data_extraction_projects/{project_name}/data/output/KPI_EXTRACTION/joined_ml_rb"
file_name = ""
dest_path = f"/opt/app-root/src/{project_name}/output/{file_name}"
if file_name:
    s3c.download_file_from_s3(dest_path, prefix, file_name)
else:
    # With an empty name dest_path points at the output folder itself, so skip the call
    print("Nothing downloaded: set file_name to the name of the file to download")

In [44]:
# Download a folder
# The prefix follows the active project so that the results stored in
# <project_name>/output always belong to that project.
prefix = f"corporate_data_extraction_projects/{project_name}/data/output/KPI_EXTRACTION/joined_ml_rb"
dest_folder = f"/opt/app-root/src/{project_name}/output"
s3c.download_files_in_prefix_to_dir(
    prefix,
    dest_folder,
)

### Store some models in S3

If you have an unzipped model, follow the cells below to create a zip archive and upload it.

In [16]:
def zipdir(path, ziph):
    """Add every file below ``path`` to an open zip archive.

    Archive names are relative to the parent of ``path``, so the entries keep the
    name of the folder itself as their first component. Only files are written;
    directories without files produce no entry.

    Args:
        path: Folder whose files are added (walked recursively).
        ziph: Open, writable ``zipfile.ZipFile`` handle.
    """
    archive_root = os.path.join(path, "..")
    for current_dir, _sub_dirs, file_names in os.walk(path):
        for file_name in file_names:
            file_path = os.path.join(current_dir, file_name)
            ziph.write(file_path, os.path.relpath(file_path, archive_root))


# Pack the files of the relevance-roberta/ folder into relevance-roberta.zip (deflate compression)
with zipfile.ZipFile("relevance-roberta.zip", mode="w", compression=zipfile.ZIP_DEFLATED) as model_zip:
    zipdir("relevance-roberta/", model_zip)

In [19]:
# Upload the zipped model from the current working directory.
# The local file name must match the archive written by the zip cell and the S3 key:
# "relevance-roberta.zip" (hyphen). The former "relevance_roberta.zip" (underscore)
# pointed at a file that is never created.
model_path = os.path.join(os.getcwd(), "relevance-roberta.zip")
response = s3c.upload_file_to_s3(
    filepath=model_path,
    s3_prefix="corporate_data_extraction_projects/base_model",
    s3_key="relevance-roberta.zip",
)

In [34]:
# Delete ALL objects stored under the models prefix of the active project
# (not just a single file).
# WARNING: destructive - every matching object is removed from the bucket.
# The prefix ends with "/" so that sibling prefixes sharing the same beginning
# (e.g. ".../models_old") are not matched by accident.
delete_prefix = f"corporate_data_extraction_projects/{project_name}/models/"
my_bucket = s3c.s3_resource.Bucket(name=s3c.bucket)
for s3_object in my_bucket.objects.filter(Prefix=delete_prefix):
    # Log each key so the deletion can be reviewed afterwards
    print(f"Deleting {s3_object.key}")
    s3_object.delete()

Test if upload worked

In [46]:
# Show only objects which satisfy our prefix (here: the base_model prefix)
my_bucket = s3c.s3_resource.Bucket(name=s3c.bucket)
base_model_objects = my_bucket.objects.filter(Prefix="corporate_data_extraction_projects/base_model")
for s3_object in base_model_objects:
    print(s3_object.key)

In [10]:
# relevance roberta is already on S3 so we just download it
# (the archive is stored in the current working directory)
s3c.download_file_from_s3(
    f"{os.getcwd()}/relevance-roberta.zip",
    "corporate_data_extraction_projects/base_model",
    "relevance-roberta.zip",
)