In [1]:
# Automatically reload imported modules whenever their source files change,
# so edits to the local `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sagemaker
import os
import time
import json
import math
import uuid
import pandas as pd
import dask.dataframe as dd
import boto3

from urllib.parse import urlparse

from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.estimator import Estimator
from sagemaker.session import TrainingInput
from sagemaker.model import Model
from sagemaker.pytorch import PyTorch, PyTorchModel

from src.utils import convert_to_ancestry_format
from src.model_registry import get_model_registry, AncestryModel

sagemaker.__version__

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In a future release, Dask DataFrame will use a new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 


    # via Python

    # via CLI


  import dask.dataframe as dd


'2.232.2'

# Configuration

## Environment

In [3]:
# S3 bucket used by this project.
BUCKET = "rnd-sandbox-datasets"

# Container image used for the processing jobs and the base-layer model.
BASE_IMAGE_URI = (
    "778090103881.dkr.ecr.us-east-1.amazonaws.com"
    "/sagemaker-training-containers/ancestry-recomb:latest"
)

# IAM role that SageMaker assumes when running the jobs.
ROLE = "arn:aws:iam::778090103881:role/ancestry_sagemaker_execution"

## Input Arguments

In [4]:
# Name of this inference run; it is embedded in the output and model paths below.
NAME = "east-indonesians-alt7"

# S3 prefix whose immediate sub-prefixes are enumerated as the samples to process.
SAMPLES_SET_PATH = "s3://rnd-sandbox-datasets/inference/cohorts/east-indonesians/"

# S3 prefix under which all intermediate and final results of this run are written.
OUTPUT_DIR_PATH = "s3://rnd-sandbox-datasets/inference/results/" + NAME + "/"

# Location and version of the model in the model registry.
MODEL_DIR_PATH = "s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/" + NAME + "/"
MODEL_VERSION = "0.01"

# Number of instances to use per model (also the number of input shards written).
INSTANCE_COUNT = 5

In [5]:
class S3Path:
    """Split an ``s3://bucket/key`` URL and derive the locations used by this notebook.

    Attributes:
        url: The URL exactly as passed in.
        path: The key portion of the URL without the leading "/".
        bucket: The bucket name (the URL's host component).
        sagemaker_registry_path: Key prefix of the model registry inside the bucket.
    """

    def __init__(self, url: str) -> None:
        parsed_url = urlparse(url)
        self.url = url
        # Drop the leading "/" so the value can be used directly as an S3 key prefix.
        self.path = parsed_url.path[1:]
        self.bucket = parsed_url.hostname
        self.sagemaker_registry_path = "sagemaker/model-registry/lai/"

    @property
    def input_files(self) -> str:
        """Key prefix (no bucket, no scheme) of the sharded input CSV files."""
        return os.path.join(self.path, "input-files/")

    @property
    def window_results(self) -> str:
        """Full ``s3://`` URL of the windowized (preprocessing) results."""
        return os.path.join(self.base, self.path, "window-results/")

    @property
    def base_results(self) -> str:
        """Full ``s3://`` URL of the base-layer inference results."""
        return os.path.join(self.base, self.path, "base-results/")

    @property
    def smooth_results(self) -> str:
        """Full ``s3://`` URL of the smoothing-layer inference results."""
        return os.path.join(self.base, self.path, "smooth-results/")

    @property
    def model_name(self) -> str:
        """Model name: ``path`` relative to the registry prefix, without outer slashes.

        The registry prefix is removed as a whole prefix. ``str.lstrip`` must not
        be used here: it treats its argument as a set of characters and would also
        eat the beginning of any model name made of those characters
        (e.g. "smooth/x" would become "h/x"). If ``path`` does not start with the
        registry prefix it is returned unchanged apart from the slash stripping.
        """
        name = self.path
        if name.startswith(self.sagemaker_registry_path):
            name = name[len(self.sagemaker_registry_path):]
        return name.strip("/")

    @property
    def model_base_path(self) -> str:
        """Full ``s3://`` URL of the model registry root in this bucket."""
        return f"{self.base}{self.sagemaker_registry_path}"

    @property
    def base(self) -> str:
        """``s3://<bucket>/`` — the bucket root, with a trailing slash."""
        return f"s3://{self.bucket}/"


# Parse the configured S3 URLs once; the rest of the notebook reads bucket,
# key prefix and derived result locations from these three objects.
SAMPLES_SET = S3Path(SAMPLES_SET_PATH)  # where the input samples live
OUTPUT_DIR = S3Path(OUTPUT_DIR_PATH)  # root for input shards and all results of this run
MODEL_DIR = S3Path(MODEL_DIR_PATH)  # model location inside the model registry

## Split Input Files

In [6]:
client = boto3.client("s3")


def write_file(data, idx):
    """Upload one input shard to the run's ``input-files/`` prefix.

    Args:
        data: CSV text to store as the object body.
        idx: Shard number; used in the object name ``input-data-<idx>.csv``.
    """
    bucket = OUTPUT_DIR.bucket
    key = os.path.join(OUTPUT_DIR.input_files, f"input-data-{idx}.csv")
    client.put_object(Body=data, Bucket=bucket, Key=key)
    print(f"upload: s3://{bucket}/{key}")


# Enumerate the immediate sub-prefixes ("folders") below the samples prefix.
# A paginator is used because a single list call returns at most 1000 entries;
# without it larger cohorts would be silently truncated.
paginator = client.get_paginator("list_objects_v2")
folders = []
for page in paginator.paginate(
    Bucket=SAMPLES_SET.bucket,
    Prefix=SAMPLES_SET.path,
    Delimiter="/",
):
    # "CommonPrefixes" is absent from a page that contains no sub-prefixes.
    folders.extend(page.get("CommonPrefixes", []))

if not folders:
    raise ValueError(f"No sample folders found under {SAMPLES_SET.url}")

# Spread the folders over at most INSTANCE_COUNT CSV files. Each row is
# "<folder name>,<bucket>,<folder prefix>", and files are numbered from 1.
lines_per_file = math.ceil(len(folders) / INSTANCE_COUNT)
for counter, start in enumerate(range(0, len(folders), lines_per_file), start=1):
    chunk = folders[start:start + lines_per_file]
    lines = "".join(
        ",".join([o["Prefix"].split("/")[-2], SAMPLES_SET.bucket, o["Prefix"]]) + "\n"
        for o in chunk
    )
    write_file(lines, counter)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


upload: s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/input-files/input-data-1.csv
upload: s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/input-files/input-data-2.csv
upload: s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/input-files/input-data-3.csv
upload: s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/input-files/input-data-4.csv
upload: s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/input-files/input-data-5.csv


## Read the Model

In [7]:
# Look up the model in the registry by registry root, model name and version.
ANCESTRY_MODEL = AncestryModel(
    MODEL_DIR.model_base_path,
    MODEL_DIR.model_name,
    MODEL_VERSION,
)
# The sub-models are iterated by name in every stage below.
# NOTE(review): `_submodels` is a private attribute of AncestryModel — confirm
# whether the class offers a public accessor that should be used instead.
SUBMODELS = ANCESTRY_MODEL._submodels

# Preprocess

Convert dataset into windows.

In [8]:
# Start one SageMaker processing job per sub-model. The jobs are launched
# without waiting; their names are collected in `jobs` so that the next cell
# can poll them until they finish.
jobs = []
for SUBMODEL_NAME in SUBMODELS:
    print(f"Running submodel {SUBMODEL_NAME}")

    # S3 location that receives this sub-model's windowized output
    WINDOW_OUTPUT_DIR = f"{OUTPUT_DIR.window_results}{SUBMODEL_NAME}/"
    print(WINDOW_OUTPUT_DIR)

    # registry entry for this sub-model; supplies the base model URI used below
    model_registry = get_model_registry(ANCESTRY_MODEL, [SUBMODEL_NAME])
    base_job_name = "vcfwindowizer"
    # explicit, unique job name so the job can be looked up by name later
    job_name = f"{base_job_name}-{uuid.uuid4()}"
    jobs.append(job_name)

    processor = ScriptProcessor(
        command=["python3"],
        image_uri=BASE_IMAGE_URI,
        role=ROLE,
        instance_count=INSTANCE_COUNT,
        instance_type="ml.t3.2xlarge",
        base_job_name=base_job_name,
    )
    processor.run(
        code='src/vcf_preprocess.py',
        inputs=[
            # input CSV shards written by the "Split Input Files" cell;
            # sharded by S3 key so the files are divided among the instances
            ProcessingInput(
                source=f"{OUTPUT_DIR.base}{OUTPUT_DIR.input_files}",
                destination='/opt/ml/processing/input/data/',
                s3_data_distribution_type="ShardedByS3Key",
            ),
            # base model artifact; every instance gets a full copy
            ProcessingInput(
                source=model_registry["sub-models"][SUBMODEL_NAME]["model"]["base_model_uri"],
                destination='/opt/ml/processing/input/model/',
                s3_data_distribution_type="FullyReplicated",
            ),
        ],
        outputs=[
            ProcessingOutput(
                source='/opt/ml/processing/output/data/',
                destination=WINDOW_OUTPUT_DIR,
            ),
        ],
        # do not stream logs or block here; completion is checked in the next cell
        logs=False,
        wait=False,
        job_name=job_name
    )

INFO:root:register chr1.2


Running submodel chr1.2
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/window-results/chr1.2/


INFO:sagemaker:Creating processing-job with name vcfwindowizer-1732640a-81f3-4853-825f-382b6e043cb6
INFO:root:register chr11.12


Running submodel chr11.12
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/window-results/chr11.12/


INFO:sagemaker:Creating processing-job with name vcfwindowizer-13337979-a13c-4ae2-b144-5872e77b1e22
INFO:root:register chr13.14


Running submodel chr13.14
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/window-results/chr13.14/


INFO:sagemaker:Creating processing-job with name vcfwindowizer-043b3502-9f45-4735-b203-84e180289fef
INFO:root:register chr15.16


Running submodel chr15.16
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/window-results/chr15.16/


INFO:sagemaker:Creating processing-job with name vcfwindowizer-4db9b580-e197-44e2-bf69-8d3b6d8abc48
INFO:root:register chr17.18


Running submodel chr17.18
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/window-results/chr17.18/


INFO:sagemaker:Creating processing-job with name vcfwindowizer-9087ce4e-5d7c-416e-bf32-97f7c91d0f33
INFO:root:register chr19.20.21.22


Running submodel chr19.20.21.22
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/window-results/chr19.20.21.22/


INFO:sagemaker:Creating processing-job with name vcfwindowizer-2da63038-b73e-412a-81d0-41632d6ff9a3
INFO:root:register chr3.4


Running submodel chr3.4
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/window-results/chr3.4/


INFO:sagemaker:Creating processing-job with name vcfwindowizer-8f7fe007-eabe-4c56-b073-8540e646c2eb
INFO:root:register chr5.6


Running submodel chr5.6
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/window-results/chr5.6/


INFO:sagemaker:Creating processing-job with name vcfwindowizer-cee01c04-5a1b-41f4-93ee-91c897254705
INFO:root:register chr7.8


Running submodel chr7.8
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/window-results/chr7.8/


INFO:sagemaker:Creating processing-job with name vcfwindowizer-6b366fd9-933b-4b3b-8821-91db3f2f130c
INFO:root:register chr9.10


Running submodel chr9.10
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/window-results/chr9.10/


INFO:sagemaker:Creating processing-job with name vcfwindowizer-022305d4-4ba3-4499-bc9a-2a570cd5fdc0


### Waiting for results from preprocessing step

In [9]:
# Block until every processing job listed in `jobs` has reached a terminal
# state, then fail loudly if any of them did not complete successfully.
sagemaker_client = boto3.client("sagemaker")
POLL_INTERVAL_SECONDS = 60

errors = []  # names of jobs that ended as "Failed" or "Stopped"
failure_reasons = {}  # job name -> reason reported by SageMaker

while jobs:
    print(f"{len(jobs)} left")
    pending = []
    for job_name in jobs:
        description = sagemaker_client.describe_processing_job(
            ProcessingJobName=job_name,
        )
        status = description["ProcessingJobStatus"]

        if status in ("InProgress", "Stopping"):
            pending.append(job_name)
        elif status in ("Failed", "Stopped"):
            errors.append(job_name)
            failure_reasons[job_name] = description.get("FailureReason", "no failure reason reported")
        # any other status means the job finished and needs no further polling

    # Check all jobs in one pass and sleep once per pass, rather than
    # sleeping once for every job that is still running.
    jobs[:] = pending
    if jobs:
        time.sleep(POLL_INTERVAL_SECONDS)

if errors:
    # Report every failed job at once instead of only the first one.
    details = "; ".join(f"{name} ({failure_reasons[name]})" for name in errors)
    raise RuntimeError(f"{len(errors)} processing job(s) failed or were stopped: {details}")

print("Next step")

10 left
9 left
8 left
7 left
6 left
5 left
4 left
3 left
2 left
1 left
Next step


# Base Layer Inference

In [10]:
# Start one batch transform job per sub-model that runs the base-layer model
# over the windowized data. Jobs are launched without waiting; their names are
# collected in `jobs` for the polling cell that follows.
jobs = []
for SUBMODEL_NAME in SUBMODELS:
    print(f"Running submodel {SUBMODEL_NAME}")
    # explicit, unique job name so the job can be looked up by name later
    job_name = f"base-{uuid.uuid4()}"
    jobs.append(job_name)

    # S3 location that receives this sub-model's base-layer predictions
    BASE_OUTPUT_DIR = f"{OUTPUT_DIR.base_results}{SUBMODEL_NAME}/"
    print(BASE_OUTPUT_DIR)

    # create model from the project image and this sub-model's base artifact
    model_registry = get_model_registry(ANCESTRY_MODEL, [SUBMODEL_NAME])
    model = Model(
        image_uri=BASE_IMAGE_URI,
        model_data=model_registry["sub-models"][SUBMODEL_NAME]["model"]["base_model_uri"],
        role=ROLE,
    )

    # create transformer
    # NOTE(review): the two inline notes below refer to ml.m5.2xlarge while the
    # code uses ml.m5.12xlarge with a payload of 24 — confirm they are still accurate.
    transformer = model.transformer(
        instance_type="ml.m5.12xlarge", # NOTE: for more than 2 chromosomes use ml.m5.2xlarge
        instance_count=INSTANCE_COUNT,
        output_path=BASE_OUTPUT_DIR,
        strategy='MultiRecord',
        max_payload=24,  # NOTE: for ml.m5.2xlarge use 4 as the payload
        max_concurrent_transforms=1,
        env={'MODEL_SERVER_TIMEOUT': '3600'},
    )

    # transform data: the input is the output of the preprocessing stage
    WINDOW_OUTPUT_DIR = f"{OUTPUT_DIR.window_results}{SUBMODEL_NAME}/"
    transformer.transform(
        data=WINDOW_OUTPUT_DIR,
        content_type='text/csv',
        split_type='Line',
        # no retries, and the invocation timeout matches MODEL_SERVER_TIMEOUT above
        model_client_config={
            'InvocationsMaxRetries': 0,
            'InvocationsTimeoutInSeconds': 3600,
        },
        job_name=job_name,
        # do not stream logs or block here; completion is checked in the next cell
        logs=False,
        wait=False,
    )

INFO:root:register chr1.2


Running submodel chr1.2
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/base-results/chr1.2/


INFO:sagemaker:Creating model with name: ancestry-recomb-2024-11-06-15-41-45-501
INFO:sagemaker:Creating transform job with name: base-4470a53d-1dde-495e-a74d-2f064fe6223c
INFO:root:register chr11.12
INFO:sagemaker:Creating model with name: ancestry-recomb-2024-11-06-15-41-46-611


Running submodel chr11.12
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/base-results/chr11.12/


INFO:sagemaker:Creating transform job with name: base-f03af665-7375-4703-8e61-7cb69143dd49
INFO:root:register chr13.14
INFO:sagemaker:Creating model with name: ancestry-recomb-2024-11-06-15-41-47-749


Running submodel chr13.14
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/base-results/chr13.14/


INFO:sagemaker:Creating transform job with name: base-bd435925-dc06-4db7-bfc3-281196381734
INFO:root:register chr15.16
INFO:sagemaker:Creating model with name: ancestry-recomb-2024-11-06-15-41-50-700


Running submodel chr15.16
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/base-results/chr15.16/


INFO:sagemaker:Creating transform job with name: base-c376bb57-ddc6-42d5-ae3e-112cac960a7a
INFO:root:register chr17.18
INFO:sagemaker:Creating model with name: ancestry-recomb-2024-11-06-15-41-51-682


Running submodel chr17.18
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/base-results/chr17.18/


INFO:sagemaker:Creating transform job with name: base-8b3fbf2f-7c12-47c9-b238-a50420a87b75
INFO:root:register chr19.20.21.22


Running submodel chr19.20.21.22
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/base-results/chr19.20.21.22/


INFO:sagemaker:Creating model with name: ancestry-recomb-2024-11-06-15-41-52-837
INFO:sagemaker:Creating transform job with name: base-a2ff1d66-3823-4fb2-aab1-dd957543d8b7
INFO:root:register chr3.4
INFO:sagemaker:Creating model with name: ancestry-recomb-2024-11-06-15-41-53-918


Running submodel chr3.4
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/base-results/chr3.4/


INFO:sagemaker:Creating transform job with name: base-12e2b506-feb4-4d62-9030-6ba0a9408bd2
INFO:root:register chr5.6
INFO:sagemaker:Creating model with name: ancestry-recomb-2024-11-06-15-41-55-056


Running submodel chr5.6
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/base-results/chr5.6/


INFO:sagemaker:Creating transform job with name: base-ba9589a2-d115-4b0f-9857-d4e77c2a20e0
INFO:root:register chr7.8
INFO:sagemaker:Creating model with name: ancestry-recomb-2024-11-06-15-41-56-067


Running submodel chr7.8
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/base-results/chr7.8/


INFO:sagemaker:Creating transform job with name: base-3b38a530-75a1-4073-92b2-7b54258d8953
INFO:root:register chr9.10
INFO:sagemaker:Creating model with name: ancestry-recomb-2024-11-06-15-41-56-982


Running submodel chr9.10
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/base-results/chr9.10/


INFO:sagemaker:Creating transform job with name: base-84089012-02cb-4066-a704-e59601918271


In [11]:
# Block until every base-layer transform job listed in `jobs` has reached a
# terminal state, then fail loudly if any of them did not complete successfully.
sagemaker_client = boto3.client("sagemaker")
POLL_INTERVAL_SECONDS = 60

errors = []  # names of jobs that ended as "Failed" or "Stopped"
failure_reasons = {}  # job name -> reason reported by SageMaker

while jobs:
    print(f"{len(jobs)} left")
    pending = []
    for job_name in jobs:
        description = sagemaker_client.describe_transform_job(
            TransformJobName=job_name,
        )
        status = description["TransformJobStatus"]

        if status in ("InProgress", "Stopping"):
            pending.append(job_name)
        elif status in ("Failed", "Stopped"):
            errors.append(job_name)
            failure_reasons[job_name] = description.get("FailureReason", "no failure reason reported")
        # any other status means the job finished and needs no further polling

    # Check all jobs in one pass and sleep once per pass, rather than
    # sleeping once for every job that is still running.
    jobs[:] = pending
    if jobs:
        time.sleep(POLL_INTERVAL_SECONDS)

if errors:
    # Report every failed job at once instead of only the first one.
    details = "; ".join(f"{name} ({failure_reasons[name]})" for name in errors)
    raise RuntimeError(f"{len(errors)} batch transform job(s) failed or were stopped: {details}")

print("Next step")

10 left
9 left
8 left
7 left
6 left
5 left
4 left
3 left
2 left
1 left
Next step


# Smoothing Layer Inference

In [12]:
# Start one batch transform job per sub-model that runs the smoothing-layer
# model over the base-layer predictions. Jobs are launched without waiting;
# their names are collected in `jobs` for the polling cell that follows.
jobs = []
for SUBMODEL_NAME in SUBMODELS:
    print(f"Running submodel {SUBMODEL_NAME}")

    # explicit, unique job name so the job can be looked up by name later
    job_name = f"smooth-{uuid.uuid4()}"
    jobs.append(job_name)

    # S3 location that receives this sub-model's smoothed predictions
    SMOOTH_OUTPUT_DIR = f"{OUTPUT_DIR.smooth_results}{SUBMODEL_NAME}/"
    print(SMOOTH_OUTPUT_DIR)

    # create model
    model_registry = get_model_registry(ANCESTRY_MODEL, [SUBMODEL_NAME])

    # PyTorch model built from the smoothing artifact plus the local inference
    # code in src/ (entry point: inference_user.py)
    model = PyTorchModel(
        model_data=model_registry["sub-models"][SUBMODEL_NAME]["model"]["smooth_model_uri"],
        role=ROLE,
        framework_version="1.10",
        py_version="py38",
        source_dir="src/",
        entry_point="inference_user.py",
    )

    # batch transformer
    transformer = model.transformer(
        output_path=SMOOTH_OUTPUT_DIR,
        instance_count=1,
        instance_type="ml.m5.large",
        accept="text/tsv",
        strategy='MultiRecord',
        max_payload=4,
    )

    # run transformer: the input is the output of the base-layer stage
    BASE_OUTPUT_DIR = f"{OUTPUT_DIR.base_results}{SUBMODEL_NAME}/"
    transformer.transform(
        data=BASE_OUTPUT_DIR,
        content_type='text/tsv',
        split_type='Line',
        job_name=job_name,
        model_client_config={'InvocationsMaxRetries': 0},
        # do not stream logs or block here; completion is checked in the next cell
        logs=False,
        wait=False,
    )

INFO:root:register chr1.2


Running submodel chr1.2
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/smooth-results/chr1.2/


INFO:sagemaker:Repacking model artifact (s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/east-indonesians-alt7/0.01/sub-models/chr1.2/smooth/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-778090103881/pytorch-inference-2024-11-06-15-48-59-437/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-06-15-49-01-046
INFO:sagemaker:Creating transform job with name: smooth-d8df22c5-0725-4b52-be1e-3e26e872be4a
INFO:root:register chr11.12


Running submodel chr11.12
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/smooth-results/chr11.12/


INFO:sagemaker:Repacking model artifact (s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/east-indonesians-alt7/0.01/sub-models/chr11.12/smooth/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-778090103881/pytorch-inference-2024-11-06-15-49-02-300/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-06-15-49-03-694
INFO:sagemaker:Creating transform job with name: smooth-f6a6efbd-ba6a-461c-ab7f-e63ae41df2d4
INFO:root:register chr13.14


Running submodel chr13.14
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/smooth-results/chr13.14/


INFO:sagemaker:Repacking model artifact (s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/east-indonesians-alt7/0.01/sub-models/chr13.14/smooth/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-778090103881/pytorch-inference-2024-11-06-15-49-04-697/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-06-15-49-06-698
INFO:sagemaker:Creating transform job with name: smooth-5c6752fc-c9ce-4a18-8f7a-46f0ae8fe9d5
INFO:root:register chr15.16


Running submodel chr15.16
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/smooth-results/chr15.16/


INFO:sagemaker:Repacking model artifact (s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/east-indonesians-alt7/0.01/sub-models/chr15.16/smooth/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-778090103881/pytorch-inference-2024-11-06-15-49-07-732/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-06-15-49-09-174
INFO:sagemaker:Creating transform job with name: smooth-b62a2940-5a8e-4327-877a-27c5e0444840
INFO:root:register chr17.18


Running submodel chr17.18
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/smooth-results/chr17.18/


INFO:sagemaker:Repacking model artifact (s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/east-indonesians-alt7/0.01/sub-models/chr17.18/smooth/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-778090103881/pytorch-inference-2024-11-06-15-49-10-423/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-06-15-49-11-664
INFO:sagemaker:Creating transform job with name: smooth-ffd51c28-582e-4f11-a9b6-8c1ac4f4da39
INFO:root:register chr19.20.21.22


Running submodel chr19.20.21.22
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/smooth-results/chr19.20.21.22/


INFO:sagemaker:Repacking model artifact (s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/east-indonesians-alt7/0.01/sub-models/chr19.20.21.22/smooth/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-778090103881/pytorch-inference-2024-11-06-15-49-12-786/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-06-15-49-14-120
INFO:sagemaker:Creating transform job with name: smooth-dab27c02-0895-42f5-bd4f-49513ed3aa4e
INFO:root:register chr3.4


Running submodel chr3.4
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/smooth-results/chr3.4/


INFO:sagemaker:Repacking model artifact (s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/east-indonesians-alt7/0.01/sub-models/chr3.4/smooth/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-778090103881/pytorch-inference-2024-11-06-15-49-15-286/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-06-15-49-16-951
INFO:sagemaker:Creating transform job with name: smooth-ad049d14-3adf-4734-a01e-3ac630e6069b
INFO:root:register chr5.6


Running submodel chr5.6
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/smooth-results/chr5.6/


INFO:sagemaker:Repacking model artifact (s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/east-indonesians-alt7/0.01/sub-models/chr5.6/smooth/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-778090103881/pytorch-inference-2024-11-06-15-49-18-046/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-06-15-49-19-821
INFO:sagemaker:Creating transform job with name: smooth-eae863e9-0e72-43b7-bc8b-c07ccb0c830b
INFO:root:register chr7.8


Running submodel chr7.8
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/smooth-results/chr7.8/


INFO:sagemaker:Repacking model artifact (s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/east-indonesians-alt7/0.01/sub-models/chr7.8/smooth/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-778090103881/pytorch-inference-2024-11-06-15-49-20-853/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-06-15-49-22-489
INFO:sagemaker:Creating transform job with name: smooth-c5848d8d-c63d-4a82-9e31-850f68a52807
INFO:root:register chr9.10


Running submodel chr9.10
s3://rnd-sandbox-datasets/inference/results/east-indonesians-alt7/smooth-results/chr9.10/


INFO:sagemaker:Repacking model artifact (s3://rnd-sandbox-datasets/sagemaker/model-registry/lai/combined/east-indonesians-alt7/0.01/sub-models/chr9.10/smooth/model.tar.gz), script artifact (src/), and dependencies ([]) into single tar.gz file located at s3://sagemaker-us-east-1-778090103881/pytorch-inference-2024-11-06-15-49-23-515/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: pytorch-inference-2024-11-06-15-49-25-292
INFO:sagemaker:Creating transform job with name: smooth-b48c1fdb-e6f1-4b20-9a82-3eb11930c28b


In [None]:
# Block until every smoothing-layer transform job listed in `jobs` has reached
# a terminal state, then fail loudly if any of them did not complete successfully.
sagemaker_client = boto3.client("sagemaker")
POLL_INTERVAL_SECONDS = 60

errors = []  # names of jobs that ended as "Failed" or "Stopped"
failure_reasons = {}  # job name -> reason reported by SageMaker

while jobs:
    print(f"{len(jobs)} left")
    pending = []
    for job_name in jobs:
        description = sagemaker_client.describe_transform_job(
            TransformJobName=job_name,
        )
        status = description["TransformJobStatus"]

        if status in ("InProgress", "Stopping"):
            pending.append(job_name)
        elif status in ("Failed", "Stopped"):
            errors.append(job_name)
            failure_reasons[job_name] = description.get("FailureReason", "no failure reason reported")
        # any other status means the job finished and needs no further polling

    # Check all jobs in one pass and sleep once per pass, rather than
    # sleeping once for every job that is still running.
    jobs[:] = pending
    if jobs:
        time.sleep(POLL_INTERVAL_SECONDS)

if errors:
    # Report every failed job at once instead of only the first one.
    details = "; ".join(f"{name} ({failure_reasons[name]})" for name in errors)
    raise RuntimeError(f"{len(errors)} smooth transform job(s) failed or were stopped: {details}")

print("Next step")

10 left


# Base & smoothing layers post-processing

In [None]:
# convert base layer results to summary format
# generate summary of the smoothing layer output
for SUBMODEL_NAME in SUBMODELS:
    print(f"Running submodel {SUBMODEL_NAME}")

    # model registry
    model_registry = get_model_registry(ANCESTRY_MODEL, [SUBMODEL_NAME])

    parameters_file = model_registry["sub-models"][SUBMODEL_NAME]["artifacts"]["parameters"]

    !rm -rf temp/
    !mkdir -p temp/
    !aws s3 cp {parameters_file} temp/ --quiet
    with open(f'temp/parameters.json', 'rt') as fin:
        params = json.load(fin)    
        
    for layer in ["base", "smooth"]:
        # base output path
        path = getattr(OUTPUT_DIR, f"{layer}_results")
        LAYER_OUTPUT_DIR = f"{path}{SUBMODEL_NAME}/"

        # read data and convert to regular format
        !aws s3 cp --recursive $LAYER_OUTPUT_DIR temp/{layer}/intermediate/ --quiet
        !echo "Files count ({LAYER_OUTPUT_DIR}):" $(ls temp/{layer}/intermediate/ | wc -l)
        
        # read base results & save
        df_base = convert_to_ancestry_format(
            dd.read_csv(f'temp/{layer}/intermediate/*', sep='\t', header=None, dtype={2: "object"}).compute(), 
            model_registry["artifacts"]["population_map_uri"],
            params,
            pred_by_argmin=bool(layer=="base"),
        )
        df_base.to_csv(f'{OUTPUT_DIR.base}{OUTPUT_DIR.path}summary-results/{layer}_samples.{SUBMODEL_NAME}.tsv.gz', sep='\t', index=False)
        del df_base

### Basic analysis of the results (saved to `{NAME}-ancestry.tsv`)

In [None]:
!mkdir -p temp/
!aws s3 cp {OUTPUT_DIR.base}{OUTPUT_DIR.path}summary-results/ temp/ --recursive

df_base = dd.read_csv('temp/base_samples.*.tsv.gz', sep='\t', blocksize=None, dtype={"sample_id": "object"}).compute()
df_base["layer"] = "base"

df_smooth = dd.read_csv('temp/smooth_samples.*.tsv.gz', sep='\t', blocksize=None, dtype={"sample_id": "object"}).compute()
df_smooth["layer"] = "smooth"

df = pd.concat([df_smooth, df_base])
del df_base, df_smooth

with open('chr_map.json', 'rt') as fin:
    chr_fractions = json.load(fin)

chr_fractions = {int(k): v for k, v in chr_fractions.items()}
window_lengths = df.groupby(["chrom"])["window"].nunique().to_dict()
chr_map = {k: v/window_lengths[k] for k, v in chr_fractions.items() if k in window_lengths}

df["weight"] = df.chrom.map(chr_map)


def weight_to_str(x):
    """Format a series of weights as ``"<label>: <percent>, ..."``, largest first.

    The weights are normalised by their sum and shown as percentages with two
    decimals. The label of each entry is the third level of its index tuple.
    """
    shares = (x / x.sum()).sort_values(ascending=False)
    parts = []
    for key, share in shares.items():
        parts.append(f"{key[2]}: {share*100:0.2f}")
    return ", ".join(parts)


df_ = (
    df
    .groupby(["sample_id", "layer", "pred"])
    .agg({"weight": "sum"})
    .groupby(["sample_id", "layer"])
    .agg({"weight": weight_to_str})
    .rename(columns={"weight": "predicted ancestry"})
    .reset_index()
    .sort_values(["sample_id", 'layer'])
)
df_.to_csv(f"{NAME}-ancestry.tsv", sep="\t", index=False)

!rm -rf temp/