## Yandex DataSphere Delivery example

**Задача**: доставить Training Job на DataSphere и запустить там.

#### План

* Создать локально TrainingJob со списком task'ов и всей необходимой информацией о датасете

* Запустить локально тем же способом, каким будем запускать в DataSphere

* Запустить в DataSphere с помощью Docker, как описано в training_grounds/Delivery.md

### Загрузим бандл на YandexStorage

In [1]:
from sklearn import datasets
import pandas as pd
from tg.common.ml import batched_training as bt


In [2]:
# Identifiers reused by later cells: they form the S3 key prefix of the
# dataset, name the bucket, and are passed to the TrainingJob and the task.
project_name = 'testirisproject'
dataset_name = 'irisdataset'
bucket = 'testirisbucket'


In [3]:
import os
from pathlib import Path
from tg.grammar_ru.common.loc import Loc
from dotenv import load_dotenv
# Load variables from the environment.env file under Loc.root_path into the
# process environment.
# NOTE(review): presumably this file holds the storage credentials — confirm.
load_dotenv(Loc.root_path / 'environment.env')


True

#### Создадим бандл

In [4]:
from tg.grammar_ru.common.loc import Loc


def get_multilabel_classification_bundle():
    """Build a ``bt.DataBundle`` from the scikit-learn iris dataset.

    The index frame stores the class name of every sample in the ``label``
    column and the result of ``bt.train_display_test_split`` in the ``split``
    column; the measurements go into the ``features`` frame.

    Returns:
        The assembled ``bt.DataBundle``.
    """
    iris = datasets.load_iris()

    # Translate the integer targets into their class names.
    class_names = iris['target_names'][iris['target']]
    index_frame = pd.DataFrame(class_names, columns=['label'])
    index_frame['split'] = bt.train_display_test_split(
        index_frame, 0.2, 0.2, 'label')

    feature_frame = pd.DataFrame(iris['data'], columns=iris['feature_names'])
    return bt.DataBundle(index=index_frame, features=feature_frame)


# Build the iris bundle and save it into a folder under Loc.temp_path;
# the same folder is uploaded to the bucket in a later cell.
bundle_temp_folder = Loc.temp_path / 'temp_bundle'
bundle = get_multilabel_classification_bundle()
bundle.save(bundle_temp_folder)


##### Создадим бакет

In [5]:
from tg.grammar_ru.ml.components.yandex_storage.s3_yandex_helpers import S3YandexHandler

# Best-effort bucket creation: a failure must not stop the notebook, but it is
# reported instead of being silently discarded. `except Exception` (rather
# than a bare `except`) lets KeyboardInterrupt / SystemExit propagate.
# NOTE(review): presumably the usual failure is "bucket already exists" — confirm.
try:
    S3YandexHandler.create_bucket(bucket)
except Exception as error:
    # TODO: delete the bucket before creating it?
    print(f'Bucket {bucket!r} was not created: {error!r}')


##### Загрузим бандл в бакет

In [6]:
# Key prefix of the dataset inside the bucket:
# datasphere/<project_name>/datasets/<dataset_name>.
s3path = f'datasphere/{project_name}/datasets/{dataset_name}'
# Upload the locally saved bundle folder under that prefix.
S3YandexHandler.upload_folder(bucket, s3path, bundle_temp_folder)


### Создадим task - TaskFactory


copy-paste from TrainingTaskDemo

In [7]:
from tg.grammar_ru.ml.components.training_task_factory import TaskFactory, Conventions
from tg.common.ml import dft


def get_feature_extractor():
    """Create the extractor named ``'features'``.

    It is built with ``bt.PlainExtractor``, indexed on the ``'features'``
    frame and given the default ``dft`` data-frame transformer.
    """
    builder = bt.PlainExtractor.build('features').index('features')
    default_transformer = dft.DataFrameTransformerFactory.default_factory()
    return builder.apply(transformer=default_transformer)


def get_multilabel_extractor():
    """Create the label extractor named ``Conventions.LabelFrame``.

    It is built with ``bt.PlainExtractor`` using the default index, takes only
    the ``label`` column and applies the default ``dft`` data-frame
    transformer.
    """
    builder = bt.PlainExtractor.build(Conventions.LabelFrame).index()
    default_transformer = dft.DataFrameTransformerFactory.default_factory()
    return builder.apply(
        take_columns=['label'],
        transformer=default_transformer,
    )


In [8]:
import torch


class ClassificationNetwork(torch.nn.Module):
    """Fully connected network: two linear layers, each followed by a sigmoid.

    Layer widths are read from ``sample``: the input width is the number of
    columns of ``sample['features']`` and the output width is the number of
    columns of ``sample['label']``.

    Args:
        hidden_size: width of the hidden layer.
        sample: mapping with ``'features'`` and ``'label'`` entries that
            expose ``.shape``.
    """

    def __init__(self, hidden_size, sample):
        super().__init__()
        n_inputs = sample['features'].shape[1]
        n_outputs = sample['label'].shape[1]
        # Layers are created hidden-first, output-second.
        self.hidden = torch.nn.Linear(n_inputs, hidden_size)
        self.output = torch.nn.Linear(hidden_size, n_outputs)

    def forward(self, input):
        """Run the network on ``input['features']`` and return the activations.

        The features frame is cast to float and converted to a float32 tensor
        before going through the layers.
        """
        frame = input['features']
        tensor = torch.tensor(frame.astype(float).values).float()
        hidden_activation = torch.sigmoid(self.hidden(tensor))
        return torch.sigmoid(self.output(hidden_activation))


In [9]:
from sklearn.metrics import roc_auc_score
from tg.common import Logger
from yo_fluq_ds import *

Logger.disable()


class MulticlassMetrics(bt.Metric):
    """Accuracy and/or rating metric computed from per-label score columns.

    ``measure`` reads the columns ``true_label_<name>`` and
    ``predicted_label_<name>`` of the frame it is given.

    Args:
        add_accuracy: include the ``accuracy`` value.
        add_rating: include the ``rating`` value.
    """

    def __init__(self, add_accuracy=True, add_rating=False):
        self.add_accuracy = add_accuracy
        self.add_rating = add_rating

    def get_names(self):
        """Return the names of the enabled metrics, ``accuracy`` first."""
        switches = (
            ('accuracy', self.add_accuracy),
            ('rating', self.add_rating),
        )
        return [name for name, enabled in switches if enabled]

    def measure(self, df, _):
        """Return the enabled metric values, in the order of ``get_names``.

        * accuracy: share of samples whose label with ``predicted_rating == 0``
          has a true value above 0.5;
        * rating: mean ``predicted_rating`` over rows whose true value is
          above 0.5.

        The second argument is ignored.
        """
        true_prefix = 'true_label_'
        predicted_prefix = 'predicted_label_'
        labels = [
            column.replace(true_prefix, '')
            for column in df.columns
            if column.startswith(true_prefix)
        ]

        def to_long(frame, prefix, value_name):
            # Keep the prefixed columns, rename them to bare label names and
            # unstack into a single value column indexed by (label, row).
            selected = frame[[prefix + label for label in labels]]
            selected.columns = list(labels)
            return selected.unstack().to_frame(value_name)

        predicted = to_long(df, predicted_prefix, 'predicted')
        true = to_long(df, true_prefix, 'true')

        long_df = predicted.merge(true, left_index=True, right_index=True)
        long_df = long_df.reset_index()
        long_df.columns = ['label', 'sample', 'predicted', 'true']
        # NOTE(review): presumably this ranks the labels inside each sample by
        # descending predicted score, 0 being the top one — confirm against
        # fluq.add_ordering_column.
        ordering = fluq.add_ordering_column(
            'sample', ('predicted', False), 'predicted_rating')
        long_df = long_df.feed(ordering)

        top_rows = long_df.loc[long_df.predicted_rating == 0]
        match = top_rows.set_index('sample').true > 0.5
        true_rows = long_df.loc[long_df.true > 0.5]
        rating = true_rows.set_index('sample').predicted_rating

        values = []
        if self.add_accuracy:
            values.append(match.mean())
        if self.add_rating:
            values.append(rating.mean())
        return values


def _inner(x, sample):
    """Model factory handed to ``setup_model``.

    Builds a ``ClassificationNetwork`` with 20 hidden units, sized from
    ``sample``. The first argument is not used.
    """
    hidden_size = 20
    network = ClassificationNetwork(hidden_size, sample)
    return network


class ClassificationTask(TaskFactory):
    """Task factory for the iris classification demo."""

    def create_task(self, data, env):
        """Configure the default task, its batcher and its model for ``data``.

        ``env`` is not used by this implementation.
        """
        metric_pool = bt.MetricPool().add(MulticlassMetrics())
        self.instantiate_default_task(
            epoch_count=20,
            batch_size=10000,
            mini_batch_size=None,
            metric_pool=metric_pool,
        )

        extractors = [get_feature_extractor(), get_multilabel_extractor()]
        self.setup_batcher(data, extractors)

        self.setup_model(_inner, learning_rate=1)


# Create the task factory and fill in its info entries: 'dataset' is the
# dataset name used for the upload above, 'name' is the task's own name.
# NOTE(review): the recorded job output path contains this task name,
# so it presumably also names the output folder — confirm.
task = ClassificationTask()
task.info['dataset'] = dataset_name
task.info['name'] = 'classification_iris_task'


**Промежуточный результат**: создали task. Обернем его в TrainingJob (~ DeliverableJob).

Реализован класс TrainingJob.

Принимает список task'ов, название проекта и название бандла.

Для каждого task'а:

* Загружает бандл из ObjectStorage

* Запускает task

* Получает output task'а - модель. Архивирует её и отправляет в ObjectStorage.

* Exception'ы, возникшие при запуске task'а, записывает в ObjectStorage.

In [10]:
from tg.grammar_ru.ml.components.yandex_delivery.training_job import TrainingJob


# Wrap the single task into a TrainingJob tied to the project name and bucket.
job = TrainingJob(tasks=[task],
                  project_name=project_name,
                  bucket=bucket)


In [11]:
# job.run()


Model uploaded at datasphere/testirisproject/output/classification_iris_task 09:58:25.848920/output/model.tar.gz


#### Структура ObjectStorage после обучения

* datasphere/project_name
    * datasets/dataset_name
        * file1_of_bundle .parquet
        * file2_of_bundle .parquet
    * output/task_name
        * output/model.tar.gz
    * exceptions
        * task_name_time1.txt
        * task_name_time2.txt

## Delivery

1. Из корня проекта, в котором используется tg, запустить скрипт dependencies_fix.py


In [31]:
# ! cd ~/grammar_ru && python3 dependencies_fix.py

2. Создать экземпляр класса SSHDockerJobRoutine, передав в конструктор нужный job, например, как в демо DeliverableJobs.

In [13]:
from tg.common.delivery.jobs import SSHDockerJobRoutine, DockerOptions
from tg.common.delivery.packaging import FakeContainerHandler

# Delivery routine for the job. No remote host is configured and a fake
# container handler factory is used.
# NOTE(review): presumably because this demo only builds the image locally — confirm.
routine = SSHDockerJobRoutine(
    job=job,
    remote_host_address=None,
    remote_host_user=None,
    handler_factory=FakeContainerHandler.Factory(),
    options=DockerOptions(propagate_environmental_variables=[]),
)

3. Убедиться, что докер выполняет команды без ```sudo```. Если это не так, то [выполнить шаги](https://docs.docker.com/engine/install/linux-postinstall/) из документации.

In [34]:
# ! docker run hello-world

### Fix по [документации](https://docs.docker.com/engine/install/linux-postinstall/).

In [15]:
# !sudo groupadd docker

groupadd: group 'docker' already exists


In [16]:
# !sudo usermod -aG docker $USER


In [None]:
# !newgrp docker

In [20]:
# !docker run hello-world

# *

4. Запустить сборку контейнера с помощью функции ```build_container``` из ```tg/common/delivery/jobs/ssh_docker_job_routine```.

In [35]:
from tg.common.delivery.jobs.ssh_docker_job_routine import build_container

# Build the container for the job.
# NOTE(review): judging by the recorded build log ("test_job_iris-1"), the
# second and third arguments appear to be the package name and version; the
# fourth argument and image_tag presumably name the Docker image — confirm
# against build_container's signature.
build_container(job, 'test_job_iris', '1', 'test_iris_img',
                image_tag='test_iris_tag')


running sdist
running egg_info
creating test_job_iris.egg-info
writing test_job_iris.egg-info/PKG-INFO
writing dependency_links to test_job_iris.egg-info/dependency_links.txt
writing requirements to test_job_iris.egg-info/requires.txt
writing top-level names to test_job_iris.egg-info/top_level.txt
writing manifest file 'test_job_iris.egg-info/SOURCES.txt'
reading manifest file 'test_job_iris.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'test_job_iris.egg-info/SOURCES.txt'
running check
creating test_job_iris-1
creating test_job_iris-1/test_job_iris.egg-info
creating test_job_iris-1/test_job_iris__1
creating test_job_iris-1/test_job_iris__1/resources
creating test_job_iris-1/test_job_iris__1/tg
creating test_job_iris-1/test_job_iris__1/tg/amenities
creating test_job_iris-1/test_job_iris__1/tg/common
creating test_job_iris-1/test_job_iris__1/tg/common/_common
creating test_job_iris-1/test_job_iris__1/tg/common/_common/logger
creating test_job_iris-1






removing 'test_job_iris-1' (and everything under it)
Sending build context to Docker daemon  173.1kB
Step 1/7 : FROM python:3.7
 ---> 3e36461b4ff4
Step 2/7 : RUN pip install argon2-cffi==21.3.0 argon2-cffi-bindings==21.2.0 asttokens==2.0.8 attrs==21.4.0 backcall==0.2.0 beautifulsoup4==4.11.1 bleach==5.0.1 boto3==1.24.56 botocore==1.27.56 cffi==1.15.1 click==8.1.3 coverage==6.4.4 cramjam==2.5.0 cycler==0.11.0 DAWG-Python==0.7.2 debugpy==1.6.3 decorator==5.1.1 defusedxml==0.7.1 dill==0.3.5.1 docopt==0.6.2 entrypoints==0.4 executing==0.10.0 fastjsonschema==2.16.1 fastparquet==0.8.2 Flask==2.1.0 fonttools==4.36.0 fsspec==2022.7.1 google-pasta==0.2.0 importlib-metadata==4.12.0 importlib-resources==5.9.0 ipykernel==6.15.1 ipython==8.4.0 ipython-genutils==0.2.0 ipywidgets==8.0.1 itsdangerous==2.1.2 jedi==0.18.1 Jinja2==3.1.2 jmespath==1.0.1 joblib==1.1.0 jsonpickle==2.2.0 jsonschema==4.13.0 jupyter==1.0.0 jupyter-client==7.3.4 jupyter-console==6.4.4 jupyter-core==4.11.1 jupyterlab-pygments==0

The command '/bin/sh -c pip install argon2-cffi==21.3.0 argon2-cffi-bindings==21.2.0 asttokens==2.0.8 attrs==21.4.0 backcall==0.2.0 beautifulsoup4==4.11.1 bleach==5.0.1 boto3==1.24.56 botocore==1.27.56 cffi==1.15.1 click==8.1.3 coverage==6.4.4 cramjam==2.5.0 cycler==0.11.0 DAWG-Python==0.7.2 debugpy==1.6.3 decorator==5.1.1 defusedxml==0.7.1 dill==0.3.5.1 docopt==0.6.2 entrypoints==0.4 executing==0.10.0 fastjsonschema==2.16.1 fastparquet==0.8.2 Flask==2.1.0 fonttools==4.36.0 fsspec==2022.7.1 google-pasta==0.2.0 importlib-metadata==4.12.0 importlib-resources==5.9.0 ipykernel==6.15.1 ipython==8.4.0 ipython-genutils==0.2.0 ipywidgets==8.0.1 itsdangerous==2.1.2 jedi==0.18.1 Jinja2==3.1.2 jmespath==1.0.1 joblib==1.1.0 jsonpickle==2.2.0 jsonschema==4.13.0 jupyter==1.0.0 jupyter-client==7.3.4 jupyter-console==6.4.4 jupyter-core==4.11.1 jupyterlab-pygments==0.2.2 jupyterlab-widgets==3.0.2 kiwisolver==1.4.4 lxml==4.9.1 MarkupSafe==2.1.1 matplotlib==3.5.3 matplotlib-inline==0.1.6 mistune==0.8.4 m