In [33]:
import mlflow
from datetime import datetime
from pathlib import Path
import time

from mlflow import MlflowClient
from mlflow.entities import Metric, Param, RunTag
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID

import yaml


In [45]:
# Raw log texts used further down: `train_log` is scanned for `[config]` lines and
# `clean_log` is attached to the clean-corpus run as a text artifact.
train_log = Path('log.txt').read_text()
clean_log = Path('log2.txt').read_text()

In [37]:
# Parse the experiment configuration from config.yaml into `config`.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

In [38]:
# Display the object parsed from config.yaml.
config

{'experiment': {'name': 'prod',
  'src': 'ru',
  'trg': 'en',
  'teacher-ensemble': 2,
  'backward-model': '',
  'vocab': '',
  'mono-max-sentences-src': 100000000,
  'mono-max-sentences-trg': 20000000,
  'split-length': 2000000,
  'spm-sample-size': 10000000,
  'best-model': 'chrf',
  'bicleaner': {'default-threshold': 0.5,
   'dataset-thresholds': {'opus_CCAligned/v1': 0.7,
    'opus_OpenSubtitles/v2018': 0.8,
    'opus_bible-uedin/v1': 0.7,
    'mtdata_Statmt-wiki_titles-1-rus-eng': 0.7,
    'mtdata_Facebook-wikimatrix-1-eng-rus': 0.7,
    'mtdata_Statmt-wiki_titles-2-rus-eng': 0.7,
    'mtdata_Statmt-commoncrawl_wmt13-1-rus-eng': 0.7,
    'opus_ParaCrawl/v8': 0}}},
 'marian-args': {'training-backward': {'after': '10e'},
  'training-teacher-base': {'after': '2e'},
  'decoding-backward': {'mini-batch-words': 2000, 'beam-size': 12},
  'decoding-teacher': {'mini-batch-words': 1000, 'precision': 'float16'}},
 'datasets': {'train': ['opus_ada83/v1',
   'opus_UN/v20090831',
   'opus_GNOME

In [39]:
# Single client instance shared by every MLflow call in this notebook.
# No tracking URI is passed explicitly here.
client = MlflowClient()

In [None]:
# Look the experiment up by name first so this cell can be re-run:
# create_experiment raises when an experiment with the same name already exists.
# NOTE(review): the name says "fr-en" while config.yaml declares src=ru / trg=en —
# confirm which language pair this experiment is meant to track.
experiment_name = "fr-en/bicleaner"
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = client.create_experiment(
        experiment_name,
        artifact_location=Path.cwd().joinpath("mlruns5").as_uri(),
        tags={"version": "v1", "priority": "P1"},
    )
else:
    experiment_id = existing_experiment.experiment_id
client.set_experiment_tag(experiment_id, "testing", "quality")

In [43]:
# Top-level run that the per-step runs below attach to as children.
# MLflow run timestamps are milliseconds since the Unix epoch. time.time() is
# epoch-based, whereas datetime.utcnow().timestamp() interprets the naive UTC value
# as local time and yields seconds, so the run would appear to start in 1970.
parent_run = client.create_run(
    experiment_id,
    run_name="tc-run-2023-05-15-0800",
    start_time=int(time.time() * 1000),
)


In [47]:
from collections.abc import MutableMapping


def flatten(dictionary, parent_key='', separator='_'):
    """Flatten a nested mapping into a single-level dict.

    Keys of nested mutable mappings are joined to their parent key with
    ``separator``; every other value is kept as a leaf.

    Args:
        dictionary: Mapping to flatten. Values that are ``MutableMapping``
            instances are descended into recursively.
        parent_key: Prefix prepended to every key. The default ``''`` (or
            ``None``) means "no prefix", so top-level keys are returned unchanged.
        separator: Text inserted between a parent key and a child key.

    Returns:
        A new ``dict`` mapping joined keys to leaf values. Empty nested mappings
        contribute no entries. If two paths join to the same key, the later one wins.
    """
    # Compare against the "no prefix" sentinels explicitly instead of relying on
    # truthiness, so a legitimate falsy parent key such as 0 is not dropped.
    has_parent = parent_key not in ('', None)
    flat = {}
    for key, value in dictionary.items():
        # Join as text so non-string keys (e.g. integers parsed from YAML) do not
        # raise TypeError the way `parent_key + separator + key` does.
        new_key = f"{parent_key}{separator}{key}" if has_parent else key
        if isinstance(value, MutableMapping):
            flat.update(flatten(value, new_key, separator=separator))
        else:
            flat[new_key] = value
    return flat

In [None]:
# Collapse the nested config into one level of joined keys and display the result.
flatten_config = flatten(config)
flatten_config

In [53]:
# Record the flattened configuration on the parent run: every entry whose key does
# not contain 'datasets' becomes a string-valued parameter, and the original YAML
# file is attached as an artifact.
parent_run_id = parent_run.info.run_id
params = [
    Param(name, str(value))
    for name, value in flatten_config.items()
    if 'datasets' not in name
]
client.log_batch(parent_run_id, params=params)
client.log_artifact(parent_run_id, 'config.yaml')

In [112]:
# Child run for the corpus-cleaning step; the parent-run tag nests it under parent_run.
# start_time must be milliseconds since the Unix epoch: time.time() is epoch-based,
# whereas datetime.utcnow().timestamp() treats the naive UTC value as local time
# and returns seconds.
clean_corpus_run = client.create_run(
    experiment_id,
    tags={MLFLOW_PARENT_RUN_ID: parent_run.info.run_id},
    run_name="clean-corpus-OPUS_Books-v1",
    start_time=int(time.time() * 1000),
)


In [113]:
# Attach the cleaning log as a text artifact and record the corpus paths as params.
client.log_text(clean_corpus_run.info.run_id, clean_log, 'clean-corpus.txt')
# trg_corpus previously repeated the src_corpus path (copy-paste slip).
# NOTE(review): confirm the real target-side file name.
params = [
    Param('src_corpus', '/data/ru-en/src_corpus.gz'),
    Param('trg_corpus', '/data/ru-en/trg_corpus.gz'),
]
client.log_batch(clean_corpus_run.info.run_id, params=params)


In [None]:
# The call was left unfinished (missing closing parenthesis), which is a syntax error.
# TODO: build the dataset inputs for the cleaned corpus and pass them as `datasets=`;
# until then the call is closed without any datasets so the cell parses.
client.log_inputs(clean_corpus_run.info.run_id)

In [111]:
# Close the clean-corpus child run now that its log and params have been recorded.
client.set_terminated(clean_corpus_run.info.run_id)

In [109]:
# Child run for student training; the parent-run tag nests it under parent_run.
# start_time must be milliseconds since the Unix epoch: time.time() is epoch-based,
# whereas datetime.utcnow().timestamp() treats the naive UTC value as local time
# and returns seconds.
training_run = client.create_run(
    experiment_id,
    tags={MLFLOW_PARENT_RUN_ID: parent_run.info.run_id},
    run_name="train-student",
    start_time=int(time.time() * 1000),
)

In [63]:
import re

In [99]:
# Pull the text that follows the `[config]` tag out of every tagged log line; the
# result is written to training_config.yaml and parsed as YAML in the next cells.
config_pattern = re.compile(r'.*\[config\]\s(.*)', re.IGNORECASE)
lines = [line for line in train_log.split('\n') if '[config]' in line]
# A tagged line with nothing after the tag does not match the pattern; skip it
# instead of failing with AttributeError on `None.group`.
config_matches = (config_pattern.search(line) for line in lines)
training_config_lines = [match.group(1) for match in config_matches if match is not None]



In [102]:
# Persist the extracted config lines, one per line with a trailing newline, so they
# can be parsed as YAML and attached to the training run as an artifact.
with open('training_config.yaml', 'w') as f:
    # write() is the right call for a single string; writelines() would iterate
    # over it character by character.
    f.write('\n'.join(training_config_lines) + '\n')

In [103]:
# Read the extracted config back in and parse it as YAML into `training_config`.
with open('training_config.yaml', 'r') as training_config_file:
    training_config = yaml.safe_load(training_config_file)

In [None]:
# Display the object parsed from training_config.yaml.
training_config

In [110]:


# Hard-coded BLEU curve for the student-training run, one row per step:
# (step, milliseconds after `timestamp`, train-BLEU, valid-BLEU).
timestamp = int(time.time() * 1000)
bleu_curve = [
    (0, 5000, 0.5, 0.5),
    (1, 10000, 5.5, 9.5),
    (2, 20000, 10.5, 14.5),
    (3, 30000, 20.5, 20.5),
    (4, 35000, 27.5, 25.5),
]
train_metrics = [
    Metric("train-BLEU", train_bleu, timestamp + offset_ms, step)
    for step, offset_ms, train_bleu, _ in bleu_curve
]
valid_metrics = [
    Metric("valid-BLEU", valid_bleu, timestamp + offset_ms, step)
    for step, offset_ms, _, valid_bleu in bleu_curve
]
metrics = train_metrics + valid_metrics

# Every entry of the parsed training config becomes a string-valued run parameter.
params = [Param(name, str(value)) for name, value in training_config.items()]
tags = [RunTag("train", "small")]

# Log everything against the training run created earlier, attach the raw log and
# the extracted config as artifacts, then mark the run as terminated.
training_run_id = training_run.info.run_id
client.log_batch(training_run_id, metrics=metrics, params=params, tags=tags)
client.log_artifact(training_run_id, 'log.txt')
client.log_artifact(training_run_id, 'training_config.yaml')
client.set_terminated(training_run_id)


In [27]:
# NOTE(review): `run` is not defined by any cell above this one, so this line only
# works with leftover kernel state; it also replaces the `parent_run` created earlier.
# Confirm whether this cell is still needed.
parent_run = run

In [10]:

# Peek at the first ten characters of `log`.
# NOTE(review): `log` is not defined by any cell above this one (the log texts read
# earlier are named `train_log` and `clean_log`) — confirm which variable is meant.
log[:10]

'[task 2023'

In [31]:

def print_run_info(r):
    """Print the run id, params, metrics, tags and status of run ``r``, one per line.

    ``r`` must expose ``info.run_id``, ``info.status``, ``data.params``,
    ``data.metrics`` and ``data.tags``.
    """
    print(f"run_id: {r.info.run_id}")
    print(f"params: {r.data.params}")
    print(f"metrics: {r.data.metrics}")
    print(f"tags: {r.data.tags}")
    print(f"status: {r.info.status}")


# Hard-coded BLEU curve for this run, one row per step:
# (step, milliseconds after `timestamp`, train-BLEU, valid-BLEU).
timestamp = int(time.time() * 1000)
bleu_curve = [
    (0, 5000, 0.5, 0.5),
    (1, 10000, 10.5, 10.5),
    (2, 20000, 15.5, 15.5),
    (3, 30000, 25.5, 27.5),
    (4, 35000, 31.5, 30.5),
]
train_metrics = [
    Metric("train-BLEU", train_bleu, timestamp + offset_ms, step)
    for step, offset_ms, train_bleu, _ in bleu_curve
]
valid_metrics = [
    Metric("valid-BLEU", valid_bleu, timestamp + offset_ms, step)
    for step, offset_ms, _, valid_bleu in bleu_curve
]
metrics = train_metrics + valid_metrics

params = [Param("after", "600e"), Param('after-batches', '0')]
tags = [RunTag("train", "big")]

# Log the entities and the log text against `run`, mark it as terminated, then
# re-fetch the run and print a summary of it.
run_id = run.info.run_id
client.log_batch(run_id, metrics=metrics, params=params, tags=tags)
client.log_text(run_id, log, 'training-log12345.txt')
client.set_terminated(run_id)
run = client.get_run(run_id)
print_run_info(run)


run_id: 60828a8df95c4967802ef08111eec290
params: {'after': '600e', 'after-batches': '0'}
metrics: {'train-BLEU': 31.5, 'valid-BLEU': 30.5}
tags: {'mlflow.parentRunId': 'd404bfd448c047d58875d79dc1ece59c', 'mlflow.runName': 'child-run1', 'train': 'big'}
status: FINISHED


In [None]:
# MlflowClient.create_run accepts neither `name` nor `nested`; a child run is
# declared with `run_name` plus the parent-run tag, as done for the other child runs.
run2 = client.create_run(
    experiment_id,
    run_name='nested',
    tags={MLFLOW_PARENT_RUN_ID: parent_run.info.run_id},
)

In [19]:
# Display the current `parent_run` object.
parent_run

<Run: data=<RunData: metrics={'m': 1.5}, params={'p': 'p'}, tags={'mlflow.runName': 'main',
 'mlflow.source.name': '/Users/epavlov/opt/anaconda3/envs/mlflow/lib/python3.9/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'epavlov',
 't': 't'}>, info=<RunInfo: artifact_uri='mlflow-artifacts:/0/d266a8f8c3984b9fa693f7203c99a266/artifacts', end_time=1693354052147, experiment_id='0', lifecycle_stage='active', run_id='d266a8f8c3984b9fa693f7203c99a266', run_name='main', run_uuid='d266a8f8c3984b9fa693f7203c99a266', start_time=1693354052113, status='FINISHED', user_id='epavlov'>, inputs=<RunInputs: dataset_inputs=[]>>

In [20]:
# Display `child_run`.
# NOTE(review): `child_run` is not defined by any cell in this notebook as shown —
# it relies on leftover kernel state; confirm whether this cell is still needed.
child_run

<ActiveRun: >

In [15]:
from aimlflow.utils import convert_existing_logs
from aim.sdk.utils import clean_repo_path
from aim.sdk.repo import Repo

In [17]:
# Open the Aim repository at 'aim-repo2' and hand it to convert_existing_logs together
# with the 'mlruns3' tracking URI (presumably importing those MLflow runs into Aim).
# NOTE(review): the saved output of this cell is a traceback raised while MLflow's
# file store read an experiment's metadata, and the experiment created above stores
# artifacts under "mlruns5" rather than 'mlruns3' — confirm the intended directory.
repo_path = clean_repo_path('aim-repo2')
repo_inst = Repo.from_path(repo_path)
convert_existing_logs(repo_inst, tracking_uri='mlruns3')

Traceback (most recent call last):
  File "/Users/epavlov/opt/anaconda3/envs/mlflow/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 304, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "/Users/epavlov/opt/anaconda3/envs/mlflow/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 397, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/Users/epavlov/opt/anaconda3/envs/mlflow/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1306, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "/Users/epavlov/opt/anaconda3/envs/mlflow/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1299, in _read_helper
    result = read_yaml(root, file_name)
  File "/Users/epavlov/opt/anaconda3/envs/mlflow/lib/python3.9/site-packages/mlflow/utils/file_utils.py", line 282, in read_yaml
    raise MissingConfigEx