# Purpose
Test the mlflow tracking-location utilities (`MlflowLogger`) before kicking off multiple jobs.


# Notebook setup

In [1]:
# Reload edited modules automatically before each cell runs, so changes to
# local project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd

import subclu
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    print_lib_versions, setup_logging, notebook_display_config, 
)


# Record the versions of the key libraries in the notebook output
# so results can be tied to a specific environment.
print_lib_versions([mlflow, np, pd, subclu])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.18.5
pandas		v: 1.2.5
subclu		v: 0.4.0


In [3]:
# plotting
# NOTE(review): these matplotlib imports live outside the main import cell and
# none of the cells visible in this notebook use `mtick` or `mdates`.
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet.
plt.style.use('default')

# Project helpers from subclu.utils.eda.
# Presumably they configure log formatting and notebook/pandas display options
# -- confirm in subclu.utils.eda.
setup_logging()
notebook_display_config()

# Check mlflow location

## Default (without a server)
By default, mlflow will save all data to local files & folders.

In [4]:
# Show the tracking URI mlflow is using before any MlflowLogger is created.
mlflow.get_tracking_uri()

'file:///home/jupyter/subreddit_clustering_i18n/notebooks/tests/mlruns'

## But with my `MlflowLogger` class, I can set it to a local sqlite database

In [5]:
# Creating the logger with tracking_uri='sqlite' changes the tracking URI that
# mlflow reports: the call below now returns a `sqlite:///.../mlruns.db` path.
mlf = MlflowLogger(tracking_uri='sqlite')
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db'

# Get list of experiments with new function
With my custom class, it's easier to pull data from the mlflow database

## As list

In [6]:
# Default output: a list of dicts, one per experiment
# (experiment_id, name, artifact_location, lifecycle_stage).
mlf.list_experiment_meta()

[{'experiment_id': '0',
  'name': 'Default',
  'artifact_location': './mlruns/0',
  'lifecycle_stage': 'active'},
 {'experiment_id': '1',
  'name': 'fse_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/1',
  'lifecycle_stage': 'active'},
 {'experiment_id': '2',
  'name': 'fse_vectorize_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/2',
  'lifecycle_stage': 'active'},
 {'experiment_id': '3',
  'name': 'subreddit_description_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/3',
  'lifecycle_stage': 'active'},
 {'experiment_id': '4',
  'name': 'fse_vectorize_v1.1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/4',
  'lifecycle_stage': 'active'},
 {'experiment_id': '5',
  'name': 'use_multilingual_v0.1_test',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/5',
  'lifecycle_stage': 'active'},
 {'experiment_id': '6',
  'name': 'use_multilingual_v1',
  'artifact_location': '

## As df

In [7]:
# Same experiment metadata, requested as a pandas DataFrame.
mlf.list_experiment_meta(output_format='pandas')

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
0,0,Default,./mlruns/0,active
1,1,fse_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/1,active
2,2,fse_vectorize_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/2,active
3,3,subreddit_description_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/3,active
4,4,fse_vectorize_v1.1,gs://i18n-subreddit-clustering/mlflow/mlruns/4,active
5,5,use_multilingual_v0.1_test,gs://i18n-subreddit-clustering/mlflow/mlruns/5,active
6,6,use_multilingual_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/6,active
7,7,use_multilingual_v1_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/7,active
8,8,use_multilingual_v1_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/8,active
9,9,v0.3.2_use_multi_inference_test,gs://i18n-subreddit-clustering/mlflow/mlruns/9,active


# Get run metadata

## For a specific experiment ID (or multiple IDs)

In [15]:
# Limit the search to experiments 4 and 5 and preview the first 8 runs.
# Each row is one run; columns are prefixed with metrics.*, params.* and tags.*.
mlf.search_all_runs(experiment_ids=[4, 5]).head(8)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.df_vect_comments_cols,metrics.df_vect_posts_cols,metrics.vectorizing_time_minutes,metrics.df_vect_subreddits_description_rows,metrics.df_vect_posts_rows,metrics.df_vect_comments_rows,metrics.df_vect_subreddits_description_cols,metrics.df_subs_len,params.col_text_post,params.host_name,params.subreddits_path,params.tokenize_lowercase,params.posts_path,params.col_text_comment_word_count,params.tokenize_function,params.col_text_post_url,params.col_post_id,params.tf_limit_first_n_chars,params.col_comment_id,params.n_sample_comments,params.model_name,params.tf_batch_inference_rows,params.model_location,params.n_sample_posts,params.bucket_name,params.col_text_subreddit_description,params.col_subreddit_id,params.col_text_comment,params.col_text_post_word_count,params.comments_path,params.preprocess_text_folder,params.col_text_subreddit_word_count,params.training_data,params.train_exclude_duplicated_docs,params.train_use_comments,params.train_min_word_count,tags.host_name,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.source.name,tags.mlflow.source.git.commit,tags.mlflow.runName
0,45201072143a4d7fbb86a2f2b7d85520,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/45201072143a4d7fbb86a2f2b7d85520/artifacts,2021-07-01 10:13:10.544000+00:00,2021-07-01 10:13:29.348000+00:00,512.0,512.0,1.212334,629.0,1500.0,2100.0,512.0,,text,tensorflow-2-3-20210617-fix,subreddits/de/2021-06-16,True,posts/de/2021-06-16,comment_text_word_count,sklearn,post_url_for_embeddings,post_id,1000.0,comment_id,2100.0,use_multilingual,1000.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,1500.0,i18n-subreddit-clustering,subreddit_name_title_and_clean_descriptions,subreddit_id,comment_body_text,text_word_count,comments/de/2021-06-16,,subreddit_name_title_and_clean_descriptions_word_count,,,,,tensorflow-2-3-20210617-fix,LOCAL,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d,test_n_samples
1,b0569cb9a7fa4820a940cb6eee6f2045,5,KILLED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/b0569cb9a7fa4820a940cb6eee6f2045/artifacts,2021-07-01 10:05:26.367000+00:00,2021-07-01 10:12:16.530000+00:00,,512.0,,629.0,1500.0,,512.0,,text,tensorflow-2-3-20210617-fix,subreddits/de/2021-06-16,True,posts/de/2021-06-16,comment_text_word_count,sklearn,post_url_for_embeddings,post_id,1000.0,comment_id,2100.0,use_multilingual,1000.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,1500.0,i18n-subreddit-clustering,subreddit_name_title_and_clean_descriptions,subreddit_id,comment_body_text,text_word_count,comments/de/2021-06-16,,subreddit_name_title_and_clean_descriptions_word_count,,,,,tensorflow-2-3-20210617-fix,LOCAL,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d,test_n_samples
2,19cc9e3673b24b10bc56b96ccf3fefb7,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/19cc9e3673b24b10bc56b96ccf3fefb7/artifacts,2021-07-01 10:00:28.094000+00:00,2021-07-01 10:00:45.939000+00:00,512.0,512.0,1.219346,629.0,1500.0,2100.0,512.0,,text,tensorflow-2-3-20210617-fix,subreddits/de/2021-06-16,True,posts/de/2021-06-16,comment_text_word_count,sklearn,post_url_for_embeddings,post_id,1000.0,comment_id,2100.0,use_multilingual,1000.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,1500.0,i18n-subreddit-clustering,subreddit_name_title_and_clean_descriptions,subreddit_id,comment_body_text,text_word_count,comments/de/2021-06-16,,subreddit_name_title_and_clean_descriptions_word_count,,,,,tensorflow-2-3-20210617-fix,LOCAL,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d,test_n_samples
3,e48b0170c7ec4b3a9f4712676de6115e,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/e48b0170c7ec4b3a9f4712676de6115e/artifacts,2021-07-01 09:50:34.758000+00:00,2021-07-01 09:50:53.902000+00:00,512.0,512.0,1.366118,629.0,1500.0,2200.0,512.0,,text,tensorflow-2-3-20210617-fix,subreddits/de/2021-06-16,False,posts/de/2021-06-16,comment_text_word_count,sklearn,post_url_for_embeddings,post_id,1200.0,comment_id,2200.0,use_multilingual,2000.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,1500.0,i18n-subreddit-clustering,subreddit_name_title_and_clean_descriptions,subreddit_id,comment_body_text,text_word_count,comments/de/2021-06-16,,subreddit_name_title_and_clean_descriptions_word_count,,,,,tensorflow-2-3-20210617-fix,LOCAL,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d,test_n_samples
4,f4146aeea6f740ceadcf53c3da0c55cc,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/f4146aeea6f740ceadcf53c3da0c55cc/artifacts,2021-07-01 09:26:54.619000+00:00,2021-07-01 09:27:14.869000+00:00,512.0,512.0,1.346785,629.0,2988.0,3500.0,512.0,,text,tensorflow-2-3-20210617-fix,subreddits/de/2021-06-16,False,posts/de/2021-06-16,comment_text_word_count,sklearn,post_url_for_embeddings,post_id,1200.0,comment_id,3500.0,use_multilingual,3000.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,1500.0,i18n-subreddit-clustering,subreddit_name_title_and_clean_descriptions,subreddit_id,comment_body_text,text_word_count,comments/de/2021-06-16,,subreddit_name_title_and_clean_descriptions_word_count,,,,,tensorflow-2-3-20210617-fix,LOCAL,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d,
5,228dc2d325af46aa8b77fcf9b48fb4f9,5,KILLED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/228dc2d325af46aa8b77fcf9b48fb4f9/artifacts,2021-07-01 09:18:34.401000+00:00,2021-07-01 09:24:45.508000+00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,LOCAL,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d,
6,86a7fd9c16924966a46daecf71f32597,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/86a7fd9c16924966a46daecf71f32597/artifacts,2021-07-01 09:06:22.773000+00:00,2021-07-01 09:06:43.136000+00:00,512.0,512.0,1.422616,629.0,1997.0,2500.0,512.0,,text,,subreddits/de/2021-06-16,False,posts/de/2021-06-16,comment_text_word_count,sklearn,post_url_for_embeddings,post_id,1200.0,comment_id,2500.0,use_multilingual,2000.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual/3,1000.0,i18n-subreddit-clustering,subreddit_name_title_and_clean_descriptions,subreddit_id,comment_body_text,text_word_count,comments/de/2021-06-16,,subreddit_name_title_and_clean_descriptions_word_count,,,,,tensorflow-2-3-20210617-fix,LOCAL,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d,
7,49934e85170b44bf8da96949b28b9edd,5,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/5/49934e85170b44bf8da96949b28b9edd/artifacts,2021-07-01 07:59:05.396000+00:00,2021-07-01 09:04:33.319000+00:00,,,,629.0,,,512.0,,text,,subreddits/de/2021-06-16,False,posts/de/2021-06-16,comment_text_word_count,sklearn,post_url_for_embeddings,post_id,1200.0,comment_id,1500.0,use_multilingual_large,2000.0,https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3,1000.0,i18n-subreddit-clustering,subreddit_name_title_and_clean_descriptions,subreddit_id,comment_body_text,text_word_count,comments/de/2021-06-16,,subreddit_name_title_and_clean_descriptions_word_count,,,,,tensorflow-2-3-20210617-fix,LOCAL,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,313f8a4a1228c7f6cf6230c85f8673923d33fe3d,


## For all runs

In [16]:
# experiment_ids=None -> search runs across all experiments.
# Only the shape is displayed here because the full frame is very wide.
mlf.search_all_runs(experiment_ids=None).shape

(245, 140)

# Get artifact based only on run_id

## Using new function

In [23]:
# TODO(djb): fix read_csv error
# Observed failure (see the traceback in this cell's output): the call raises
#   IsADirectoryError for '<local cache>/.../artifacts/d_ix_to_id/d_ix_to_id.csv',
#   i.e. the path that ends up being read is a directory, not a file.
# NOTE(review): the log line "Parquet files found: 0" suggests the helper looks for
#   parquet files in the folder even when read_function=pd.read_csv -- confirm in
#   MlflowLogger.read_run_artifact.
run_id = 'aac3e007dfc2446790e25887adf287f6'

df_ix_to_id = mlf.read_run_artifact(
    run_id=run_id,
    artifact_folder='d_ix_to_id',
    artifact_file='d_ix_to_id.csv',
    read_function=pd.read_csv,
)
print(df_ix_to_id.shape)
df_ix_to_id.head()

18:55:03 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/4/aac3e007dfc2446790e25887adf287f6/artifacts/d_ix_to_id"


  0%|          | 0/1 [00:00<?, ?it/s]

18:55:03 | INFO | "  Parquet files found: 0"


path to load
 /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/4/aac3e007dfc2446790e25887adf287f6/artifacts/d_ix_to_id
path to TYPE
 <class 'pathlib.PosixPath'>
list of parquet
 []


IsADirectoryError: [Errno 21] Is a directory: '/home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/4/aac3e007dfc2446790e25887adf287f6/artifacts/d_ix_to_id/d_ix_to_id.csv'

In [None]:
run_id = '99158c385c5442b1bf5ff96fc44af7da'

# Load the `df_sub_level_agg_a_post_only` artifact for this run.
# The result is named after the artifact folder so it does not reuse (and
# overwrite) `df_ix_to_id`, which holds a different kind of data.
# No `artifact_file` is passed: presumably the helper then reads every parquet
# file in the folder with `read_function` -- confirm in
# MlflowLogger.read_run_artifact.
df_sub_level_agg_a_post_only = mlf.read_run_artifact(
    run_id=run_id,
    artifact_folder='df_sub_level_agg_a_post_only',
    read_function=pd.read_parquet,
)
print(df_sub_level_agg_a_post_only.shape)
df_sub_level_agg_a_post_only.head()

## Or you could do it manually... 
By pulling the run's `artifact_uri`

In [11]:
# Look the run up through the mlflow API; `run.info.artifact_uri` is the
# root location of this run's artifacts.
run_id = 'aac3e007dfc2446790e25887adf287f6'
run = mlflow.get_run(run_id)

In [12]:
# Build the full path to the CSV artifact from the run's artifact root
# (displayed only as a sanity check; it is a gs:// URI for this run).
f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv"

'gs://i18n-subreddit-clustering/mlflow/mlruns/4/aac3e007dfc2446790e25887adf287f6/artifacts/d_ix_to_id/d_ix_to_id.csv'

In [13]:
# Read the CSV straight from the run's artifact location.
# The path is a gs:// URI, so pandas reads it from the bucket directly.
df_idx = pd.read_csv(f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv")
print(df_idx.shape)
df_idx.head()

(111669, 2)


Unnamed: 0,training_index,post_id
0,0,t3_mkyj2k
1,1,t3_mkynzi
2,2,t3_mkyolv
3,3,t3_mkyp17
4,4,t3_mkyqrz


# Test connecting mlflow to remote server...

Connecting to the remote server does not work yet: some VPC/authentication/firewall configuration still needs to be sorted out first.

In [14]:
# NOTE(review): this cell is intentionally disabled (all lines commented out)
# until network access to the central mlflow server works.
# Before re-enabling, consider moving the hard-coded server addresses into
# configuration or an environment variable.
# mlflow_central_server_internal_ip = "https://10.138.0.3:5000" # internal
# # mlflow_central_server_internal_ip = "https://34.82.93.40:5000" # external...

# mlf = MlflowLogger(tracking_uri=mlflow_central_server_internal_ip)
# mlflow.get_tracking_uri()