# Purpose
Test mlflow location utils before kicking off multiple jobs.


# Notebook setup

In [1]:
# Auto-reload imported modules before each cell executes, so edits to local
# packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd

import subclu
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    print_lib_versions, setup_logging, notebook_display_config, 
)


# Record the Python and library versions this notebook was run with.
print_lib_versions([mlflow, np, pd, subclu])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
subclu		v: 0.1.2


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet for all figures.
plt.style.use('default')

# Project helpers from subclu.utils.eda; presumably they configure logging and
# notebook display options -- TODO confirm against their definitions.
setup_logging()
notebook_display_config()

# Check mlflow location

## Default (without a server)
By default, mlflow will save all data to local files & folders.

In [4]:
# Show the tracking URI mlflow is currently configured with.
mlflow.get_tracking_uri()

'file:///home/jupyter/subreddit_clustering_i18n/notebooks/tests/mlruns'

## But with my `MlflowLogger` class, I can set it to a local sqlite database

In [5]:
# Create the project's logger wrapper with the 'sqlite' option, then re-check
# the tracking URI to confirm where mlflow now points.
mlf = MlflowLogger(tracking_uri='sqlite')
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/mlflow/mlruns.db'

# Get list of experiments with new function
With my custom class, it's easier to pull data from the mlflow database

## As list

In [6]:
# Default output: experiment metadata as a list of dicts, one per experiment.
mlf.list_experiment_meta()

[{'experiment_id': '0',
  'name': 'Default',
  'artifact_location': './mlruns/0',
  'lifecycle_stage': 'active'},
 {'experiment_id': '1',
  'name': 'fse_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/1',
  'lifecycle_stage': 'active'},
 {'experiment_id': '2',
  'name': 'fse_vectorize_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/2',
  'lifecycle_stage': 'active'},
 {'experiment_id': '3',
  'name': 'subreddit_description_v1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/3',
  'lifecycle_stage': 'active'},
 {'experiment_id': '4',
  'name': 'fse_vectorize_v1.1',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/4',
  'lifecycle_stage': 'active',
  'tags': [{'key': 'mlflow.note.content', 'value': 'Posts & comments.'}]},
 {'experiment_id': '5',
  'name': 'use_multilingual_v0.1_test',
  'artifact_location': 'gs://i18n-subreddit-clustering/mlflow/mlruns/5',
  'lifecycle_stage': 'active'},
 {'expe

## As df

In [7]:
# Same experiment metadata, requested in tabular (pandas) form.
mlf.list_experiment_meta(output_format='pandas')

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage,tags
0,0,Default,./mlruns/0,active,
1,1,fse_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/1,active,
2,2,fse_vectorize_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/2,active,
3,3,subreddit_description_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/3,active,
4,4,fse_vectorize_v1.1,gs://i18n-subreddit-clustering/mlflow/mlruns/4,active,"[{'key': 'mlflow.note.content', 'value': 'Posts & comments.'}]"
5,5,use_multilingual_v0.1_test,gs://i18n-subreddit-clustering/mlflow/mlruns/5,active,
6,6,use_multilingual_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/6,active,


# Get run metadata

## For a specific experiment ID (or multiple IDs)

In [8]:
# Restrict the search to experiment ID 4 and preview the first 8 runs.
mlf.search_all_runs(experiment_ids=[4]).head(8)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.training_docs_count,metrics.df_posts_len,metrics.vectorizing_time_minutes,metrics.df_comments_len,params.posts_path,params.bucket_name,params.train_min_word_count,params.preprocess_text_folder,params.col_text_comment,params.tokenize_function,params.col_text_comment_word_count,params.col_text_post_url,params.col_subreddit_id,params.subreddits_path,params.training_data,params.tokenize_lowercase,params.train_use_comments,params.col_text_subreddit_description,params.train_exclude_duplicated_docs,params.col_post_id,params.col_text_post,params.col_comment_id,params.comments_path,params.model_name,params.col_text_subreddit_word_count,params.col_text_post_word_count,tags.mlflow.user,tags.mlflow.source.name,tags.mlflow.source.git.commit,tags.mlflow.source.type
0,aac3e007dfc2446790e25887adf287f6,4,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/4/aac3e007dfc2446790e25887adf287f6/artifacts,2021-06-02 07:50:12.635000+00:00,2021-06-02 07:53:23.837000+00:00,43452.0,111669.0,3.381014,638052.0,posts/2021-05-19,i18n-subreddit-clustering,4,,comment_body_text,gensim,comment_text_word_count,post_url_for_embeddings,subreddit_id,,post_title_and_body,False,False,subreddit_name_title_and_clean_descriptions,True,post_id,text,comment_id,comments/2021-05-19,fasttext_usif_de,subreddit_name_title_and_clean_descriptions_word_count,text_word_count,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,578a60e37c39cfd2a09de1da8a41ec9f9b76befd,LOCAL
1,ecb1e4292c8b43159c8e982b75bb7988,4,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/4/ecb1e4292c8b43159c8e982b75bb7988/artifacts,2021-06-02 07:50:00.233000+00:00,2021-06-02 07:53:14.996000+00:00,31790.0,111669.0,3.517798,638052.0,posts/2021-05-19,i18n-subreddit-clustering,4,,comment_body_text,gensim,comment_text_word_count,post_url_for_embeddings,subreddit_id,,post_title_and_body,False,False,subreddit_name_title_and_clean_descriptions,True,post_id,text,comment_id,comments/2021-05-19,fasttext_usif_de,subreddit_name_title_and_clean_descriptions_word_count,text_word_count,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,578a60e37c39cfd2a09de1da8a41ec9f9b76befd,LOCAL
2,7052ba2c99c5454f9a29d662fc9e40aa,4,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/4/7052ba2c99c5454f9a29d662fc9e40aa/artifacts,2021-06-02 07:46:14.355000+00:00,2021-06-02 07:49:59.094000+00:00,43452.0,111669.0,3.934723,638052.0,posts/2021-05-19,i18n-subreddit-clustering,4,,comment_body_text,gensim,comment_text_word_count,post_url_for_embeddings,subreddit_id,,post_title_and_body,True,False,subreddit_name_title_and_clean_descriptions,True,post_id,text,comment_id,comments/2021-05-19,fasttext_usif_de,subreddit_name_title_and_clean_descriptions_word_count,text_word_count,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,578a60e37c39cfd2a09de1da8a41ec9f9b76befd,LOCAL
3,fcd57925c012414491f90c29df5bc7d1,4,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/4/fcd57925c012414491f90c29df5bc7d1/artifacts,2021-06-02 07:45:59.736000+00:00,2021-06-02 07:49:42.134000+00:00,31790.0,111669.0,3.915259,638052.0,posts/2021-05-19,i18n-subreddit-clustering,4,,comment_body_text,gensim,comment_text_word_count,post_url_for_embeddings,subreddit_id,,post_title_and_body,True,False,subreddit_name_title_and_clean_descriptions,True,post_id,text,comment_id,comments/2021-05-19,fasttext_usif_de,subreddit_name_title_and_clean_descriptions_word_count,text_word_count,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,578a60e37c39cfd2a09de1da8a41ec9f9b76befd,LOCAL
4,1a57bdc48f6d42009ce2fccb6d850021,4,KILLED,gs://i18n-subreddit-clustering/mlflow/mlruns/4/1a57bdc48f6d42009ce2fccb6d850021/artifacts,2021-06-02 07:41:48.367000+00:00,2021-06-02 07:46:02.572000+00:00,43452.0,,,,posts/2021-05-19,i18n-subreddit-clustering,4,,comment_body_text,gensim,comment_text_word_count,post_url_for_embeddings,subreddit_id,,post_title_and_body,True,False,subreddit_name_title_and_clean_descriptions,True,post_id,text,comment_id,comments/2021-05-19,fasttext_usif_de,subreddit_name_title_and_clean_descriptions_word_count,text_word_count,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,de59320181d67c9d75c3db7ab19d07283b587a54,LOCAL
5,87471c528fb54ccba18b317322d7aace,4,KILLED,gs://i18n-subreddit-clustering/mlflow/mlruns/4/87471c528fb54ccba18b317322d7aace/artifacts,2021-06-02 07:41:11.661000+00:00,2021-06-02 07:45:47.102000+00:00,31790.0,,,,posts/2021-05-19,i18n-subreddit-clustering,4,,comment_body_text,gensim,comment_text_word_count,post_url_for_embeddings,subreddit_id,,post_title_and_body,True,False,subreddit_name_title_and_clean_descriptions,True,post_id,text,comment_id,comments/2021-05-19,fasttext_usif_de,subreddit_name_title_and_clean_descriptions_word_count,text_word_count,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,de59320181d67c9d75c3db7ab19d07283b587a54,LOCAL
6,61e5d620095e491b8bff72895e8b9855,4,KILLED,gs://i18n-subreddit-clustering/mlflow/mlruns/4/61e5d620095e491b8bff72895e8b9855/artifacts,2021-06-02 07:28:36.526000+00:00,2021-06-02 07:40:57.756000+00:00,31790.0,,,,posts/2021-05-19,i18n-subreddit-clustering,4,,comment_body_text,gensim,comment_text_word_count,post_url_for_embeddings,subreddit_id,,post_title_and_body,True,False,subreddit_name_title_and_clean_descriptions,True,post_id,text,comment_id,comments/2021-05-19,fasttext_usif_de,subreddit_name_title_and_clean_descriptions_word_count,text_word_count,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,5f40b761b8590860951ad603d6806efff5cb2eed,LOCAL
7,3bf280ee76fc4595afc5e8cbaaf79a7d,4,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/4/3bf280ee76fc4595afc5e8cbaaf79a7d/artifacts,2021-06-02 07:25:30.227000+00:00,2021-06-02 07:28:22.093000+00:00,31790.0,111669.0,3.090045,638052.0,posts/2021-05-19,i18n-subreddit-clustering,4,,comment_body_text,sklearn_acronyms_emoji,comment_text_word_count,post_url_for_embeddings,subreddit_id,,post_title_and_body,True,False,subreddit_name_title_and_clean_descriptions,True,post_id,text,comment_id,comments/2021-05-19,fasttext_usif_de,subreddit_name_title_and_clean_descriptions_word_count,text_word_count,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,5f40b761b8590860951ad603d6806efff5cb2eed,LOCAL


## For all runs

In [9]:
# No experiment filter (experiment_ids=None); only display the (rows, columns)
# shape of the result rather than the full table.
mlf.search_all_runs(experiment_ids=None).shape

(32, 41)

# Get artifact based only on run_id

## Using new function

In [15]:
# Run whose artifact we want to load.
run_id = 'aac3e007dfc2446790e25887adf287f6'

# Load the training_index -> post_id mapping CSV stored with this run, using
# only the run ID. `read_function` is presumably the reader applied to the
# resolved artifact location -- TODO confirm against MlflowLogger.
# NOTE(review): `artifact_folder` receives a file path here, not a folder.
df_ix_to_id = mlf.read_run_artifact(
    run_id=run_id,
    artifact_folder='d_ix_to_id/d_ix_to_id.csv',
    read_function=pd.read_csv,
)
print(df_ix_to_id.shape)
df_ix_to_id.head()

(111669, 2)


Unnamed: 0,training_index,post_id
0,0,t3_mkyj2k
1,1,t3_mkynzi
2,2,t3_mkyolv
3,3,t3_mkyp17
4,4,t3_mkyqrz


## Or you could do it manually... 
By pulling the run's `artifact_uri`

In [11]:
# Fetch the Run object directly from mlflow for the run ID of interest.
run_id = 'aac3e007dfc2446790e25887adf287f6'
run = mlflow.get_run(run_id)

In [12]:
# Full artifact location = the run's artifact URI + the file's relative path.
f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv"

'gs://i18n-subreddit-clustering/mlflow/mlruns/4/aac3e007dfc2446790e25887adf287f6/artifacts/d_ix_to_id/d_ix_to_id.csv'

In [13]:
# Read the CSV straight from the run's artifact URI (a gs:// location for this
# run), then show its shape and first rows.
df_idx = pd.read_csv(f"{run.info.artifact_uri}/d_ix_to_id/d_ix_to_id.csv")
print(df_idx.shape)
df_idx.head()

(111669, 2)


Unnamed: 0,training_index,post_id
0,0,t3_mkyj2k
1,1,t3_mkynzi
2,2,t3_mkyolv
3,3,t3_mkyp17
4,4,t3_mkyqrz


# Test connecting mlflow to remote server...

Need to sort out some VPC/authentication/firewall configuration before this will work.

In [14]:
# TODO: disabled until VPC/authentication/firewall access to the central
# mlflow server is sorted out. Uncomment to point MlflowLogger at the server.
# NOTE(review): consider reading the server address from configuration or an
# environment variable instead of hardcoding IPs in the notebook.
# mlflow_central_server_internal_ip = "https://10.138.0.3:5000" # internal
# # mlflow_central_server_internal_ip = "https://34.82.93.40:5000" # external...

# mlf = MlflowLogger(tracking_uri=mlflow_central_server_internal_ip)
# mlflow.get_tracking_uri()