# Purpose

2022-03
One of the big challenges in using the current clusters is that they don't have names, only IDs. Not having labels (besides the generic primary topic) makes it harder for people to understand and use them.

In this notebook we'll try a baseline TF-IDF approach to create cluster labels. We could use these labels during QA and/or during curation.


# Imports & Setup

In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell executes (edits to project code are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import mlflow
import hydra

import subclu
from subclu.eda.aggregates import compare_raw_v_weighted_language
from subclu.utils import set_working_directory, get_project_subfolder
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric, reorder_array,
)
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.hydra_config_loader import LoadHydraConfig
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl, 
    get_color_dict, base_colors_for_manual_labels,
    check_colors_used,
)
from subclu.data.data_loaders import LoadPosts, LoadSubreddits, create_sub_level_aggregates


# ===
# imports specific to this notebook



# Record the versions of the key libraries used in this notebook
# (printed below) so results can be tied to a specific environment.
print_lib_versions([hydra, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
hydra		v: 1.1.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.4.1


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet for all figures.
plt.style.use('default')

# Project helpers imported from subclu.utils.eda.
# NOTE(review): presumably these configure log formatting and pandas/notebook
# display options -- confirm against the subclu source.
setup_logging()
notebook_display_config()

# Set sqlite database as MLflow URI

In [4]:
# use new class to initialize mlflow
# MlflowLogger is the project wrapper from subclu.utils.mlflow_logger; it is
# created here with tracking_uri='sqlite'.
mlf = MlflowLogger(tracking_uri='sqlite')
# Last expression: display the tracking URI mlflow is currently using, as a
# sanity check that it points at the expected sqlite database.
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db'

## Get list of experiments with new function

In [5]:
# Show only the last 9 rows of the experiment metadata table, returned as a
# pandas DataFrame, to find the experiment that holds the runs of interest.
mlf.list_experiment_meta(output_format='pandas').tail(9)

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
17,17,v0.4.0_use_multi_clustering_test,gs://i18n-subreddit-clustering/mlflow/mlruns/17,active
18,18,v0.4.0_use_multi_clustering,gs://i18n-subreddit-clustering/mlflow/mlruns/18,active
19,19,v0.4.1_mUSE_inference_test,gs://i18n-subreddit-clustering/mlflow/mlruns/19,active
20,20,v0.4.1_mUSE_inference,gs://i18n-subreddit-clustering/mlflow/mlruns/20,active
21,21,v0.4.1_mUSE_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/21,active
22,22,v0.4.1_mUSE_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/22,active
23,23,v0.4.1_mUSE_clustering_test,gs://i18n-subreddit-clustering/mlflow/mlruns/23,active
24,24,v0.4.1_mUSE_clustering,gs://i18n-subreddit-clustering/mlflow/mlruns/24,active
25,25,v0.4.1_mUSE_clustering_new_metrics,gs://i18n-subreddit-clustering/mlflow/mlruns/25,active


## Get experiment IDs for models to check

Experiment ID 25 (`v0.4.1_mUSE_clustering_new_metrics`) has the latest runs, so we'll use it.

In [6]:
%%time

# Fetch the runs logged under experiment 25 into a DataFrame.
# NOTE(review): 25 is hard-coded here; consider a named constant if it is
# reused elsewhere in the notebook.
df_mlf = mlf.search_all_runs(experiment_ids=[25])
# Last expression: display the frame's (rows, columns) shape.
df_mlf.shape

CPU times: user 1.39 s, sys: 72.6 ms, total: 1.46 s
Wall time: 1.46 s


(132, 273)

In [7]:
# Preview only the first 5 rows and first 10 columns; the runs frame is very
# wide (273 columns in the recorded run), so avoid displaying all of it.
df_mlf.iloc[:5, :10]

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.primary_topic-2700_to_3000-f1_score-macro_avg,metrics.primary_topic-0250_to_0500-recall-weighted_avg,metrics.optimal_k-0010_to_0020,metrics.primary_topic-0750_to_1000-adjusted_rand_score
0,a6ee2f75491d4449a05fad502d7b80c3,25,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/25/a6ee2f75491d4449a05fad502d7b80c3/artifacts,2022-01-20 19:50:19.553000+00:00,2022-01-20 19:54:40.237000+00:00,0.15018,0.398119,11.0,0.375685
1,4b246da72d254bf9888962d483ed49a3,25,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/25/4b246da72d254bf9888962d483ed49a3/artifacts,2022-01-20 19:49:26.745000+00:00,2022-01-20 20:02:40.384000+00:00,0.478527,0.268315,14.0,0.03643
2,619e29db458a43e6ac726eac7145db89,25,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/25/619e29db458a43e6ac726eac7145db89/artifacts,2022-01-20 19:49:09.043000+00:00,2022-01-20 19:53:31.024000+00:00,0.15479,0.400613,10.0,0.384005
3,25c5dfaa03d34da88fdfb3a1850d7d44,25,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/25/25c5dfaa03d34da88fdfb3a1850d7d44/artifacts,2022-01-20 19:48:44.509000+00:00,2022-01-20 20:02:25.618000+00:00,0.471462,0.268139,14.0,0.012955
4,b29776a461994e00b00139ec1bb6270b,25,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/25/b29776a461994e00b00139ec1bb6270b/artifacts,2022-01-20 19:46:49.578000+00:00,2022-01-20 19:51:15.689000+00:00,0.145046,0.398264,17.0,0.382473


# Load labels from selected model
This run was selected for model v0.4.1, so let's use its labels for our analysis:

`e37b0a2c3af54c588818e7efdde15df5`


In [8]:
# MLflow run ID of the clustering run selected for model v0.4.1.
model_uuid = 'e37b0a2c3af54c588818e7efdde15df5'
# List the artifacts/folders logged for this run to locate the
# cluster labels (expected under 'df_labels').
mlf.list_run_artifacts(model_uuid)

18:56:43 | INFO | "    93 <- Artifacts clean count"
18:56:43 | INFO | "    12 <- Artifacts & folders at TOP LEVEL clean count"


['X_linkage',
 'clustering.log',
 'clustering_model',
 'config',
 'df_accel',
 'df_classification_reports',
 'df_labels',
 'df_supervised_metrics',
 'figures',
 'hydra',
 'optimal_ks',
 'pipeline_params']

In [10]:
# Interactive help lookup: show the signature of the artifact reader.
# NOTE(review): leftover `?` introspection from development -- consider
# removing this cell before sharing the notebook.
?mlf.read_run_artifact

[0;31mSignature:[0m
[0mmlf[0m[0;34m.[0m[0mread_run_artifact[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mrun_id[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0martifact_folder[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0martifact_file[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mexperiment_ids[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mint[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mread_function[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0;34m<[0m[0mbuilt[0m[0;34m-[0m[0;32min[0m [0mfunction[0m [0mcallable[0m[0;34m>[0m[0;34m,[0m [0mstr[0m[0;34m][0m [0;34m=[0m [0;34m'pd_parquet'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m:[0m [0;34m<[0m[0mbuilt[0m[0;34m-[0m[0;32min[0m [0mf

In [13]:
# Download the selected run's 'df_labels' artifact folder and read it
# with the 'pd_parquet' reader.
# NOTE(review): in the recorded run this call raised FileNotFoundError for
# '<local cache>/df_labels/*.parquet' even though the log reports one parquet
# file found. It looks like a glob pattern is being passed to the reader as a
# literal path -- confirm how read_function='pd_parquet' resolves files, or
# pass `artifact_file` explicitly. Until this is fixed, `df_labels` is not
# defined for the cells that follow.
df_labels = mlf.read_run_artifact(
    run_id=model_uuid,
    artifact_folder='df_labels',
    read_function='pd_parquet'
)

18:59:36 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/25/e37b0a2c3af54c588818e7efdde15df5/artifacts/df_labels"
100%|##########################################| 2/2 [00:00<00:00, 10034.22it/s]
18:59:36 | INFO | "  Parquet files found:     1"
18:59:36 | INFO | "  Parquet files to use:     1"


FileNotFoundError: [Errno 2] No such file or directory: '/home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/25/e37b0a2c3af54c588818e7efdde15df5/artifacts/df_labels/*.parquet'

# Level 1 - only subreddit metadata