# Purpose

2021-08-10: Finally completed testing with sampling <= 10 files. Now ready to run process on full data!

Ended up doing it all in dask + pandas + numpy because of problems installing `cuDF`.

---
2021-08-02: Now that I'm processing millions of comments and posts, I need to re-write the functions to try to do some work in parallel and reduce the amount of data loaded in RAM.

- `Dask` seems like a great option to load data and only compute some of it as needed.
- `cuDF` could be a way to speed up some computation using GPUs
- `Dask-delayed` could be a way to create a task DAG lazily before computing all the aggregates.


---

In notebook 09 I combined embeddings from posts & subreddits (`djb_09.00-combine_post_and_comments_and_visualize_for_presentation.ipynb`).

In this notebook I'll be testing functions that include mlflow so that it's easier to try a lot of different weights to find better representations.

Take embeddings created by other models & combine them:
```
new post embeddings = post + comments + subreddit description

new subreddit embeddings = new posts (weighted by post length or upvotes?)
```

# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import gc
import os
import logging
from pprint import pprint

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import dask
from dask import dataframe as dd
from tqdm.auto import tqdm

import mlflow
import hydra

import subclu
from subclu.models.aggregate_embeddings import (
    AggregateEmbeddings, AggregateEmbeddingsConfig,
    load_config_agg_jupyter, get_dask_df_shape,
)

from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.utils.mlflow_logger import MlflowLogger, save_pd_df_to_parquet_in_chunks
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)


# Record the versions of the main libraries used below so the outputs in this
# notebook can be tied to a specific environment.
print_lib_versions([dask, hydra, mlflow, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
dask		v: 2021.06.0
hydra		v: 1.1.0
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.3.2


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Reset matplotlib to its built-in 'default' style.
plt.style.use('default')

# Project helpers imported from subclu.utils.eda.
# NOTE(review): presumably these configure the logging format seen in the job logs
# below and the pandas/notebook display options -- confirm in subclu.utils.eda.
setup_logging()
notebook_display_config()

# Set sqlite database as MLflow URI

In [4]:
# Use the project's MlflowLogger wrapper to initialize mlflow with the 'sqlite'
# tracking option.
mlf = MlflowLogger(tracking_uri='sqlite')
# Last expression of the cell: display the tracking URI that mlflow ended up with,
# to confirm it points at the expected sqlite database.
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db'

## Get list of experiments with new function

In [5]:
# Show the experiments known to the tracking server as a pandas dataframe
# (id, name, artifact location, lifecycle stage).
mlf.list_experiment_meta(output_format='pandas')

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
0,0,Default,./mlruns/0,active
1,1,fse_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/1,active
2,2,fse_vectorize_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/2,active
3,3,subreddit_description_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/3,active
4,4,fse_vectorize_v1.1,gs://i18n-subreddit-clustering/mlflow/mlruns/4,active
5,5,use_multilingual_v0.1_test,gs://i18n-subreddit-clustering/mlflow/mlruns/5,active
6,6,use_multilingual_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/6,active
7,7,use_multilingual_v1_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/7,active
8,8,use_multilingual_v1_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/8,active
9,9,v0.3.2_use_multi_inference_test,gs://i18n-subreddit-clustering/mlflow/mlruns/9,active


## Get runs that we can use for embeddings aggregation jobs

In [6]:
%%time

# IDs of the MLflow experiments to search for candidate embedding runs.
# Keep every ID the same type: the list used to mix ints with the string '10'.
experiment_ids_to_search = [9, 10, 11, 12]

# One row per run, one column per run attribute/metric/param/tag.
df_mlf_runs = mlf.search_all_runs(experiment_ids=experiment_ids_to_search)
df_mlf_runs.shape

CPU times: user 254 ms, sys: 12.5 ms, total: 266 ms
Wall time: 266 ms


(110, 101)

In [7]:
# Runs that completed normally.
# NOTE(review): this mask is defined but NOT applied to the selection below, so the
# selection can include runs that did not finish (the output lists a KILLED run
# whose post embeddings were still logged). Confirm that is intended before adding it.
# To inspect only finished runs: df_mlf_runs[mask_finished].shape
mask_finished = df_mlf_runs['status'].eq('FINISHED')

# Runs that logged at least 1M vectorized posts OR at least 1M vectorized comments.
# Runs missing a metric (NaN) evaluate to False for that metric.
has_1m_posts = df_mlf_runs['metrics.df_vect_posts_rows'].ge(1e6)
has_1m_comments = df_mlf_runs['metrics.df_vect_comments'].ge(1e6)
mask_output_over_1M_rows = has_1m_posts | has_1m_comments

# Candidate runs whose embeddings are large enough to use in aggregation jobs.
df_mlf_use_for_agg = df_mlf_runs.loc[mask_output_over_1M_rows]
df_mlf_use_for_agg.shape

(3, 101)

In [8]:
# Keep only the columns where the candidate runs differ from each other
# (NaN counts as its own value), so the comparison table stays readable.
n_unique_per_col = df_mlf_use_for_agg.nunique(dropna=False)
cols_with_multiple_vals = df_mlf_use_for_agg.columns[n_unique_per_col > 1]

# Drop the long/low-value columns and fully-empty columns, then cap the table at
# the first 30 columns for display.
df_runs_to_display = (
    df_mlf_use_for_agg[cols_with_multiple_vals]
    .drop(columns=['artifact_uri', 'end_time'])
    .dropna(axis='columns', how='all')
    .iloc[:, :30]
)

style_df_numeric(df_runs_to_display, rename_cols_for_display=True)

Unnamed: 0,run id,experiment id,status,start time,metrics.df vect subreddits description rows,metrics.vectorizing time minutes posts,metrics.total comment files processed,metrics.vectorizing time minutes subreddit meta,metrics.df vect comments,metrics.df vect subreddits description cols,metrics.vectorizing time minutes comments,metrics.vectorizing time minutes full function,metrics.df vect posts rows,metrics.df vect posts cols,params.tf batch inference rows,params.tokenize lowercase,params.tf limit first n chars,params.batch comment files,params.posts path,params.subreddits path,tags.mlflow.source.git.commit,tags.mlflow.runName
87,a948e9fd651545f997430cddc6b529eb,10,FINISHED,2021-07-29 23:02:33.997000+00:00,3767.00,14.74,37.00,0.08,19168154.00,514.00,145.73,176.77,1649929.00,515.00,2000,True,1000,True,posts/top/2021-07-16,subreddits/top/2021-07-16,63f5f420fb6b48d8243749cba183071757dac531,new_batch_fxn-2021-07-29_230233
89,e66c5db26bd64f6da09c012eea700d0a,10,FINISHED,2021-07-29 18:59:48.715000+00:00,-,-,37.00,-,19200854.00,-,117.47,133.16,-,-,6100,False,850,True,,,64f49e85a8ef56a6795edf9da9a6f5964cb6830b,new_batch_fxn_2021-07-29_185948
98,614a38e6690c4f3ba08725b1585b2ee9,9,KILLED,2021-07-29 11:49:53.924000+00:00,3767.00,10.01,-,0.07,-,514.00,-,-,1649929.00,515.00,2100,False,1000,,posts/top/2021-07-16,subreddits/top/2021-07-16,64f49e85a8ef56a6795edf9da9a6f5964cb6830b,test_new_fxn2021-07-29_114953


# Load configs for aggregation jobs

`n_sample_comments_files` and `n_sample_posts_files` allow us to only load a few files at a time (e.g., 2 instead of 50) to test the process end-to-end.

---
Note that by default `hydra` is a CLI tool. If we want to use it in Jupyter, we need to manually initialize configs & compose the configuration. See my custom function `load_config_agg_jupyter`. Also see:
- [Notebook with `Hydra` examples in a notebook](https://github.com/facebookresearch/hydra/blob/master/examples/jupyter_notebooks/compose_configs_in_notebook.ipynb).
- [Hydra docs, Hydra in Jupyter](https://hydra.cc/docs/next/advanced/jupyter_notebooks/).


In [9]:
# MLflow experiments: one for end-to-end tests, one for the real (full data) runs.
mlflow_experiment_test = 'v0.3.2_use_multi_aggregates_test'
mlflow_experiment_full = 'v0.3.2_use_multi_aggregates'

# Values for the `data_embeddings_to_aggregate` override: which set of embeddings
# (text lower-cased or not before vectorizing) the job should aggregate.
embeddings_lc_false = 'top_subs-2021_07_16-use_multi_lower_case_false'
embeddings_lc_true = 'top_subs-2021_07_16-use_multi_lower_case_true'


def _make_agg_config(
        mlflow_experiment,
        data_embeddings_to_aggregate,
        n_sample_posts_files=None,
        n_sample_comments_files=None,
):
    """Compose an AggregateEmbeddingsConfig for one aggregation job.

    All configs in this notebook share the same config path/name and only differ in
    the four overrides below, so they are built in one place.

    Args:
        mlflow_experiment: name of the MLflow experiment to log the job under.
        data_embeddings_to_aggregate: value for the override of the same name.
        n_sample_posts_files: number of post files to sample, or None to pass
            `null` (no sampling limit set in the override).
        n_sample_comments_files: number of comment files to sample, or None to pass
            `null` (no sampling limit set in the override).

    Returns:
        The AggregateEmbeddingsConfig built from `../config/aggregate_embeddings`
        plus the overrides.
    """
    def _override_value(value):
        # Overrides are plain strings; None has to be spelled `null`.
        return 'null' if value is None else value

    return AggregateEmbeddingsConfig(
        config_path="../config",
        config_name='aggregate_embeddings',
        overrides=[
            f"mlflow_experiment={mlflow_experiment}",
            f"n_sample_posts_files={_override_value(n_sample_posts_files)}",
            f"n_sample_comments_files={_override_value(n_sample_comments_files)}",
            f"data_embeddings_to_aggregate={data_embeddings_to_aggregate}",
        ],
    )


# Test on a SAMPLE of files to check the process end-to-end.
# For scale: the full-data run log below reads 48 post files and 37 comment files.
config_test_sample_lc_false = _make_agg_config(
    mlflow_experiment_test,
    embeddings_lc_false,
    n_sample_posts_files=5,
    n_sample_comments_files=10,
)

# Test on ALL files, still logged to the test experiment.
config_test_full_lc_false = _make_agg_config(mlflow_experiment_test, embeddings_lc_false)

# Real runs on ALL files, one per lower-case setting.
config_full_lc_false = _make_agg_config(mlflow_experiment_full, embeddings_lc_false)
config_full_lc_true = _make_agg_config(mlflow_experiment_full, embeddings_lc_true)

# Tip: pprint(config_test_sample_lc_false.config_dict, indent=2) shows a full config.

In [10]:
# Put the flattened configs side by side (one row per config, in definition order)
# and display only the settings that actually differ between them.
configs_to_compare = (
    config_test_sample_lc_false,
    config_test_full_lc_false,
    config_full_lc_false,
    config_full_lc_true,
)
df_configs = pd.DataFrame([cfg_.config_flat for cfg_ in configs_to_compare])

# NaN/None counts as a distinct value, so "set in one config, null in another" shows up.
is_setting_different = df_configs.nunique(dropna=False) > 1
cols_with_diffs_config = df_configs.columns[is_setting_different]
df_configs[cols_with_diffs_config]

Unnamed: 0,comments_uuid,posts_uuid,subreddit_desc_uuid,mlflow_experiment,n_sample_posts_files,n_sample_comments_files
0,e66c5db26bd64f6da09c012eea700d0a,614a38e6690c4f3ba08725b1585b2ee9,614a38e6690c4f3ba08725b1585b2ee9,v0.3.2_use_multi_aggregates_test,5.0,10.0
1,e66c5db26bd64f6da09c012eea700d0a,614a38e6690c4f3ba08725b1585b2ee9,614a38e6690c4f3ba08725b1585b2ee9,v0.3.2_use_multi_aggregates_test,,
2,e66c5db26bd64f6da09c012eea700d0a,614a38e6690c4f3ba08725b1585b2ee9,614a38e6690c4f3ba08725b1585b2ee9,v0.3.2_use_multi_aggregates,,
3,a948e9fd651545f997430cddc6b529eb,a948e9fd651545f997430cddc6b529eb,a948e9fd651545f997430cddc6b529eb,v0.3.2_use_multi_aggregates,,


In [11]:
# Print every flattened setting of the sample/test config -- these are the keyword
# arguments that get passed to AggregateEmbeddings(**config_flat) further below.
pprint(config_test_sample_lc_false.config_flat, indent=2)

{ 'agg_comments_to_post_weight_col': None,
  'agg_post_comment_weight': 20,
  'agg_post_post_weight': 70,
  'agg_post_subreddit_desc_weight': 10,
  'agg_post_to_subreddit_weight_col': None,
  'bucket_name': 'i18n-subreddit-clustering',
  'col_comment_id': 'comment_id',
  'col_post_id': 'post_id',
  'col_subreddit_id': 'subreddit_id',
  'col_text_comment_word_count': 'comment_text_word_count',
  'col_text_post_word_count': 'text_word_count',
  'comments_folder_embeddings': 'df_vect_comments',
  'comments_uuid': 'e66c5db26bd64f6da09c012eea700d0a',
  'dataset_name': 'Top Subreddits (no Geo) + German Subs 2021-07-16, comments: '
                  '2021-07-09',
  'folder_comments_text_and_meta': 'comments/top/2021-07-09',
  'folder_posts_text_and_meta': 'posts/top/2021-07-16',
  'folder_subreddits_text_and_meta': 'subreddits/top/2021-07-16',
  'min_comment_text_len': 10,
  'mlflow_experiment': 'v0.3.2_use_multi_aggregates_test',
  'mlflow_tracking_uri': 'sqlite',
  'n_sample_comments_files'

In [None]:
# Intentional stop: `BREAK` is not defined anywhere in this notebook as shown, so
# evaluating it raises NameError and "Run All" halts here, before the dask client
# below gets started. Run the following cells manually.
BREAK

# Initialize a local dask client
so that we can see the progress/process for dask jobs

In [13]:
%%time

from dask.distributed import Client, LocalCluster

# Number of workers for the local dask cluster.
# With 64 CPUs present dask's default is 8 workers; 16 workers -> runs out of RAM
# per worker, so stay at 8.
N_DASK_WORKERS = 8

cluster = LocalCluster(n_workers=N_DASK_WORKERS)
client = Client(cluster)

CPU times: user 345 ms, sys: 173 ms, total: 517 ms
Wall time: 1.13 s


In [14]:
# URL of the dask dashboard for this client -- open it to watch task progress and
# worker memory while the aggregation jobs run.
client.dashboard_link

'http://127.0.0.1:8787/status'

# Run Full data with `lower_case=False`

The logic for sampling files and download/`caching` files locally lives in the `mlf` custom function.

Caching can save 9+ minutes compared with downloading the files from GCS every time.

In [None]:
# Intentional stop: `BREAK` is not defined anywhere in this notebook as shown, so
# evaluating it raises NameError and "Run All" halts here, before the long-running
# full-data aggregation jobs below. Run those cells manually.
BREAK

In [None]:
%%time

# If an MLflow run is still active (e.g. a previous attempt was interrupted),
# end it with status FAILED before starting a new job.
mlflow.end_run("FAILED")

# Release objects left over from a previous attempt before loading the full data.
# Each name is removed independently: with `del a; del b` inside a single try-block,
# a missing `a` raises NameError first and `b` is never deleted.
for _stale_name in ('job_agg1', 'd_dfs1'):
    globals().pop(_stale_name, None)
gc.collect()

# Full data, embeddings created with lower_case=False.
# The UTC timestamp in the run name keeps repeated runs distinguishable in MLflow.
job_agg1 = AggregateEmbeddings(
    run_name=f"full_lc_false-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
    **config_full_lc_false.config_flat
)
job_agg1.run_aggregation()

gc.collect()

07:02:28 | INFO | "== Start run_aggregation() method =="
07:02:28 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db"
07:02:29 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/aggregate_embeddings/2021-08-10_070229-full_lc_false-2021-08-10_070228"
07:02:29 | INFO | "  Saving config to local path..."
07:02:29 | INFO | "  Logging config to mlflow..."
07:02:29 | INFO | "-- Start _load_raw_embeddings() method --"
07:02:29 | INFO | "Loading subreddit description embeddings..."
07:02:30 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/9/614a38e6690c4f3ba08725b1585b2ee9/artifacts/df_vect_subreddits_description"


  0%|          | 0/4 [00:00<?, ?it/s]

07:02:30 | INFO | "  Reading 1 files"
07:02:31 | INFO | "       3,767 |  513 <- Raw vectorized subreddit description shape"
07:02:32 | INFO | "Loading POSTS embeddings..."
07:02:33 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/9/614a38e6690c4f3ba08725b1585b2ee9/artifacts/df_vect_posts"


  0%|          | 0/51 [00:00<?, ?it/s]

07:02:33 | INFO | "  Reading 48 files"
07:02:35 | INFO | "   1,649,929 |  514 <- Raw POSTS shape"
07:02:38 | INFO | "Loading COMMENTS embeddings..."
07:02:39 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/10/e66c5db26bd64f6da09c012eea700d0a/artifacts/df_vect_comments"


  0%|          | 0/38 [00:00<?, ?it/s]

07:02:39 | INFO | "  Reading 37 files"
07:02:39 | INFO | "  0:00:09.844584 <- Total raw embeddings load time elapsed"
07:02:39 | INFO | "-- Start _agg_comments_to_post_level() method --"
07:02:40 | INFO | "Getting count of comments per post..."
'<=' not supported between instances of 'NoneType' and 'int'"
07:02:55 | INFO | "Filtering which comments need to be averaged..."
07:04:23 | INFO | "      128,716 <- Comments that DON'T need to be averaged"
07:04:24 | INFO | "   19,072,138 <- Comments that need to be averaged"
07:04:24 | INFO | "No column to weight comments, simple mean for comments at post level"
07:06:14 | INFO | "      985,894 |  514 <- df_v_com_agg SHAPE"
07:06:14 | INFO | "  0:03:34.315443 <- Total comments to post agg loading time elapsed"
07:06:14 | INFO | "-- Start (df_posts_agg_b) _agg_posts_and_comments_to_post_level() method --"
07:06:15 | INFO | "DEFINE agg_posts_w_comments..."
07:06:16 | INFO | "  (Delayed('int-e8e786da-22e6-4893-a586-2b056bcc6e58'), 513) <- df_agg_

  0%|          | 0/11 [00:00<?, ?it/s]

08:36:46 | INFO | "** df_post_level_agg_b_post_and_comments **"
08:36:46 | INFO | "Saving locally..."
08:54:18 | INFO | "Logging artifact to mlflow..."
08:55:34 | INFO | "** df_post_level_agg_c_post_comments_sub_desc **"
08:55:34 | INFO | "Saving locally..."
09:16:16 | INFO | "     268	<- EXISTING Dask partitions"
09:38:02 | INFO | "Logging artifact to mlflow..."
09:39:58 | INFO | "** df_sub_level_agg_a_post_only **"
09:39:58 | INFO | "Saving locally..."
09:40:04 | INFO | "       1	<- EXISTING Dask partitions"
09:40:12 | INFO | "Logging artifact to mlflow..."
09:40:13 | INFO | "** df_sub_level_agg_a_post_only_similarity **"
09:40:13 | INFO | "Saving locally..."
09:40:13 | INFO | "Keeping index intact..."
09:40:13 | INFO | "Converting pandas to dask..."
09:40:13 | INFO | "   108.6 MB <- Memory usage"
09:40:13 | INFO | "       3	<- target Dask partitions	   40.0 <- target MB partition size"
09:40:16 | INFO | "Logging artifact to mlflow..."
09:40:18 | INFO | "** df_sub_level_agg_a_post_on

In [None]:
%%time

# If an MLflow run is still active (e.g. a previous attempt was interrupted),
# end it with status FAILED before starting a new job.
mlflow.end_run("FAILED")

# Release objects left over from a previous attempt before loading the full data.
# Each name is removed independently: with `del a; del b` inside a single try-block,
# a missing `a` raises NameError first and `b` is never deleted.
for _stale_name in ('job_agg2', 'd_dfs2'):
    globals().pop(_stale_name, None)
gc.collect()

# Full data, embeddings created with lower_case=True.
# The UTC timestamp in the run name keeps repeated runs distinguishable in MLflow.
job_agg2 = AggregateEmbeddings(
    run_name=f"full_lc_true-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
    **config_full_lc_true.config_flat
)
job_agg2.run_aggregation()

gc.collect()

In [18]:
# Manual cleanup: if an MLflow run is still active (e.g. after interrupting a job
# cell above), end it with status FAILED so it doesn't stay open.
mlflow.end_run("FAILED")

## Check output dfs

In [16]:
%%time

# Dataframes to pull from the job for inspection, keyed by their attribute name on
# `job_agg2` (requires the cell that creates `job_agg2` to have been run).
#  - df_v_*: raw embeddings as loaded
#  - df_subs_agg_* / df_posts_agg_*: aggregates; these don't get computed until the
#    run_aggregation() method gets called
# Not included: the *_meta dataframes and df_posts_agg_d (D doesn't exist yet).
names_dfs_to_check = (
    'df_v_sub', 'df_v_posts', 'df_v_comments',
    'df_subs_agg_a', 'df_subs_agg_b', 'df_subs_agg_c',
    'df_posts_agg_b', 'df_posts_agg_c',
)
# Looking attributes up by name keeps each key and its dataframe in sync
# (the parallel-tuple form was easy to misalign when editing).
d_dfs2 = {name_: getattr(job_agg2, name_) for name_ in names_dfs_to_check}

for k2, df_2 in tqdm(d_dfs2.items()):
    print(f"\n{k2}")

    if not hasattr(df_2, 'shape'):
        # Report non-dataframe values instead of silently skipping them.
        # NOTE(review): presumably this is None when an aggregate hasn't been
        # computed yet -- confirm against AggregateEmbeddings.
        print(f"{type(df_2).__name__} <- not a dataframe, nothing to check")
        continue

    # For dask dataframes the row count is lazy, so it shows up as a Delayed object.
    print(f"{df_2.shape} <- df shape")

    # pandas dataframes have no `npartitions`; skip the line for them instead of
    # failing and printing the shape a second time.
    if hasattr(df_2, 'npartitions'):
        print(f"{df_2.npartitions} <- dask partitions")

  0%|          | 0/8 [00:00<?, ?it/s]


df_v_sub
(Delayed('int-d9fa5280-15a7-4cb5-b1d7-29e87e661c24'), 513) <- df shape
1 <- dask partitions

df_v_posts
(Delayed('int-16cc4711-5229-4556-b28a-df08e562ad30'), 514) <- df shape
5 <- dask partitions

df_v_comments
(Delayed('int-64852139-ad50-4e52-a77b-aeb6d5b6155d'), 515) <- df shape
10 <- dask partitions

df_subs_agg_a
(Delayed('int-0267d822-1516-4476-a40f-6c6f0796d8dc'), 513) <- df shape
1 <- dask partitions

df_subs_agg_b
(Delayed('int-0a0377af-db4d-4ad5-8c9f-e1a3685df967'), 513) <- df shape
1 <- dask partitions

df_subs_agg_c
(Delayed('int-7685a498-cfc8-49e1-b91a-a797d70dbb88'), 513) <- df shape
1 <- dask partitions

df_posts_agg_b
(Delayed('int-0598994f-435b-450c-9565-bc875b98f4dc'), 514) <- df shape
21 <- dask partitions

df_posts_agg_c
(Delayed('int-2ed0f6f0-1efc-48e6-9621-7310bc0b6030'), 514) <- df shape
42 <- dask partitions
CPU times: user 33.4 ms, sys: 9.03 ms, total: 42.4 ms
Wall time: 37.6 ms
