# Purpose


2021-12-21:
We're going back to pandas now that I have the VM machine with a ton of RAM.

There might be some tweaks needed to batch a few subreddits at a time, but at least we can get more consistent state/progress than with `dask`.


# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import gc
import os
import logging
from logging import info
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import dask
from dask import dataframe as dd
from tqdm.auto import tqdm

import mlflow
import hydra

import subclu
from subclu.models.aggregate_embeddings import (
    AggregateEmbeddings, AggregateEmbeddingsConfig,
    load_config_agg_jupyter, get_dask_df_shape,
)
from subclu.models import aggregate_embeddings_pd

from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.utils.mlflow_logger import MlflowLogger, save_pd_df_to_parquet_in_chunks
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)


print_lib_versions([dask, hydra, mlflow, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
dask		v: 2021.06.0
hydra		v: 1.1.0
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.4.1


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Set sqlite database as MLflow URI

In [4]:
# use new class to initialize mlflow
mlf = MlflowLogger(tracking_uri='sqlite')
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db'

## Get list of experiments with new function

In [5]:
df_mlf_exp = mlf.list_experiment_meta(output_format='pandas')
df_mlf_exp.tail(10)

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
15,15,v0.4.0_use_multi_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/15,active
16,16,v0.4.0_use_multi_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/16,active
17,17,v0.4.0_use_multi_clustering_test,gs://i18n-subreddit-clustering/mlflow/mlruns/17,active
18,18,v0.4.0_use_multi_clustering,gs://i18n-subreddit-clustering/mlflow/mlruns/18,active
19,19,v0.4.1_mUSE_inference_test,gs://i18n-subreddit-clustering/mlflow/mlruns/19,active
20,20,v0.4.1_mUSE_inference,gs://i18n-subreddit-clustering/mlflow/mlruns/20,active
21,21,v0.4.1_mUSE_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/21,active
22,22,v0.4.1_mUSE_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/22,active
23,23,v0.4.1_mUSE_clustering_test,gs://i18n-subreddit-clustering/mlflow/mlruns/23,active
24,24,v0.4.1_mUSE_clustering,gs://i18n-subreddit-clustering/mlflow/mlruns/24,active


In [6]:
# df_mlf_exp.iloc[9:15, :]

## Get runs that we can use for embeddings aggregation jobs

In [7]:
%%time

df_mlf_runs =  mlf.search_all_runs(experiment_ids=[20])
df_mlf_runs.shape

CPU times: user 78 ms, sys: 4.94 ms, total: 83 ms
Wall time: 81.6 ms


(22, 55)

In [8]:
mask_finished = df_mlf_runs['status'] == 'FINISHED'
mask_output_over_1M_rows = (
    (df_mlf_runs['metrics.df_vect_posts_rows'] >= 1e5) |
    (df_mlf_runs['metrics.df_vect_comments'] >= 1e5)
)
# df_mlf_runs[mask_finished].shape

df_mlf_use_for_agg = df_mlf_runs[mask_finished & mask_output_over_1M_rows]
df_mlf_use_for_agg.shape

(6, 55)

In [9]:
cols_with_multiple_vals = df_mlf_use_for_agg.columns[df_mlf_use_for_agg.nunique(dropna=False) > 1]
print(f"{len(cols_with_multiple_vals):,.0f} <- columns with multiple values")

style_df_numeric(
    df_mlf_use_for_agg
    [cols_with_multiple_vals]
    .drop(['artifact_uri', 'end_time',
           # 'start_time',
           ], 
          axis=1)
    .dropna(axis='columns', how='all')
    .iloc[:, :30]
    ,
    rename_cols_for_display=True,
)

29 <- columns with multiple values


Unnamed: 0,run id,start time,metrics.vectorizing time minutes comments,metrics.df vect comments,metrics.vectorizing time minutes full function,metrics.total comment files processed,metrics.df vect posts cols,metrics.vectorizing time minutes subreddit meta,metrics.df vect posts,metrics.df vect posts rows,metrics.df vect subreddits description rows,metrics.df vect subreddits description cols,params.tokenize lowercase,params.col post id,params.n comment files slice start,params.tf batch inference rows,params.comments path,params.subreddits path,params.mlflow comments folder,params.n comment files slice end,params.col text comment,params.cols index comment,params.col comment id,params.col text comment word count,params.cols comment text to concat,tags.mlflow.source.git.commit,tags.mlflow.runName
0,a69d1b259875458283124ffdaa6efbb6,2021-12-21 12:27:00.341000+00:00,53.09,12150831.00,60.64,17.0,-,-,-,-,-,-,False,post_id,57.0,3600,comments/top/2021-12-14,,df_vect_comments,76.0,comment_body_text,,comment_id,comment_text_word_count,,a02f187a9f0b0ed69ab646b1411b45fb8ec2152a,comments_slice_3-2021-12-21_122659
1,e7ed11ccdc0b45abbdf3bf19605d4498,2021-12-21 11:16:11.807000+00:00,61.66,14194865.00,70.80,20.0,-,-,-,-,-,-,False,post_id,38.0,3600,comments/top/2021-12-14,,df_vect_comments,57.0,comment_body_text,,comment_id,comment_text_word_count,,a02f187a9f0b0ed69ab646b1411b45fb8ec2152a,comments_slice_2-2021-12-21_111611
2,54ba724869bf4ec9a2cad2a4f7eca048,2021-12-21 10:08:28.099000+00:00,58.32,14310574.00,67.72,20.0,-,-,-,-,-,-,False,post_id,19.0,3600,comments/top/2021-12-14,,df_vect_comments,38.0,comment_body_text,,comment_id,comment_text_word_count,,a02f187a9f0b0ed69ab646b1411b45fb8ec2152a,comments_slice_1-2021-12-21_100827
8,26c8fcf422a9403ba4a844c8e380bf7f,2021-12-21 08:03:01.919000+00:00,54.69,13751054.00,63.78,20.0,-,-,-,-,-,-,False,post_id,0.0,3800,comments/top/2021-12-14,,df_vect_comments,19.0,comment_body_text,,comment_id,comment_text_word_count,,a65c8a903d550ff5ba1f9d8ccd2f476b09bce6bb,comments_slice_0-2021-12-21_080301
11,e91b75b201c848db80a26f63f305ff35,2021-12-21 04:34:31.422000+00:00,-,-,-,31.0,515.00,0.50,11715818.00,355268.00,49705.00,514.00,True,,,2450,posts/top/2021-12-14,subreddits/top/2021-12-14,df_vect_posts_extra_text,,text,"['subreddit_name', 'subreddit_id', 'post_id']",post_id,text_word_count,"['flair_text', 'post_url_for_embeddings', 'text', 'ocr_inferred_text_agg_clean']",bb32d90d8f1c5b0cb8921141fe366019e991f238,posts_as_comments_batch_concat_text_lowercase-2021-12-21_043430
12,559a8f13264245b3923ab5699ef55bfe,2021-12-20 23:35:19.466000+00:00,103.77,-,209.57,41.0,515.00,0.51,15629958.00,495690.00,49705.00,514.00,False,,,2450,posts/top/2021-12-14,subreddits/top/2021-12-14,df_vect_posts_extra_text,,text,"['subreddit_name', 'subreddit_id', 'post_id']",post_id,text_word_count,"['flair_text', 'post_url_for_embeddings', 'text', 'ocr_inferred_text_agg_clean']",bb32d90d8f1c5b0cb8921141fe366019e991f238,posts_as_comments_batch_concat_text-2021-12-20_233519


# Load configs for aggregation jobs

`n_sample_comments_files` and `n_sample_posts_files` allow us to only load a few files at a time (e.g., 2 instead of 50) to test the process end-to-end.

---
Note that by default `hydra` is a cli tool. If we want to call use it in jupyter, we need to manually initialize configs & compose the configuration. See my custom function `load_config_agg_jupyter`. Also see:
- [Notebook with `Hydra` examples in a notebook](https://github.com/facebookresearch/hydra/blob/master/examples/jupyter_notebooks/compose_configs_in_notebook.ipynb).
- [Hydra docs, Hydra in Jupyter](https://hydra.cc/docs/next/advanced/jupyter_notebooks/).


In [10]:
mlflow_experiment_test = 'v0.4.1_mUSE_aggregates_test'
mlflow_experiment_full = 'v0.4.1_mUSE_aggregates'

root_agg_config_name = 'aggregate_embeddings_v0.4.1'

config_test_sample_lc_false = AggregateEmbeddingsConfig(
    config_path="../config",
    config_name=root_agg_config_name,
    overrides=[f"mlflow_experiment={mlflow_experiment_test}",
               'n_sample_posts_files=1',     # 
               'n_sample_comments_files=4',  # 6 is limit for logging unique counts at comment level
               # 'data_embeddings_to_aggregate=top_subs-2021_07_16-use_multi_lower_case_false',
              ]
)

config_full_lc_false = AggregateEmbeddingsConfig(
    config_path="../config",
    config_name=root_agg_config_name,
    overrides=[f"mlflow_experiment={mlflow_experiment_full}",
               'n_sample_posts_files=null', 
               'n_sample_comments_files=null',
               # 'data_embeddings_to_aggregate=top_subs-2021_07_16-use_multi_lower_case_false',
              ]
)

pprint(config_test_sample_lc_false.config_dict, indent=2)

{ 'aggregate_params': { 'agg_comments_to_post_weight_col': None,
                        'agg_post_comment_weight': 20,
                        'agg_post_post_weight': 70,
                        'agg_post_subreddit_desc_weight': 10,
                        'agg_post_to_subreddit_weight_col': None,
                        'min_comment_text_len': 4},
  'calculate_similarites': True,
  'data_embeddings_to_aggregate': { 'col_comment_id': 'comment_id',
                                    'col_post_id': 'post_id',
                                    'col_subreddit_id': 'subreddit_id',
                                    'col_text_comment_word_count': 'comment_text_word_count',
                                    'col_text_post_word_count': 'text_word_count',
                                    'comments_folder_embeddings': 'df_vect_comments',
                                    'comments_uuid': [ '26c8fcf422a9403ba4a844c8e380bf7f',
                                                       '54b

In [11]:
# config_full_lc_false.config_flat

In [12]:
df_configs = pd.DataFrame(
    [
        config_test_sample_lc_false.config_flat,
        # config_test_full_lc_false.config_flat,
        config_full_lc_false.config_flat,
        # config_full_lc_true.config_flat,
    ]
)

# We can't use (df_configs.nunique(dropna=False) > 1)
#  because when a col's content is a list or something unhashable, we get an error
#  so instead we'll check each column individually

# cols_with_diffs_config = df_configs.columns[df_configs.nunique(dropna=False) > 1]
cols_with_diffs_config = list()
for c_ in df_configs.columns:
    try:
        if df_configs[c_].nunique(dropna=False) > 1:
            cols_with_diffs_config.append(c_)
    except TypeError:
        cols_with_diffs_config.append(c_)
        

df_configs[cols_with_diffs_config]

Unnamed: 0,comments_uuid,mlflow_experiment,n_sample_posts_files,n_sample_comments_files
0,"[26c8fcf422a9403ba4a844c8e380bf7f, 54ba724869bf4ec9a2cad2a4f7eca048, e7ed11ccdc0b45abbdf3bf19605d4498, a69d1b259875458283124ffdaa6efbb6]",v0.4.1_mUSE_aggregates_test,1.0,4.0
1,"[26c8fcf422a9403ba4a844c8e380bf7f, 54ba724869bf4ec9a2cad2a4f7eca048, e7ed11ccdc0b45abbdf3bf19605d4498, a69d1b259875458283124ffdaa6efbb6]",v0.4.1_mUSE_aggregates,,


In [13]:
# pprint(config_test_sample_lc_false.config_flat, indent=2)

# Compare python download v. gsutil

It's much better to use `gsutil` because it can run jobs in parallel and reduce times by ~2x to 4x.
```bash
# gsutil done in ~1 minute
gsutil -m cp -r -n $remote_gs_path $local_f
# Download already complete for /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/20/a69d1b259875458283124ffdaa6efbb6/artifacts/artifacts/df_vect_comments/000000000072-800764_by_516.parquet, skipping download but will run integrity checks.
# CPU times: user 716 ms, sys: 529 ms, total: 1.24 s.4 MiB/s ETA 00:00:22         
# Wall time: 52.6 s


# GCS client in python + manual file checks: 4:22
08:13:59 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/20/a69d1b259875458283124ffdaa6efbb6/artifacts/df_vect_comments"
100%|###########################################| 17/17 [04:22<00:00, 15.47s/it]
```

In [94]:
# %%time

# # mlflow experiment artifacts 
# remote_key = "mlflow/mlruns/20/a69d1b259875458283124ffdaa6efbb6/artifacts"
# local_f = f"/home/jupyter/subreddit_clustering_i18n/data/local_cache/{remote_key}"
# remote_gs_path = f"gs://i18n-subreddit-clustering/{remote_key}"

# !gsutil -m cp -r -n $remote_gs_path $local_f

In [93]:
# %%time

# # POST text & meta
# remote_key = config_full_lc_false.config_flat['folder_posts_text_and_meta']
# print(remote_key)
# # Need to remove the last part of the local path otherwise we'll get duplicate subfolders:
# #. top/2021-12-14/2021-12-14 instead of top/2021-12-14
# local_f = f"/home/jupyter/subreddit_clustering_i18n/data/local_cache/{'/'.join(remote_key.split('/')[:-1])}"
# Path(local_f).mkdir(parents=True, exist_ok=True)
# remote_gs_path = f"gs://i18n-subreddit-clustering/{remote_key}"

# !gsutil -m cp -r -n $remote_gs_path $local_f

In [92]:
# %%time
# # comments Text & meta

# remote_key = config_full_lc_false.config_flat['folder_comments_text_and_meta']
# print(remote_key)
# remote_gs_path = f"gs://i18n-subreddit-clustering/{remote_key}"
# local_f = f"/home/jupyter/subreddit_clustering_i18n/data/local_cache/{'/'.join(remote_key.split('/')[:-1])}"


# !gsutil -m cp -r -n $remote_gs_path $local_f

In [52]:
gc.collect()

22

# Run test on data subset, `lower_case=False`

9 minutes is such a long time... try .groupby().pipe() instead.
We won't get a progress bar, but if it's much faster then it's worth the loss of progress bar.

```bash
# using old .groupby() with loop.
09:59:49 | INFO | "Comments per post summary:
  comment_count_  posts_count  percent_of_posts  cumulative_percent_of_posts
0            0.0       440953          0.966917                     0.966917
1            1.0         2929          0.006423                     0.973340
2            2.0         2202          0.004829                     0.978169
3            3.0         1754          0.003846                     0.982015
4             4+         8202          0.017985                     1.000000"
09:59:49 | INFO | "Create df with weights for weighted-average calculation"
09:59:52 | INFO | "Get weighted average for POST + COMMENT + SUBREDDIT-META"
100%|##################################| 456040/456040 [09:11<00:00, 826.25it/s]
10:09:27 | INFO | "  (456040, 512) <- df_agg_posts_w_sub.shape (only posts with comments)"
10:09:27 | INFO | "Re-append multi-index so it's the same in original and new output"
10:09:35 | INFO | "Check that post-ID is unique..."
10:09:36 | INFO | "  (456040, 515) <- df_posts_agg_c shape after aggregation"
10:09:36 | INFO | "  0:09:47.922873 <- Total posts+comments+subs agg time elapsed"
10:09:38 | INFO | "RAM stats:
{'memory_used_percent': '6.41%', 'memory_used': '124,093'}"

```

In [38]:
BREAK

In [95]:
%%time

try:
    job_agg_test._send_log_file_to_mlflow()
    mlflow.end_run("FAILED")
    # run setup_logging() to remove logging to the file of a failed job
    setup_logging()
    
    del job_agg_test
except NameError:
    pass
gc.collect()

mlflow.end_run("FAILED")


job_agg_test = aggregate_embeddings_pd.AggregateEmbeddings(
    run_name=f"agg_test_lc_false_pd-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
    **config_test_sample_lc_false.config_flat
)
job_agg_test.run_aggregation()

gc.collect()

09:55:53 | INFO | "Logging log-file to mlflow..."
09:55:54 | INFO | "== Start run_aggregation() method =="
09:55:54 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db"
09:55:55 | INFO | "host_name: djb-100-2021-04-28-djb-eda-german-subs"
09:55:55 | INFO | "cpu_count: 80"
09:55:57 | INFO | "RAM stats:
{'memory_used_percent': '5.32%', 'memory_total': '1,937,274', 'memory_used': '103,089', 'memory_free': '1,592,050'}"
09:55:57 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/aggregate_embeddings/2021-12-23_095557-agg_test_lc_false_pd-2021-12-23_095553"
09:55:57 | INFO | "  Saving config to local path..."
09:55:57 | INFO | "  Logging config to mlflow with joblib..."
09:55:58 | INFO | "  Logging config to mlflow with YAML..."
09:55:58 | INFO | "-- Start _load_raw_embeddings() method --"
09:55:58 | INFO | "Loading subreddit description embeddings..."
09:56:0

KeyError: "['manual_topic_and_rating'] not in index"

In [96]:
job_agg_test

<subclu.models.aggregate_embeddings_pd.AggregateEmbeddings at 0x7f134e293050>

In [101]:
%%time

col_weights = '_col_method_weight_'

info(f"Create df with weights for weighted-average calculation")
df_posts_for_weights = pd.concat(
    [
        job_agg_test.df_v_posts.assign(
            **{col_weights: job_agg_test.agg_post_post_weight}
        ),
        job_agg_test.df_v_com_agg.assign(
            **{col_weights: job_agg_test.agg_post_comment_weight}
        ),
        # For each post: add one row of subreddit metadata
        (
            job_agg_test.df_v_posts[job_agg_test.l_ix_post_level]
            .merge(
                job_agg_test.df_v_sub,
                how='left',
                left_on=job_agg_test.l_ix_sub_level,
                right_on=job_agg_test.l_ix_sub_level,
            )
        ).assign(
            **{col_weights: job_agg_test.agg_post_subreddit_desc_weight}
        ),
    ]
)
df_posts_for_weights.shape

10:40:54 | INFO | "Create df with weights for weighted-average calculation"


CPU times: user 1.37 s, sys: 982 ms, total: 2.35 s
Wall time: 2.35 s


(927167, 516)

### piping functions
https://pandas.pydata.org/pandas-docs/dev/user_guide/groupby.html#piping-function-calls

In [106]:
df_posts_for_weights.head(1000).groupby('post_id')[job_agg_test.l_embedding_cols]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f1d5c037550>

In [103]:
%%time
# the time to compare is ~9 minutes


(
    df_posts_for_weights.head(1000).groupby('post_id')
    .pipe(
        lambda grp: np.average(
            grp[job_agg_test.l_embedding_cols],
            weights=grp[col_weights],
            axis=0,
        )
    )
)

  return array(a, dtype, copy=False, order=order, subok=True)


TypeError: can't multiply sequence by non-int of type 'str'

In [112]:
9 / 20

0.45

In [110]:
%%time
# the time to compare is ~9 minutes for full data

n_sample_ = int(len(df_posts_for_weights) / 20)
print(f"{n_sample_:,.0f}")
(
    df_posts_for_weights.head(n_sample_)
    .groupby('post_id')
    .apply(
        lambda grp: np.average(
            grp[job_agg_test.l_embedding_cols],
            weights=grp[col_weights],
            axis=0,
        )
    )
)

46,358
CPU times: user 50.2 s, sys: 359 ms, total: 50.6 s
Wall time: 50.6 s


post_id
t3_q7nxrt    [0.005173432175070047, -0.04249882698059082, 0.06282028555870056, 0.004041563719511032, 0.020344318822026253, 0.05887889489531517, 0.025653112679719925, 0.01699175499379635, 0.009449150413274765, 0.011669213883578777, 0.0428733564913272...
t3_q7nyfn    [-0.059484802186489105, 0.04459403455257416, -0.08658625185489655, -0.04349858686327934, -0.0011724033392965794, -0.044737089425325394, 0.03139304742217064, 0.0018960057059302926, -0.03238798677921295, -0.05040803924202919, 0.0536521039...
t3_q7nyq3    [-0.02391538955271244, 0.002337999641895294, 0.04435169696807861, 0.018625501543283463, 0.024183524772524834, 0.07496833056211472, -0.027850400656461716, 0.046770427376031876, -0.04380057752132416, -0.014158650301396847, 0.0424765311181...
t3_q7o021    [-0.020432401448488235, -0.0282297320663929, 0.01715364307165146, 0.002324985107406974, -0.037785910069942474, 0.05924432352185249, -0.05497536063194275, 0.039222147315740585, -0.05581115558743477, -0.05569520592689514, 

# Run Full data with `lower_case=False`

The logic for sampling files and download/`caching` files locally lives in the `mlf` custom function.

Caching can save 9+ minutes if we try to download the files from GCS every time.

In [14]:
keys_to_check_in_config = ['mlflow_experiment', 'n_sample_posts_files', 'n_sample_comments_files', 'aggregate_params', 'calculate_similarites']

for k_ in keys_to_check_in_config:
    v_ = config_full_lc_false.config_dict.get(k_)
    if isinstance(v_, dict):
        print(f"\n{k_}:")
        [print(f"  {k2_}: \t{v2_}") for k2_, v2_ in v_.items()]
    else:
        print(f"{k_}: \t{v_}")

mlflow_experiment: 	v0.4.1_mUSE_aggregates
n_sample_posts_files: 	None
n_sample_comments_files: 	None

aggregate_params:
  min_comment_text_len: 	4
  agg_comments_to_post_weight_col: 	None
  agg_post_to_subreddit_weight_col: 	None
  agg_post_post_weight: 	70
  agg_post_comment_weight: 	20
  agg_post_subreddit_desc_weight: 	10
calculate_similarites: 	True


In [16]:
pprint(config_full_lc_false.config_dict)

{'aggregate_params': {'agg_comments_to_post_weight_col': None,
                      'agg_post_comment_weight': 20,
                      'agg_post_post_weight': 70,
                      'agg_post_subreddit_desc_weight': 10,
                      'agg_post_to_subreddit_weight_col': None,
                      'min_comment_text_len': 4},
 'calculate_similarites': True,
 'data_embeddings_to_aggregate': {'col_comment_id': 'comment_id',
                                  'col_post_id': 'post_id',
                                  'col_subreddit_id': 'subreddit_id',
                                  'col_text_comment_word_count': 'comment_text_word_count',
                                  'col_text_post_word_count': 'text_word_count',
                                  'comments_folder_embeddings': 'df_vect_comments',
                                  'comments_uuid': ['26c8fcf422a9403ba4a844c8e380bf7f',
                                                    '54ba724869bf4ec9a2cad2a4f7eca048',

In [21]:
BREAK

In [None]:
%%time

try:
    job_agg1._send_log_file_to_mlflow()
    mlflow.end_run("FAILED")
    # run setup_logging() to remove logging to the file of a failed job
    setup_logging()
    
    del job_agg1
    del d_dfs1
except NameError:
    pass

gc.collect()
mlflow.end_run("FAILED")


try:
    job_agg1 = aggregate_embeddings_pd.AggregateEmbeddings(
        run_name=f"agg_full_lc_false_pd-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
        **config_full_lc_false.config_flat
    )
    job_agg1.run_aggregation()
except Exception as e:
    print(e)
    job_agg1._send_log_file_to_mlflow()
    mlflow.end_run("FAILED")
    # run setup_logging() to remove logging to the file of a failed job
    setup_logging()

gc.collect()

11:20:55 | INFO | "== Start run_aggregation() method =="
11:20:55 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db"
11:20:55 | INFO | "host_name: djb-100-2021-04-28-djb-eda-german-subs"
11:20:55 | INFO | "cpu_count: 80"
11:20:55 | INFO | "RAM stats:
{'memory_used_percent': '0.10%', 'memory_total': '1,937,274', 'memory_used': '2,000', 'memory_free': '1,690,826'}"
11:20:55 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/aggregate_embeddings/2021-12-23_112055-agg_full_lc_false_pd-2021-12-23_112054"
11:20:55 | INFO | "  Saving config to local path..."
11:20:55 | INFO | "  Logging config to mlflow with joblib..."
11:20:56 | INFO | "  Logging config to mlflow with YAML..."
11:20:56 | INFO | "-- Start _load_raw_embeddings() method --"
11:20:56 | INFO | "Loading subreddit description embeddings..."
11:20:58 | INFO | "Local folder to download artifact(s):
  

In [None]:
job_agg1._send_log_file_to_mlflow()
gc.collect()

In [None]:
gc.collect()

# Run full data, `lower_case=True`

Looks like the problem I ran into with the file being corrupted might've been a problem with downloading the file(s). Fix: delete the local cache and download the files again.

In [None]:
BREAK

In [None]:
# %%time

# mlflow.end_run("FAILED")
# gc.collect()
# try:
#     # run setup_logging() to remove logging to the file of a failed job
#     setup_logging()
    
#     del job_agg2
#     del d_dfs2
# except NameError:
#     pass
# gc.collect()

# job_agg2 = AggregateEmbeddings(
#     run_name=f"full_lc_true-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
#     **config_full_lc_true.config_flat
# )
# job_agg2.run_aggregation()

# gc.collect()

In [23]:
mlflow.end_run("FAILED")

# Debugging

In [None]:
BREAK

### Check computed dfs

In [None]:
150 * 4

In [None]:
for k_, v_ in {k_: v_ for k_, v_ in vars(job_agg1).items() if 'df_' in k_}.items():
    print(f"\n{k_}")
    try:
        print(f"  {v_.shape}")
        display(v_.iloc[:8, :10])
        if not ('meta' in k_):
            print(v_.info())
    except Exception as e:
        pass

In [42]:
# job_agg_test._save_and_log_aggregate_and_similarity_dfs()

In [38]:
mlflow.end_run("FAILED")
gc.collect()

2794