# Purpose

**2022-11-07: v0.6.1**
<br> Use `dask.delayed` to run aggregation slightly faster (~3x). Still not fast enough, but better.
Note that I created a 2nd notebook because the first one timed out, so it's not clear whether the other notebook is still running or not.

**2022-08-15: v0.6.0**
<br>Test `dask.delayed` to run aggregation of multiple subreddits in parallel.
With the new project we expect to aggregate posts for over 300k subreddits. For most of the process, each subreddit can be processed independently of other subreddits, so it makes sense to try to split up the work so we can speed things up.

**2022-06-29: v0.5.0**
<br>Because we embedded post & comment text as a single embedding and we didn't use MLflow to create those embeddings, it's easier to run the embeddings aggregation in this notebook rather than to re-use or re-write the old `AggregateEmbeddings` class.

Provenance:
* `v0.4.1 / djb_03.01-2021-12-aggregate_v041_posts_and_comments_pandas.ipynb`

# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from datetime import datetime, timedelta
import gc
import os
import logging
from logging import info
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import dask
from dask import dataframe as dd
from tqdm.auto import tqdm

import mlflow
import hydra

import subclu
from subclu.utils.hydra_config_loader import LoadHydraConfig
from subclu.models.aggregate_embeddings import (
    AggregateEmbeddings, AggregateEmbeddingsConfig,
    load_config_agg_jupyter, get_dask_df_shape,
)
from subclu.models import aggregate_embeddings_pd

from subclu.utils import set_working_directory, get_project_subfolder
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric,
    elapsed_time,
)
from subclu.utils.mlflow_logger import MlflowLogger, save_pd_df_to_parquet_in_chunks
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)

from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_gcs import LoadSubredditsGCS


print_lib_versions([dask, hydra, mlflow, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
dask		v: 2021.06.0
hydra		v: 1.1.0
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.6.1


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet for any plots in this notebook
plt.style.use('default')

# Project helpers imported from subclu.utils.eda.
# NOTE(review): presumably these configure the log format seen in cell outputs and
# the pandas/notebook display options -- confirm in subclu/utils/eda.py
setup_logging()
notebook_display_config()

# Set Local model paths

In [4]:
# Each manual run writes to its own local folder, keyed by the UTC start time
manual_model_timestamp = format(datetime.utcnow(), '%Y-%m-%d_%H%M%S')
path_this_model = get_project_subfolder(
    f"data/models/aggregate_embeddings/manual_v061_{manual_model_timestamp}"
)
# Create the folder (and any missing parents); a no-op if it already exists
path_this_model.mkdir(parents=True, exist_ok=True)
path_this_model

PosixPath('/home/jupyter/subreddit_clustering_i18n/data/models/aggregate_embeddings/manual_v061_2022-11-08_035115')

# CREATE MLFLOW EXPERIMENTS!!
Before kicking off these jobs, make sure to create mlflow experiments for embeddings!!!

Otherwise we might end up with broken MLflow SQLite databases

master experiment list here:
- `subclu/utils/mlflow_logger.py`
    - `MlflowLogger.initialize_experiment_names` (class.method)
    
Example:
```python
l_experiments = [
    ...
    
    'v0.6.1_mUSE_aggregates_test',
    'v0.6.1_mUSE_aggregates',
    'v0.6.1_mUSE_clustering_test',
    'v0.6.1_mUSE_clustering',
    'v0.6.1_nearest_neighbors',
]
```

# Load config for embeddings aggregation

For v0.6.1 embeddings I didn't use mlflow to track the embeddings inference. We'll need to get them from these folders in GCS.
<br>For example:
- [Subreddit metadata](https://console.cloud.google.com/storage/browser/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555)
    - `i18n-subreddit-clustering/i18n_topic_model_batch/runs/2022xxxx/subreddits/text/embedding/2022-xx-xx_084555`
- [Post + Comment Text (already combined)](https://console.cloud.google.com/storage/browser/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925)
    - `i18n-subreddit-clustering/i18n_topic_model_batch/runs/2022xxxx/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-xx-xx_091925`

---

### Configs to update:


- `subclu/configs/`
    - `data_text_and_metadata/` <- This is where the raw metadata & text gets pulled
        - `vX.x.x_model.yaml`
    - `data_embeddings_to_aggregate/` <- This is where we pull the embeddings for a) subreddit meta & b) post+comments text
        - `v0.6.1_2022-11-07_muse_lower_case_false.yaml`
    - `aggregate_params/`  <- Parameters for aggregation weights
        - `v0.6.1_agg.yaml`
    - `aggregate_embeddings_v0.6.1.yaml`  <- File that references all the configs above

In [5]:
# Compose the aggregation config. The override sets `agg_style`, which the
# post-level aggregation cell reads to choose between the serial and dask.delayed paths.
l_config_overrides_ = ["agg_style=dask_delayed"]
cfg_agg_embeddings = LoadHydraConfig(
    config_name='aggregate_embeddings_v0.6.1.yaml',
    config_path="../config",
    overrides=l_config_overrides_,
)
print(cfg_agg_embeddings.config_dict.keys())

dict_keys(['data_text_and_metadata', 'data_embeddings_to_aggregate', 'aggregate_params', 'bucket_output', 'mlflow_tracking_uri', 'mlflow_experiment', 'n_sample_subreddits', 'n_sample_posts_files', 'n_sample_comments_files', 'agg_style'])


In [6]:
# cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']

In [7]:
# Pretty-print the config: nested sections are indented one level, scalars print inline
for k_, v_ in cfg_agg_embeddings.config_dict.items():
    if not isinstance(v_, dict):
        print(f"{k_}: {v_}")
        continue

    print(f"{k_}:")
    for k2_, v2_ in v_.items():
        print(f"    {k2_}: {v2_}")

data_text_and_metadata:
    dataset_name: v0.6.1 inputs. ~110k seed subreddits, ~340k with 3+ posts, ~700k total subreddits
    bucket_name: i18n-subreddit-clustering
    folder_subreddits_text_and_meta: i18n_topic_model_batch/runs/20221107/subreddits/text
    folder_posts_text_and_meta: i18n_topic_model_batch/runs/20221107/posts
    folder_comments_text_and_meta: i18n_topic_model_batch/runs/20221107/comments
    folder_post_and_comment_text_and_meta: i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all
data_embeddings_to_aggregate:
    bucket_embeddings: i18n-subreddit-clustering
    post_and_comments_folder_embeddings: i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017
    subreddit_desc_folder_embeddings: i18n_topic_model_batch/runs/20221107/subreddits/text/embedding/2022-11-07_074632
    col_subreddit_id: subreddit_id
aggregate_params:
    min_post_and_comment_text_len: 3
    agg_post_post_and_comment_wei

# Download post embeddings with `gsutil`

gsutil can be 5x+ faster than the python library(!)
However, it can sometimes lock up the VM and crash jupyter :/


- https://cloud.google.com/storage/docs/gsutil/commands/cp#description
- https://cloud.google.com/storage/docs/wildcards

- If you have a large number of files to transfer, you can perform a parallel multi-threaded/multi-processing copy using the top-level gsutil `-m` option
- Use the `-n` (no-clobber) option to prevent overwriting the content of existing files, e.g. to download files from a bucket without clobbering the data already in your local directory.
- Use the `-r` option to copy an entire directory tree.

- `-o`: Set/override values in the boto configuration, in the format `<section>:<name>=<value>`:
    - Examples: `-o GSUtil:parallel_thread_count=20 -o GSUtil:parallel_process_count=20`

    
```bash
gsutil -o GSUtil:parallel_thread_count=20 -o GSUtil:parallel_process_count=20 -m cp -r -n gs://i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017 \
    /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding
```

---

```bash
# Try w/o parallel option (works, but pretty slow)
gsutil cp -r -n gs://i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017 \
    /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding

    
# ===============
# Try batches using wildcards/regex
# ===

#  Batches of 100 & 50 still break
# ===
gsutil -m cp -r -n gs://i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017/0000000001*.parquet \
    /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding

/ [0/201 files][ 21.5 GiB/150.3 GiB]  14% Done 776.4 MiB/s ETA 00:02:50


# Batches of ~20 seem to work ok, but it's better to use the flags and let gsutil manage the threads itself
# ===
gsutil -m cp -r -n gs://i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017/0000000000[0-2]*.parquet \
    /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding
```


In [8]:
%%time

# Folders (keys inside the bucket) that hold the two sets of embeddings to aggregate
cfg_embeddings_ = cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']

gcs_sub_embeddings = cfg_embeddings_['subreddit_desc_folder_embeddings']
print(gcs_sub_embeddings)
gcs_post_comment_embeddings = cfg_embeddings_['post_and_comments_folder_embeddings']
print(gcs_post_comment_embeddings, '\n')


# gsutil is usually faster than the python library.
remote_bucket_and_key = f"{cfg_embeddings_['bucket_embeddings']}/{gcs_post_comment_embeddings}"
remote_gs_path = f'gs://{remote_bucket_and_key}'

# Copy into the PARENT of the remote folder: `gsutil cp -r` re-creates the last
# folder itself, so keeping it in the local path would give duplicate subfolders,
#  e.g. top/2021-12-14/2021-12-14 instead of top/2021-12-14
remote_parent_key_ = remote_bucket_and_key.rsplit('/', 1)[0]
local_f = f"/home/jupyter/subreddit_clustering_i18n/data/local_cache/{remote_parent_key_}"
Path(local_f).mkdir(parents=True, exist_ok=True)

# print(f"Remote path:\n  {remote_gs_path}")
# print(f"Local path:\n  {local_f}")

# Wildcard patterns to download the parquet files in small manual batches.
#  Doing it manually is a pain & not worth it: use the limits for parallel
#  threads & processes instead! To cover files 2xx as well, add 2 to the outer tuple.
l_parallel_file_regex = [
    f"000000000{tens_}[{lo_}-{lo_ + 1}]*.parquet"
    for tens_ in (0, 1)
    for lo_ in range(0, 10, 2)
]
# Manual alternative (one gsutil call per pattern):
# for rx_ in l_parallel_file_regex:
#     print(
#         f"\n=== {rx_} ===\ngsutil -m cp -r -n {remote_gs_path}/{rx_}  {local_f} \n"
#     )
#     ## !gsutil -m cp -r -n $remote_gs_path/$rx_ $local_f

# NOTE: best to run this command from a separate terminal b/c it can crash a jupyter notebook
#  when loading many large files (30+)
# Adding flags to limit thread & process count to ~20 fixes most problems
print(f"gsutil -o GSUtil:parallel_thread_count=20 -o GSUtil:parallel_process_count=20 -m cp -r -n {remote_gs_path} {local_f} \n")

# !gsutil -o GSUtil:parallel_thread_count=20 -o GSUtil:parallel_process_count=20 -m cp -r -n $remote_gs_path $local_f

i18n_topic_model_batch/runs/20221107/subreddits/text/embedding/2022-11-07_074632
i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017 

gsutil -o GSUtil:parallel_thread_count=20 -o GSUtil:parallel_process_count=20 -m cp -r -n gs://i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017 /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding 

CPU times: user 322 µs, sys: 0 ns, total: 322 µs
Wall time: 291 µs


# Start MLflow & Log base params

In [9]:
# Create the project's MLflow helper, using the tracking URI from the loaded config
mlf = MlflowLogger(tracking_uri=cfg_agg_embeddings.config_dict['mlflow_tracking_uri'])

In [10]:
# Experiment name comes from the config, e.g.:
mlflow_experiment = cfg_agg_embeddings.config_dict['mlflow_experiment']
# 'v0.6.0_mUSE_aggregates', 'v0.6.0_mUSE_aggregates_test'


# Start of the whole aggregation; the final cell uses it to log total elapsed time
t_start_agg_embed = datetime.utcnow()
info(f"== Start run_aggregation() method ==")


info(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
# Order matters: pick the experiment first, then open the run that every later
# mlflow.log_* call in this notebook writes to (it stays open until mlflow.end_run()).
mlf.set_experiment(mlflow_experiment)
mlflow.start_run()
# Project helpers that record run context.
# NOTE(review): judging by their names and the log output they record the git hash,
# host name, CPU count and RAM stats -- confirm in subclu/utils/mlflow_logger.py
mlf.add_git_hash_to_active_run()
mlf.set_tag_hostname(key='host_name')
mlf.log_param_hostname(key='host_name')
mlf.log_cpu_count()
mlf.log_ram_stats(param=True, only_memory_used=False)

03:51:17 | INFO | "== Start run_aggregation() method =="
03:51:17 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db"
03:51:17 | INFO | "host_name: djb-100-2021-04-28-djb-eda-german-subs"
03:51:17 | INFO | "cpu_count: 96"
03:51:17 | INFO | "RAM stats:
{'memory_used_percent': '36.28%', 'memory_total': '1,444,961', 'memory_used': '524,198', 'memory_free': '759,036'}"


{'memory_total': 1444961,
 'memory_used_percent': 0.3627765732085503,
 'memory_used': 524198,
 'memory_free': 759036}

In [11]:
# Set weights: the config values are divided by 100 to normalize them
cfg_agg_params_ = cfg_agg_embeddings.config_dict['aggregate_params']
WEIGHT_POST_COMMENT = cfg_agg_params_['agg_post_post_and_comment_weight'] / 100
WEIGHT_SUB_META = cfg_agg_params_['agg_post_subreddit_desc_weight'] / 100

weight_total_ = WEIGHT_POST_COMMENT + WEIGHT_SUB_META
print(weight_total_)
# The two weights must add up to 1. Compare with a tolerance instead of `==`:
# both values come from float division, so their sum may be off from 1.0 by a
# rounding error even when the configured weights are valid.
# Raise explicitly (rather than `assert`) so the check is never skipped.
if not np.isclose(weight_total_, 1.0):
    raise ValueError(
        f"Aggregation weights must sum to 1.0, got: {weight_total_}"
        f" (post_and_comment={WEIGHT_POST_COMMENT}, subreddit_desc={WEIGHT_SUB_META})"
    )


mlflow.log_params(
    {
        'embeddings_bucket': cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings'],
        'embeddings_subreddit_path': gcs_sub_embeddings,
        'embeddings_post_and_comments_path': gcs_post_comment_embeddings,
        'weight_post_and_comments': WEIGHT_POST_COMMENT,
        'weight_subreddit_meta': WEIGHT_SUB_META,
    }
)
# Also log every top-level config value that is a plain string.
for k_, v_ in cfg_agg_embeddings.config_dict.items():
    if isinstance(v_, str):
        try:
            mlflow.log_param(k_, v_)
        except Exception as e:
            # Best-effort on purpose: a param that can't be logged should not
            # abort the run, so report the error and keep going.
            print(e)

1.0


# Load data

NOTE: Manually end run if it failed

`mlflow.end_run("FAILED")`

In [12]:
%%time
# Start of data loading; the post+comment loading cell uses it to log total load time
t_start_data_load_ = datetime.utcnow()

# Loader for the subreddit-description embeddings: read every file, as pandas,
# and ask the loader to check that `subreddit_id` is unique.
cfg_embeddings_ = cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']
subs_v = LoadSubredditsGCS(
    bucket_name=cfg_embeddings_['bucket_embeddings'],
    gcs_path=gcs_sub_embeddings,
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='subreddit_id',
    df_format='pandas',
    unique_check=True,
    verbose=True,
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
)
# Make sure the files are in the local cache before reading them into one df
subs_v.local_cache()
df_v_subs = subs_v.read_as_one_df()

# Record the shape so runs with different inputs can be told apart in MLflow
r_subs, c_subs = df_v_subs.shape
mlflow.log_metrics({"df_v_subs-rows": r_subs, "df_v_subs-cols": c_subs})
print(f"{r_subs:,.0f} rows, {c_subs:,.0f} cols")

03:51:19 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/subreddits/text/embedding/2022-11-07_074632"
03:51:19 | INFO | "  7 <- Files matching prefix"
03:51:19 | INFO | "  7 <- Files to check"
03:51:19 | INFO | "    000000000000-100179_by_514.parquet <- File already exists, not downloading"
03:51:19 | INFO | "    000000000001-233442_by_514.parquet <- File already exists, not downloading"
03:51:19 | INFO | "    000000000002-448032_by_514.parquet <- File already exists, not downloading"
03:51:19 | INFO | "    2022-11-07_07-46-32_vectorize_text.log <- File already exists, not downloading"
03:51:19 | INFO | "  Files already cached: 4"
03:51:19 | INFO | "  Files already downloaded."
03:51:19 | INFO | "  df format: pandas"
03:51:24 | INFO | "  Checking ID uniqueness..."


781,653 rows, 514 cols
CPU times: user 4.35 s, sys: 4.2 s, total: 8.55 s
Wall time: 6.11 s


In [13]:
%%time

# Loader for the combined post+comment embeddings (one row per post_id).
# The uniqueness check is turned off here (unique_check=False).
pc_v = LoadSubredditsGCS(
    bucket_name=cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings'],
    gcs_path=gcs_post_comment_embeddings,
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='post_id',
    df_format='pandas',
    unique_check=False,
    verbose= True,
    
    # Config-driven file sampling, useful for test runs; None loads every file
    n_sample_files=cfg_agg_embeddings.config_dict['n_sample_posts_files'],  # None,
    n_files_slice_start=None,  # None,
    n_files_slice_end=None,  # None, 
)
# Make sure the files are in the local cache before reading them
pc_v.local_cache()

# Read all cached files into a single in-memory pandas df
df_v_pc = pc_v.read_as_one_df()
r_pc, c_pc = df_v_pc.shape
mlflow.log_metrics(
    {
        f"df_v_post_comments-rows": r_pc,
        f"df_v_post_comments-cols": c_pc,
    }
)
print(f"{r_pc:,.0f} rows, {c_pc:,.0f} cols")

# Total load time (subreddits + posts), measured from the start of the previous cell.
# Dividing a timedelta by timedelta(minutes=1) gives the duration in minutes as a float.
t_data_load = elapsed_time(start_time=t_start_data_load_, log_label='Data Loading Time', verbose=True)
mlflow.log_metric('time_fxn-data_loading_time',
                  t_data_load / timedelta(minutes=1)
                  )
mlf.log_ram_stats(only_memory_used=True)

03:51:25 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017"
03:51:25 | INFO | "  201 <- Files matching prefix"
03:51:25 | INFO | "  201 <- Files to check"
03:51:25 | INFO | "    000000000000-317975_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000001-304046_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000002-234112_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000003-365746_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000004-284006_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000005-344157_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000006-288452_by_515.parquet <- 

53,597,817 rows, 515 cols


03:55:53 | INFO | "RAM stats:
{'memory_used_percent': '49.77%', 'memory_used': '719,153'}"


CPU times: user 7min 26s, sys: 16min 43s, total: 24min 10s
Wall time: 4min 28s


{'memory_used_percent': 0.49769716968139627, 'memory_used': 719153}

# Set weights & create copy dfs for new weights

In [14]:
# ID columns at each aggregation level: subreddit-level, and post-level (= subreddit + post)
l_ix_sub_level = ['subreddit_id', 'subreddit_name']
l_ix_post_level = [*l_ix_sub_level, 'post_id']

# Embedding columns are identified by their name prefix
l_embedding_cols = [col_ for col_ in df_v_pc.columns if col_.startswith('embeddings_')]
print(len(l_embedding_cols))

512


In [15]:
%%time
def _sample_embeddings_match(df_a, df_b, n_rows=1000):
    """Return True if the embedding columns of the first `n_rows` rows are all close.

    Selects the columns by name (`l_embedding_cols`) instead of by hard-coded
    positions, so the check covers exactly the columns that get weighted below.
    Rows are sliced before columns so that only a small sample gets copied.
    """
    return np.allclose(
        df_a.iloc[:n_rows][l_embedding_cols],
        df_b.iloc[:n_rows][l_embedding_cols],
    )


# Work on copies so the original (unweighted) embeddings remain available;
# df_v_subs is used again, unweighted, in the subreddit-level aggregation.
df_v_pc_weighted = df_v_pc.copy()

df_v_subs_weighted = df_v_subs.copy()

# should be True b/c they're copies
print(_sample_embeddings_match(df_v_pc_weighted, df_v_pc))
print(_sample_embeddings_match(df_v_subs_weighted, df_v_subs))

# apply weight to all posts & subreddit meta at once (vectorized)
info(f"Initializing weighted SUBS meta")
df_v_subs_weighted[l_embedding_cols] = df_v_subs_weighted[l_embedding_cols] * WEIGHT_SUB_META

info(f"Initializing weighted POSTS embeddings")
df_v_pc_weighted[l_embedding_cols] = df_v_pc_weighted[l_embedding_cols] * WEIGHT_POST_COMMENT

# NOW they shouldn't be equal (Should be False)
print(_sample_embeddings_match(df_v_pc_weighted, df_v_pc))
print(_sample_embeddings_match(df_v_subs_weighted, df_v_subs))

03:56:39 | INFO | "Initializing weighted SUBS meta"


True
True


03:56:41 | INFO | "Initializing weighted POSTS embeddings"


False
False
CPU times: user 1min 46s, sys: 1min 44s, total: 3min 31s
Wall time: 3min 30s


In [16]:
# counts_describe(df_v_pc[l_ix_post_level])

# Aggregate to Post-Level: Post&Comments + Subreddit Meta

It's better to let pandas handle the iterations with `.groupby('subreddit_id')`. Otherwise we have to create a mask for each subreddit, which can take much longer (17+ hours).


For creating the DAG with 81k subreddits:
- ETA with masks: +17.6 hours
- ETA with groupby: ~2.5 hours

For creating DAG with 700k subreddits:
- 8.5 hours: with dask + groupby

```
# mask:
0%  329/81973 [04:18<17:42:36, 1.28it/s]

# .groupby()
6% 4751/81973 [09:56<2:35:06, 8.30it/s]


# .groupby() + dask.delayed(....to_numpy()) | 700k+ subreddits:
5%  34121/705963 [26:36<8:15:14, 22.61it/s]

5% 38903/711664 [30:03<8:09:53, 22.89it/s

```

---

Updates using `dask.delayed`:
By combining .groupby() + `dask.delayed` we can process things ~3x faster:

```
# .groupby() + dask.delayed(....to_numpy()) | FASTEST
100% 3467/3467 [02:31<00:00, 23.08it/s]
Wall time: 2min 38s


# masks with dask.delayed():
#  This is 2x faster than serial processing, but .groupby() + dask.delayed() is much faster
100% 3467/3467 [00:11<00:00, 299.85it/s]
05:44:20 | INFO | "Define new C1 df DAG in dask"
05:44:20 | INFO | "COMPUTE new C1 df START"
05:48:20 | INFO | "COMPUTE new C1 df DONE"
05:48:20 | INFO | "  0:04:11.393036 <- Total Agg fxn time time elapsed"
CPU times: user 4min 33s, sys: 24.3 s, total: 4min 57s
Wall time: 4min 12s


# .groupby(), no dask delayed | SLOWEST
100% 3467/3467 [08:20<00:00, 6.97it/s]
  0:08:21.661816 <- Total Agg fxn time time elapsed
```


In [None]:
%%time
# set style so that we can try output & time in either format
AGG_STYLE = cfg_agg_embeddings.config_dict['agg_style']  # serial v. dask.delayed

info(f"Start C1 - posts + comments + sub descriptions with format: `{AGG_STYLE}`")
t_start_agg_post_c1 = datetime.utcnow()

if AGG_STYLE not in ('serial', 'dask_delayed'):
    raise NotImplementedError(f'Other agg style not implemented: {AGG_STYLE}')

l_df_c1_weights = list()

# Build the subreddit lookup ONCE, outside the loop.
# Filtering `df_v_subs_weighted` with a boolean mask inside the loop scans every
# subreddit row for every group (one full scan per subreddit); a dict of
# subreddit_id -> row position plus a single numpy matrix makes each lookup O(1).
if df_v_subs_weighted['subreddit_id'].duplicated().any():
    raise ValueError("Found duplicate subreddit_ids in df_v_subs_weighted")
d_sub_row_ix_ = {
    sub_id_: ix_ for ix_, sub_id_ in enumerate(df_v_subs_weighted['subreddit_id'])
}
arr_subs_weighted_ = df_v_subs_weighted[l_embedding_cols].to_numpy()


def _get_weighted_sub_vector(s_id):
    """Return the weighted subreddit-description embedding for `s_id`.

    The result has shape (1, len(l_embedding_cols)), so it broadcasts against
    all the post rows of that subreddit.

    Raises:
        KeyError: if `s_id` has posts but no row in `df_v_subs_weighted`.
    """
    try:
        row_ix = d_sub_row_ix_[s_id]
    except KeyError:
        raise KeyError(
            f"subreddit_id `{s_id}` has posts but no subreddit-description embedding"
        ) from None
    return arr_subs_weighted_[row_ix:row_ix + 1]


if AGG_STYLE == 'serial':
    for s_id, df_ in tqdm(
        df_v_pc_weighted.groupby('subreddit_id'),
        ascii=True, mininterval=5,
    ):
        # For each post in a subreddit, get new embedding: combine subreddit_meta + post(and_comment)
        df_.loc[:, l_embedding_cols] = np.add(
            _get_weighted_sub_vector(s_id),
            df_[l_embedding_cols]
        )
        l_df_c1_weights.append(df_)

    info(f"Create new C1 df")
    df_posts_agg_c1 = pd.concat(l_df_c1_weights, ignore_index=True)

else:  # 'dask_delayed'
    for s_id, df_ in tqdm(
        df_v_pc_weighted.groupby('subreddit_id'),
        ascii=True, mininterval=5,
    ):
        # For each post in a subreddit, get new embedding: combine subreddit_meta + post(and_comment)
        # Nothing is computed here: each iteration only adds nodes to the dask graph.
        df_pc_embeddings_ = dask.delayed(np.add)(
            dask.delayed(_get_weighted_sub_vector(s_id)),
            dask.delayed(df_[l_embedding_cols])
        )
        # Put the ID columns back next to the new embeddings (column-wise concat)
        l_df_c1_weights.append(
            dask.delayed(pd.concat)([dask.delayed(df_[l_ix_post_level]), df_pc_embeddings_], ignore_index=False, axis=1)
        )

    info(f"Define new posts C1 df DAG in dask")
    df_posts_agg_c1_delayed = dask.delayed(pd.concat)(l_df_c1_weights, ignore_index=True)

    # The actual work happens here
    info(f"COMPUTE new C1 df START")
    df_posts_agg_c1 = df_posts_agg_c1_delayed.compute()
    info(f"COMPUTE new C1 df DONE")


r_, c_ = df_posts_agg_c1.shape
mlflow.log_metrics(
    {
        f"df_posts_agg_c1-rows": r_,
        f"df_posts_agg_c1-cols": c_,
    }
)
print(f"{r_:,.0f} rows, {c_:,.0f} cols")
del r_, c_

t_agg_pc_c1 = elapsed_time(start_time=t_start_agg_post_c1, log_label='Total Agg fxn time', verbose=True)
# NOTE(review): the metric key says `no_delay` but it is logged for BOTH agg styles.
#  The key is left unchanged so it stays stable; confirm whether it should vary by AGG_STYLE.
mlflow.log_metric('time_fxn-df_posts_agg_c1_no_delay',
                  t_agg_pc_c1 / timedelta(minutes=1)
                  )
info(f"C1 - post level complete")

03:59:24 | INFO | "Start C1 - posts + comments + sub descriptions with format: `dask_delayed`"


  0%|          | 0/711664 [00:00<?, ?it/s]

In [None]:
df_posts_agg_c1.info()

In [None]:
df_posts_agg_c1.iloc[:5, :10]

### Save post-level

In [None]:
d_dfs_to_save = defaultdict(dict)

In [None]:
%%time
# Local output folder for the post-level aggregates, timestamped to the minute (UTC)
path_posts_agg_c1_ = path_this_model / f"df_posts_agg_c1_{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}"
d_dfs_to_save['df_posts_agg_c1']['local'] = path_posts_agg_c1_

# Write the df to that folder, then attach the folder to the active MLflow run
save_pd_df_to_parquet_in_chunks(
    df_posts_agg_c1,
    path_posts_agg_c1_,
    write_index=False,
)

info("  Logging df to mlflow...")
mlflow.log_artifacts(path_posts_agg_c1_, artifact_path='df_posts_agg_c1')

# Aggregate to Subreddit Level

In [None]:
%%time

# Aggregate post-level embeddings to ONE row per subreddit.
#  - Subs with at least `n_min_posts_for_regular_mean` posts: mean of their post-level embeddings.
#  - All other subs (including those with no posts): mean of their post-level embeddings
#    (if any) plus one extra row with the UNWEIGHTED subreddit-description embedding.

# first, figure out how many posts each subreddit has
info(f"Count posts per subreddit...")
c_post_embedding_count = 'posts_for_embeddings_count'


# One row per (subreddit_id, subreddit_name) with the number of unique post_ids
df_posts_for_embedding_count = (
    df_posts_agg_c1
    .groupby(l_ix_sub_level, as_index=False)
    .agg(**{c_post_embedding_count: ('post_id', 'nunique')})
)
# fill subs that have no posts
#  (subs in df_v_subs whose subreddit_id never appears in the posts df get a count of 0)
df_posts_for_embedding_count = pd.concat(
    [
        df_posts_for_embedding_count, 
        df_v_subs[
            ~df_v_subs['subreddit_id'].isin(df_posts_agg_c1['subreddit_id'])
        ][l_ix_sub_level].assign(**{c_post_embedding_count: 0})
    ],
    axis=0
)
mlf.log_ram_stats(only_memory_used=True)

# min_posts >= -> regular mean. If it's less than this, then mix in subreddit_description into average
n_min_posts_for_regular_mean = 3
subreddits_above_n_ = (
    df_posts_for_embedding_count
    [df_posts_for_embedding_count[c_post_embedding_count] >= n_min_posts_for_regular_mean]
    ['subreddit_id']
)
# "below" = every subreddit_id in df_v_subs that is not above the threshold (includes subs with 0 posts)
subreddits_below_n_ = set(df_v_subs['subreddit_id']) - set(subreddits_above_n_)
# Row mask over the POST-level df: True for posts that belong to an above-threshold sub
mask_min_posts_for_reg_mean = df_posts_agg_c1['subreddit_id'].isin(subreddits_above_n_)


info(f"SUBREDDIT-LEVEL C1 - posts + comments + sub descriptions")
t_start_agg_subs_c1 = datetime.utcnow()

# 3+ posts: simple mean()
# NOTE(review): `post_id` is presumably non-numeric; how `.mean()` treats such
#  columns depends on the pandas version -- confirm before upgrading pandas.
info(f"Mean for subs above threshold: {n_min_posts_for_regular_mean}")
df_subs_agg_c1_Nplus = (
    df_posts_agg_c1[mask_min_posts_for_reg_mean]
    .groupby(l_ix_sub_level, as_index=False)
    .mean()
)

# calculate mean for all other subs: add UNWEIGHTED subreddit_description into averages
#  (rows from df_v_subs, i.e. the original, not the weighted copy, are stacked under the
#  remaining post rows, so the description counts as one extra "post" in the mean)
info(f"Calculating mean for subs BELOW post threshold...")
df_subs_agg_c1_Nbelow = (
    pd.concat(
        [
            df_posts_agg_c1[~mask_min_posts_for_reg_mean],
            df_v_subs[df_v_subs['subreddit_id'].isin(subreddits_below_n_)]
        ]
    )
    .groupby(l_ix_sub_level, as_index=False)
    .mean()
)
mlf.log_ram_stats(only_memory_used=True)
info(f"Combining all subreddits...")
# Attach the post counts to the stacked above/below aggregates; the outer merge keeps
# subs that appear on only one side
df_subs_agg_c1 = (
    df_posts_for_embedding_count
    .merge(
        pd.concat([df_subs_agg_c1_Nplus, df_subs_agg_c1_Nbelow]),
        how='outer',
        on=l_ix_sub_level
    )
    .sort_values(by=l_ix_sub_level)
)

# Check for dupes
#  (each subreddit_id must appear in exactly one row)
assert(len(df_subs_agg_c1) == df_subs_agg_c1['subreddit_id'].nunique()), f"Found duplicate subreddit_ids"

r_, c_ = df_subs_agg_c1.shape
mlflow.log_metrics(
    {
        f"df_subs_agg_c1-rows": r_,
        f"df_subs_agg_c1-cols": c_,
    }
)
info(f"{r_:,.0f} rows, {c_:,.0f} cols  <- df_subs_agg_c1.shape (posts + comments + sub description)")
del r_, c_

# Dividing a timedelta by timedelta(minutes=1) gives the duration in minutes as a float
t_agg_subs_c1 = elapsed_time(start_time=t_start_agg_subs_c1, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-df_subs_agg_c1',
                  t_agg_subs_c1 / timedelta(minutes=1)
                  )
mlf.log_ram_stats(only_memory_used=True)

In [None]:
df_subs_agg_c1.iloc[-8:, :10]

In [None]:
df_subs_agg_c1.iloc[10:18, :10]

In [None]:
mlf.log_ram_stats(only_memory_used=True)

### Save Subreddit level

Save to dask anyway b/c it could require multiple files as we cover 700k+ subreddits

In [None]:
%%time
# Local output folder for the subreddit-level aggregates, timestamped to the minute (UTC)
path_subs_agg_c1_ = path_this_model / f"df_subs_agg_c1-{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}"
d_dfs_to_save['df_subs_agg_c1']['local'] = path_subs_agg_c1_

# Write the df to that folder, then attach the folder to the active MLflow run
save_pd_df_to_parquet_in_chunks(
    df_subs_agg_c1,
    path_subs_agg_c1_,
    write_index=False,
)

info("  Logging df to mlflow...")
mlflow.log_artifacts(path_subs_agg_c1_, artifact_path='df_subs_agg_c1')
mlf.log_ram_stats(only_memory_used=True)

## 2nd flow for subreddit level -- do not include additional weight from subreddit description
Potentially, we might be skewing the embeddings too much by adding extra weight to subreddit description.

So save embeddings WITHOUT additional weights so that we can compare the two approaches.

We'll still fill subreddits w/o posts with subreddit description.

In [None]:
# 2nd subreddit-level flow: same as the first one, except that subs below the post
# threshold do NOT get the subreddit description mixed into their mean.
# Subs with zero posts still fall back to their subreddit-description embedding.
info(f"SUBREDDIT-LEVEL C1 no extra sub description weight - posts + comments + sub descriptions")
t_start_agg_subs_c1_uw = datetime.utcnow()

# 3+ posts: simple mean()
#  Re-use df_subs_agg_c1_Nplus from the first flow: it's identical for both flows
info(f"Mean for subs above threshold: {n_min_posts_for_regular_mean} (already calculated)")

# calculate mean for all other subs that have posts: posts only, NO subreddit_description row added
info(f"Calculating mean for subs BELOW post threshold...")
df_subs_agg_c1_Nbelow_uw = (
    df_posts_agg_c1[~mask_min_posts_for_reg_mean]
    .groupby(l_ix_sub_level, as_index=False)
    .mean()
)
# get embeddings for subs w/ zero posts
subs_wo_posts = df_posts_for_embedding_count[df_posts_for_embedding_count[c_post_embedding_count] == 0]['subreddit_id']
info(f"{len(subs_wo_posts):,.0f}")

mlf.log_ram_stats(only_memory_used=True)
info(f"Combining all subreddits...")
# Stack: subs above threshold + subs below threshold (posts only) + subs w/o posts
# (original subreddit-description embedding), then attach the post counts
df_subs_agg_c1_uw = (
    df_posts_for_embedding_count
    .merge(
        pd.concat(
            [
                df_subs_agg_c1_Nplus, df_subs_agg_c1_Nbelow_uw,
                df_v_subs[df_v_subs['subreddit_id'].isin(subs_wo_posts)]
            ]
        ),
        how='outer',
        on=l_ix_sub_level
    )
    .sort_values(by=l_ix_sub_level)
)

# Check for dupes
assert(len(df_subs_agg_c1_uw) == df_subs_agg_c1_uw['subreddit_id'].nunique()), f"Found duplicate subreddit_ids"

r_, c_ = df_subs_agg_c1_uw.shape
mlflow.log_metrics(
    {
        f"df_subs_agg_c1_uw-rows": r_,
        f"df_subs_agg_c1_uw-cols": c_,
    }
)
info(f"{r_:,.0f} rows, {c_:,.0f} cols  <- df_subs_agg_c1_uw.shape (posts + comments + sub description)")
del r_, c_

# Time THIS flow: measure from its own start time and log its own elapsed time.
#  (Measuring from `t_start_agg_subs_c1` and logging `t_agg_subs_c1` would re-log the
#  weighted flow's duration under the unweighted flow's metric key.)
t_agg_subs_c1_uw = elapsed_time(start_time=t_start_agg_subs_c1_uw, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-df_subs_agg_c1_uw',
                  t_agg_subs_c1_uw / timedelta(minutes=1)
                  )
mlf.log_ram_stats(only_memory_used=True)

### Check equality of unweighted v. weighted

In [None]:
# Subs ABOVE the post threshold use the same mean in both flows, so a sample of
# them must have matching embeddings in the weighted and unweighted outputs.
sample_subs_above_n_ = subreddits_above_n_.head(20)

df_sample_uw_ = df_subs_agg_c1_uw[df_subs_agg_c1_uw['subreddit_id'].isin(sample_subs_above_n_)]
df_sample_w_ = df_subs_agg_c1[df_subs_agg_c1['subreddit_id'].isin(sample_subs_above_n_)]

# Sort both samples by subreddit_id so rows line up before comparing
assert np.allclose(
    df_sample_uw_.sort_values(by=['subreddit_id'])[l_embedding_cols],
    df_sample_w_.sort_values(by=['subreddit_id'])[l_embedding_cols],
)

In [None]:
# Subs BELOW the post threshold are aggregated differently in the two flows, so a
# sample of them is expected NOT to match between weighted and unweighted outputs.
l_sample_subs_below_n_ = list(subreddits_below_n_)[:20]

df_sample_below_uw_ = df_subs_agg_c1_uw[df_subs_agg_c1_uw['subreddit_id'].isin(l_sample_subs_below_n_)]
df_sample_below_w_ = df_subs_agg_c1[df_subs_agg_c1['subreddit_id'].isin(l_sample_subs_below_n_)]

# Sort both samples by subreddit_id so rows line up before comparing
assert not np.allclose(
    df_sample_below_uw_.sort_values(by=['subreddit_id'])[l_embedding_cols],
    df_sample_below_w_.sort_values(by=['subreddit_id'])[l_embedding_cols],
)

### Save Subreddit level

Use dask b/c as we model over 200k subreddits a single file gets too big

In [None]:
%%time
# Name used both for the local folder prefix and for the MLflow artifact path
name_sub_agg_unweighted = 'df_subs_agg_c1_unweighted'

# Local output folder, timestamped to the minute (UTC)
path_subs_agg_uw_ = path_this_model / f"{name_sub_agg_unweighted}-{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}"
d_dfs_to_save[name_sub_agg_unweighted]['local'] = path_subs_agg_uw_

# Write the df to that folder, then attach the folder to the active MLflow run
save_pd_df_to_parquet_in_chunks(
    df_subs_agg_c1_uw,
    path_subs_agg_uw_,
    write_index=False,
)

info("  Logging df to mlflow...")
mlflow.log_artifacts(path_subs_agg_uw_, artifact_path=name_sub_agg_unweighted)
mlf.log_ram_stats(only_memory_used=True)

# End run

In [None]:
# finish logging total time + end mlflow run
#  `t_start_agg_embed` was set right before `mlflow.start_run()`, so this covers the
#  whole notebook run. Dividing by timedelta(minutes=1) converts the duration to minutes.
total_fxn_time = elapsed_time(start_time=t_start_agg_embed, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-full_aggregation_fxn_minutes',
                  total_fxn_time / timedelta(minutes=1)
                  )
# Close the active run; use mlflow.end_run("FAILED") instead if the run broke part-way
mlflow.end_run()

In [None]:
# mlflow.end_run("FAILED")