# Purpose

**2022-11-07: v0.6.1**

2 main problems with the previous process & their attempted fixes:
- The previous method takes too long and doesn't produce outputs until all posts for all subreddits are processed. 
    - Fix: partition embeddings so that we can read and process each subreddit in parallel
- If jupyterlab times out, then we lose the ability to view progress (logs don't go to a central logger right now).
    - Fix: Create & write to a log file so we can see progress even if jupyterlab connection times out

In this notebook I'll mainly focus on partitioning the data, and in a separate notebook we'll do the actual aggregation.

# Notebook setup

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from collections import defaultdict
from datetime import datetime, timedelta
import gc
import os
import logging
from logging import info
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import dask
from dask import dataframe as dd
from tqdm.auto import tqdm

import mlflow
import hydra

import subclu
from subclu.utils.hydra_config_loader import LoadHydraConfig
from subclu.models.aggregate_embeddings import (
    AggregateEmbeddings, AggregateEmbeddingsConfig,
    load_config_agg_jupyter, get_dask_df_shape,
)
from subclu.models import aggregate_embeddings_pd

from subclu.utils import set_working_directory, get_project_subfolder
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric,
    elapsed_time,
)
from subclu.utils.mlflow_logger import MlflowLogger, save_pd_df_to_parquet_in_chunks
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)

from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_gcs import LoadSubredditsGCS


# Print the python version plus the version of each key library used in this notebook
print_lib_versions([dask, hydra, mlflow, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
dask		v: 2021.06.0
hydra		v: 1.1.0
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.6.1


In [4]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Reset matplotlib to its default style
plt.style.use('default')

# Project helpers imported from subclu.utils.eda
#  (presumably they configure log output & notebook display options -- TODO confirm in that module)
setup_logging()
notebook_display_config()

In [1]:
# # register to see dask progress bar (nvm, this makes everything take AGES to actually compute)
# from dask.diagnostics import ProgressBar


# pbar = ProgressBar()                
# pbar.register() # global registration

# Set Local model paths

In [5]:
# UTC timestamp that makes the output folder for this manual run unique
manual_model_timestamp = datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')
path_this_model = get_project_subfolder(
    f"data/models/aggregate_embeddings/manual_partition_v061_{manual_model_timestamp}"
)
Path(path_this_model).mkdir(parents=True, exist_ok=True)
print(path_this_model)

# Set file for logs
# useful in case jupyterlab times out and we can't see notebook printouts

# Root logger: the file handler below receives everything logged via logging.info()
logger = logging.getLogger()

path_logs = Path(path_this_model) / 'logs'
path_logs.mkdir(parents=False, exist_ok=True)

f_log_file = str(
    path_logs /
    f"{datetime.utcnow().strftime('%Y-%m-%d_%H-%M-%S')}_jupyter_log.log"
)

# When this cell is re-run, detach AND close the handler created by the previous run.
#  Otherwise each re-run leaves an open file handle behind.
try:
    logger.removeHandler(fileHandler)
    fileHandler.close()
    logging.info(f"Removed existing log file {fileHandler}")
except NameError:
    # First run in this kernel: `fileHandler` doesn't exist yet, nothing to remove
    pass

fileHandler = logging.FileHandler(f_log_file)
fileHandler.setLevel(logging.INFO)

# Format: timestamp | level | "message"
formatter = logging.Formatter(
    '%(asctime)s | %(levelname)s | "%(message)s"',
    '%Y-%m-%d %H:%M:%S'
)
fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler)

/home/jupyter/subreddit_clustering_i18n/data/models/aggregate_embeddings/manual_partition_v061_2022-11-08_063052


In [6]:
# Output folders for the partitioned embeddings (subreddit meta & post+comments)
path_subreddit_partitioned = path_this_model / 'sub_embeddings_partitioned'
path_post_comments_partitioned = path_this_model / 'post_embeddings_partitioned'

# parents=False: the run folder itself must already exist
for path_partitioned_ in (path_subreddit_partitioned, path_post_comments_partitioned):
    path_partitioned_.mkdir(parents=False, exist_ok=True)

# CREATE MLFLOW EXPERIMENTS!!
Before kicking off these jobs, make sure to create mlflow experiments for embeddings!!!

Otherwise we might end up with broken MLflow SQLite databases

master experiment list here:
- `subclu/utils/mlflow_logger.py`
    - `MlflowLogger.initialize_experiment_names` (class.method)
    
Example:
```python
l_experiments = [
    ...
    
    'v0.6.1_mUSE_aggregates_test',
    'v0.6.1_mUSE_aggregates',
    'v0.6.1_mUSE_clustering_test',
    'v0.6.1_mUSE_clustering',
    'v0.6.1_nearest_neighbors',
]
```

# Load config for embeddings aggregation

For v0.6.1 embeddings I didn't use mlflow to track the embeddings inference. We'll need to get them from these folders in GCS.
<br>For example:
- [Subreddit metadata](https://console.cloud.google.com/storage/browser/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555)
    - `i18n-subreddit-clustering/i18n_topic_model_batch/runs/2022xxxx/subreddits/text/embedding/2022-xx-xx_084555`
- [Post + Comment Text (already combined)](https://console.cloud.google.com/storage/browser/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925)
    - `i18n-subreddit-clustering/i18n_topic_model_batch/runs/2022xxxx/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-xx-xx_091925`

---

### Configs to update:


- `subclu/configs/`
    - `data_text_and_metadata/` <- This is where the raw metadata & text gets pulled
        - `vX.x.x_model.yaml`
    - `data_embeddings_to_aggregate/` <- This is where we pull the embeddings for a) subreddit meta & b) post+comments text
        - `v0.6.1_2022-11-07_muse_lower_case_false.yaml`
    - `aggregate_params/`  <- Parameters for aggregation weights
        - `v0.6.1_agg.yaml`
    - `aggregate_embeddings_v0.6.1.yaml`  <- File that references all the configs above

In [7]:
# Load the aggregation config with hydra; override the aggregation style
l_config_overrides_ = ["agg_style=dask_delayed"]
cfg_agg_embeddings = LoadHydraConfig(
    config_name='aggregate_embeddings_v0.6.1.yaml',
    config_path="../config",
    overrides=l_config_overrides_,
)
# Show the top-level sections of the loaded config
print(cfg_agg_embeddings.config_dict.keys())

dict_keys(['data_text_and_metadata', 'data_embeddings_to_aggregate', 'aggregate_params', 'bucket_output', 'mlflow_tracking_uri', 'mlflow_experiment', 'n_sample_subreddits', 'n_sample_posts_files', 'n_sample_comments_files', 'agg_style'])


In [8]:
# Print the config: plain values on one line, nested dicts as an indented listing
for k_, v_ in cfg_agg_embeddings.config_dict.items():
    if not isinstance(v_, dict):
        print(f"{k_}: {v_}")
        continue

    print(f"{k_}:")
    for k2_, v2_ in v_.items():
        print(f"    {k2_}: {v2_}")

data_text_and_metadata:
    dataset_name: v0.6.1 inputs. ~110k seed subreddits, ~340k with 3+ posts, ~700k total subreddits
    bucket_name: i18n-subreddit-clustering
    folder_subreddits_text_and_meta: i18n_topic_model_batch/runs/20221107/subreddits/text
    folder_posts_text_and_meta: i18n_topic_model_batch/runs/20221107/posts
    folder_comments_text_and_meta: i18n_topic_model_batch/runs/20221107/comments
    folder_post_and_comment_text_and_meta: i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all
data_embeddings_to_aggregate:
    bucket_embeddings: i18n-subreddit-clustering
    post_and_comments_folder_embeddings: i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017
    subreddit_desc_folder_embeddings: i18n_topic_model_batch/runs/20221107/subreddits/text/embedding/2022-11-07_074632
    col_subreddit_id: subreddit_id
aggregate_params:
    min_post_and_comment_text_len: 3
    agg_post_post_and_comment_wei

# Download post embeddings with `gsutil`

gsutil can be 5x+ faster than the python library(!)
However, it can sometimes lock up the VM and crash jupyter :/


- https://cloud.google.com/storage/docs/gsutil/commands/cp#description
- https://cloud.google.com/storage/docs/wildcards

- If you have a large number of files to transfer, you can perform a parallel multi-threaded/multi-processing copy using the top-level gsutil `-m` option
- Use the `-n` (no-clobber) option to prevent overwriting the content of existing files. The example below downloads files from a bucket without clobbering the data already in your local directory
- Use the `-r` option to copy an entire directory tree.

- `-o`: set/override values in the boto configuration, in the format `<section>:<name>=<value>`:
    - Examples: `-o GSUtil:parallel_thread_count=20 -o GSUtil:parallel_process_count=20`

    
```bash
gsutil -o GSUtil:parallel_thread_count=20 -o GSUtil:parallel_process_count=20 -m cp -r -n gs://i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017 \
    /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding
```


In [9]:
%%time

gcs_sub_embeddings = cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['subreddit_desc_folder_embeddings']
print(gcs_sub_embeddings)
gcs_post_comment_embeddings = cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['post_and_comments_folder_embeddings']
print(gcs_post_comment_embeddings, '\n')


# gsutil is usually faster than the python library.
remote_bucket_and_key = f"{cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings']}/{gcs_post_comment_embeddings}"
remote_gs_path = f'gs://{remote_bucket_and_key}'

# Need to remove the last part of the local path otherwise we'll get duplicate subfolders:
#. top/2021-12-14/2021-12-14 instead of top/2021-12-14
local_f = f"/home/jupyter/subreddit_clustering_i18n/data/local_cache/{'/'.join(remote_bucket_and_key.split('/')[:-1])}"
Path(local_f).mkdir(parents=True, exist_ok=True)

# print(f"Remote path:\n  {remote_gs_path}")
# print(f"Local path:\n  {local_f}")


# NOTE: best to run this command from a separate terminal b/c it can crash a jupyter notebook 
#  when loading many large files (30+)
# Add flags to limit thread & process count to ~20 fixes most problems
print(f"gsutil -o GSUtil:parallel_thread_count=20 -o GSUtil:parallel_process_count=20 -m cp -r -n {remote_gs_path} {local_f} \n")

# !gsutil -o GSUtil:parallel_thread_count=20 -o GSUtil:parallel_process_count=20 -m cp -r -n $remote_gs_path $local_f

i18n_topic_model_batch/runs/20221107/subreddits/text/embedding/2022-11-07_074632
i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017 

gsutil -o GSUtil:parallel_thread_count=20 -o GSUtil:parallel_process_count=20 -m cp -r -n gs://i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017 /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding 

CPU times: user 991 µs, sys: 0 ns, total: 991 µs
Wall time: 742 µs


# Load Subreddit metadata


In [9]:
%%time
t_start_data_load_ = datetime.utcnow()

subs_v = LoadSubredditsGCS(
    bucket_name=cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings'],
    gcs_path=gcs_sub_embeddings,
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='subreddit_id',
    df_format='pandas',
    unique_check=True,
    verbose= True,
    
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
)
subs_v.local_cache()

df_v_subs = subs_v.read_as_one_df()
r_subs, c_subs = df_v_subs.shape

print(f"{r_subs:,.0f} rows, {c_subs:,.0f} cols")

05:00:43 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/subreddits/text/embedding/2022-11-07_074632"
05:00:43 | INFO | "  7 <- Files matching prefix"
05:00:43 | INFO | "  7 <- Files to check"
05:00:43 | INFO | "    000000000000-100179_by_514.parquet <- File already exists, not downloading"
05:00:43 | INFO | "    000000000001-233442_by_514.parquet <- File already exists, not downloading"
05:00:43 | INFO | "    000000000002-448032_by_514.parquet <- File already exists, not downloading"
05:00:43 | INFO | "    2022-11-07_07-46-32_vectorize_text.log <- File already exists, not downloading"
05:00:43 | INFO | "  Files already cached: 4"
05:00:43 | INFO | "  Files already downloaded."
05:00:43 | INFO | "  df format: pandas"
05:00:48 | INFO | "  Checking ID uniqueness..."


781,653 rows, 514 cols
CPU times: user 4.41 s, sys: 4.15 s, total: 8.57 s
Wall time: 6.11 s


In [10]:
# Preview the first 4 rows & first 15 columns of the subreddit embeddings
df_v_subs.iloc[:4, :15]

Unnamed: 0,subreddit_id,subreddit_name,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12
0,t5_2qh1i,askreddit,0.025941,-0.049365,0.006804,0.053185,0.058297,0.058016,0.052757,0.009165,-0.053651,-0.054025,-0.04903,-0.038186,0.040012
1,t5_2qh33,funny,0.027515,-0.05897,-0.024202,-0.02486,0.065797,0.063036,-0.029735,0.056695,-0.05464,-0.026086,-0.048793,-0.034287,0.05268
2,t5_35n7t,whitepeopletwitter,-0.062793,-0.063598,-0.055777,-0.022888,0.068102,0.062438,0.040131,0.054116,-0.027087,-0.020984,-0.059177,0.01955,0.041514
3,t5_2qh0u,pics,-0.060736,0.022654,0.029311,-0.02218,0.059654,0.060696,0.029968,-0.039235,0.048987,-0.056546,-0.051583,-0.028964,0.057466


## Save as partitioned files (locally)
This can take a long time because we're saving 700k individual files, even if each file is only a few KB in size.

In [16]:
# %%time

# df_v_subs.to_parquet(
#     path_subreddit_partitioned,
#     partition_cols=['subreddit_id'],
# )

In [20]:
%%time

# Split the subreddit df into 100 dask partitions and write parquet files
#  partitioned on `subreddit_id` into the 'dask' subfolder. The index is not written.
# NOTE: the recorded run of this cell ended with a KeyboardInterrupt
(
    dd.from_pandas(df_v_subs, npartitions=100)
    .to_parquet(
        path_subreddit_partitioned / 'dask',
        write_index=False,
        partition_on='subreddit_id',
    )
)

KeyboardInterrupt: 

## Save as partitioned files (GCS)

Upload using gsutil so it handles parallel writes

In [None]:
TODO

In [None]:
# # gsutil is usually faster than the python library.
# remote_bucket_and_key = f"{cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings']}/{gcs_post_comment_embeddings}"
# remote_gs_path_subs_partitioned = (
#     f'gs://'
#     f"{cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings']}/"
#     f"{gcs_sub_embeddings}"
# )

# Load post+comment embeddings

In [10]:
%%time

pc_v = LoadSubredditsGCS(
    bucket_name=cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings'],
    gcs_path=gcs_post_comment_embeddings,
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='post_id',
    df_format='pandas',
    unique_check=False,
    verbose= True,
    
    n_sample_files=None,  # None,
    n_files_slice_start=None,  # None,
    n_files_slice_end=None,  # None, 
)
pc_v.local_cache()

df_v_pc = pc_v.read_as_one_df()
r_pc, c_pc = df_v_pc.shape
print(f"{r_pc} rows, {c_pc:,.0f} cols")

06:31:16 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017"
06:31:16 | INFO | "  201 <- Files matching prefix"
06:31:16 | INFO | "  201 <- Files to check"
06:31:16 | INFO | "    000000000000-317975_by_515.parquet <- File already exists, not downloading"
06:31:16 | INFO | "    000000000001-304046_by_515.parquet <- File already exists, not downloading"
06:31:16 | INFO | "    000000000002-234112_by_515.parquet <- File already exists, not downloading"
06:31:16 | INFO | "    000000000003-365746_by_515.parquet <- File already exists, not downloading"
06:31:16 | INFO | "    000000000004-284006_by_515.parquet <- File already exists, not downloading"
06:31:16 | INFO | "    000000000005-344157_by_515.parquet <- File already exists, not downloading"
06:31:16 | INFO | "    000000000006-288452_by_515.parquet <- 

53597817 rows, 515 cols
CPU times: user 7min 54s, sys: 18min 52s, total: 26min 47s
Wall time: 3min 53s


### Sample df to test saving using index

In [11]:
%%time

# Draw a reproducible random sample (fixed random_state) of 2,123,000 rows
#  so that saving can be tested on a smaller df. `.copy()` detaches it from the full df.
df_v_pc_sample = df_v_pc.sample(n=2123000, random_state=42).copy()
# Last expression: display the (rows, cols) of the sample
df_v_pc_sample.shape

CPU times: user 23.3 s, sys: 2.06 s, total: 25.3 s
Wall time: 25.3 s


(2123000, 515)

In [12]:
# Number of distinct subreddits in the sample
df_v_pc_sample['subreddit_id'].nunique()

177409

In [16]:
%%time

df_v_pc_sample = (
    df_v_pc_sample
    .sort_values(by='subreddit_id', ascending=True)
)

CPU times: user 10.4 s, sys: 515 ms, total: 10.9 s
Wall time: 10.9 s


In [17]:
%%time

# Convert the sampled pandas df into a dask dataframe with 100 partitions
ddf_v_pc = dd.from_pandas(df_v_pc_sample, npartitions=100)

CPU times: user 11.1 s, sys: 739 ms, total: 11.8 s
Wall time: 11.8 s


In [18]:
# Peek at the first rows of the dask dataframe
ddf_v_pc.head()

Unnamed: 0,subreddit_id,subreddit_name,post_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21,embeddings_22,embeddings_23,embeddings_24,embeddings_25,embeddings_26,...,embeddings_482,embeddings_483,embeddings_484,embeddings_485,embeddings_486,embeddings_487,embeddings_488,embeddings_489,embeddings_490,embeddings_491,embeddings_492,embeddings_493,embeddings_494,embeddings_495,embeddings_496,embeddings_497,embeddings_498,embeddings_499,embeddings_500,embeddings_501,embeddings_502,embeddings_503,embeddings_504,embeddings_505,embeddings_506,embeddings_507,embeddings_508,embeddings_509,embeddings_510,embeddings_511
0,t5_2ya42,purplepilldebate,t3_ylpy0g,0.023644,0.061906,0.00206,-0.030665,0.060242,0.06241,-0.030977,-0.028784,0.037142,0.00055,0.054326,0.067632,0.063913,0.056804,-0.032872,0.0497,0.045741,-0.016133,0.041791,-0.008407,0.014084,-0.036242,0.065911,0.009804,-0.063815,-0.015015,0.013124,...,-0.064879,-0.054868,-0.061981,0.051509,-0.035174,-0.064979,0.067171,0.053196,-0.016785,-0.001603,-0.055716,-0.014797,0.049491,0.061413,-0.055531,0.048362,-0.026484,-0.056763,0.065697,0.050267,0.053234,-0.06693,-0.023373,0.052564,0.042297,0.044805,-0.035666,-0.049719,0.023932,0.064363
0,t5_30560,itcareerquestions,t3_yh96sq,-0.014757,0.067029,0.009229,0.058075,0.003972,-0.039523,0.017789,-0.044161,-0.020246,-0.068242,-0.023906,-0.058127,0.067714,-0.035006,-0.070421,-0.003776,-0.022255,-0.017544,0.053738,-0.037302,-0.034228,0.010895,0.070921,-0.062496,-0.067761,-0.03124,-0.048563,...,0.025471,0.013038,0.061924,0.066259,-0.040295,-0.061356,-0.052747,-0.027093,0.005906,-0.048143,0.066431,0.002995,-0.039177,0.062026,-0.014983,-0.024444,-0.017498,0.02367,0.025833,0.002727,0.004744,-0.070893,-0.067031,-0.013716,-0.061246,0.033668,-0.050552,-0.057196,-0.027049,0.057137
0,t5_37516x,theflyingtree,t3_x92xbs,-0.050731,0.0473,0.033193,-0.023584,-0.069668,0.067376,-0.060143,-0.027894,-0.060309,0.014754,0.034706,-0.026501,0.0434,-0.06082,-0.065404,-0.016431,0.04578,0.039311,0.02094,0.028365,-0.012141,0.000448,0.048385,0.01168,-0.066501,-0.020895,-0.067902,...,-0.01847,-0.044804,0.035361,-0.045197,0.058491,-0.061803,0.01337,-0.066663,-0.017179,-0.052123,0.038178,-0.044849,-0.011225,-0.016488,-0.023448,0.059358,0.005309,0.00594,-0.01352,0.045126,-0.00485,-0.023119,0.018646,-0.066088,0.043981,-0.001499,-0.036892,-0.00639,-0.014383,-0.026199
0,t5_2qlq6,audible,t3_x5v2km,-0.058011,-0.056398,0.040377,0.005816,0.047075,0.014561,-0.027639,0.007625,-0.009932,-0.063436,-0.041177,-0.058582,0.061123,-0.060232,-0.063526,0.051338,-0.05814,-0.019322,0.031992,0.050422,-0.046334,-0.0523,0.008517,-0.054336,-0.062479,-0.017011,0.057848,...,-0.016013,-0.056486,-0.034473,0.044314,-0.049185,-0.061752,0.014953,0.021774,-0.037716,0.062945,-0.041886,0.062561,-0.04706,0.019618,-0.048683,-0.001434,0.045211,-0.04721,0.060337,-0.027217,-0.011286,-0.059661,0.017192,-0.002443,-0.041846,-0.061353,-0.014314,-0.04269,0.037469,0.03572
0,t5_727wh4,takingcareofmyself,t3_xpabsk,0.041984,0.070482,0.009444,-0.026611,-0.089727,0.029351,-0.01104,-0.04697,-0.084615,0.046595,0.073871,0.099597,0.042562,-0.006095,-0.015178,0.009747,0.053648,0.024951,0.059977,0.018523,0.038629,-0.036912,0.058183,0.058821,0.01636,-0.025032,-0.015239,...,0.001146,0.026521,-0.039801,-0.048266,-0.028693,0.007116,-0.079086,-0.00724,-0.005157,0.018876,0.00016,0.035907,-0.052774,-0.041382,-0.038731,-0.003992,0.031624,0.002393,0.067947,-0.014337,-0.026502,-0.030022,-0.002565,0.017329,-0.014302,-0.052186,0.040643,-0.003113,0.045284,0.068758


In [21]:
%%time

# Use `subreddit_id` as the dask index; drop=False also keeps it as a regular column
ddf_v_pc = ddf_v_pc.set_index('subreddit_id', drop=False)

CPU times: user 1.9 s, sys: 196 ms, total: 2.1 s
Wall time: 1.85 s


In [23]:
%%time

# Peek at the first rows after setting the index
#  (the recorded wall time is ~40s; presumably the index is only materialized at this point)
ddf_v_pc.head()

CPU times: user 34 s, sys: 16.1 s, total: 50.1 s
Wall time: 40.4 s


Unnamed: 0_level_0,subreddit_id,subreddit_name,post_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21,embeddings_22,embeddings_23,embeddings_24,embeddings_25,embeddings_26,...,embeddings_482,embeddings_483,embeddings_484,embeddings_485,embeddings_486,embeddings_487,embeddings_488,embeddings_489,embeddings_490,embeddings_491,embeddings_492,embeddings_493,embeddings_494,embeddings_495,embeddings_496,embeddings_497,embeddings_498,embeddings_499,embeddings_500,embeddings_501,embeddings_502,embeddings_503,embeddings_504,embeddings_505,embeddings_506,embeddings_507,embeddings_508,embeddings_509,embeddings_510,embeddings_511
subreddit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
t5_1009a3,t5_1009a3,memesenespanol,t3_y9wrk3,0.029794,-0.036412,0.01103,-0.026378,-0.11295,0.020763,-0.000531,-0.023769,-0.079438,0.004313,0.017542,0.0443,0.083777,-0.020491,-0.073843,-0.02906,0.002103,-0.00386,-0.09007,-0.075972,-0.065027,0.055047,-0.011931,0.043185,-0.005391,0.040229,-0.056414,...,-0.001278,0.00716,-0.039252,0.01841,-0.012865,-0.008477,-0.042788,0.031798,-0.061614,-0.101503,0.00897,0.026178,-0.046689,-0.009793,0.023486,0.01281,-0.013412,0.052082,0.041755,-0.012377,-0.053272,0.02098,0.02798,-0.037622,0.002199,-0.026033,-0.017429,-0.020511,-0.014953,0.04023
t5_1009a3,t5_1009a3,memesenespanol,t3_ylgyli,-0.011561,0.016742,0.042173,0.011019,0.010504,0.037094,-0.001444,0.013033,-0.075158,0.029151,0.009533,0.049926,0.064287,-0.019245,0.019229,-0.018734,0.072003,0.039332,0.044433,-0.030302,0.024067,-0.048032,0.044026,0.059258,-0.065348,-0.019487,0.085103,...,0.047689,-0.012912,-0.049804,-0.040879,0.020014,-0.087303,0.036601,-0.072077,0.024168,0.068173,0.005123,0.054224,0.043275,-0.042033,0.061564,-0.025993,-0.002551,-0.050939,0.004725,0.006282,0.018217,-0.053759,-0.010806,-0.047394,0.035035,-0.050257,0.002041,-0.075777,0.048742,0.061655
t5_1009a3,t5_1009a3,memesenespanol,t3_yk94bn,0.067938,0.047519,-0.042753,-0.034848,-0.061984,0.04781,-0.0165,0.082757,-0.093897,-0.006455,-0.0895,-0.069184,-0.053194,-0.016164,0.063037,0.067988,0.034457,-0.038994,0.066526,-0.012809,0.005293,0.042943,0.017434,0.011339,0.034427,0.051017,0.045604,...,0.074072,-0.022093,0.03068,0.024071,0.042562,-0.063363,0.046072,0.077823,0.069786,-0.006616,0.044856,0.025121,-0.033219,-0.034077,-0.021144,-0.006295,0.03198,0.013519,-0.06114,-0.01173,-0.067999,0.005053,-0.01654,0.041907,0.000889,-0.001233,0.047814,-0.083923,-0.034015,0.006397
t5_1009a3,t5_1009a3,memesenespanol,t3_ygrvxr,0.012266,0.031666,-0.033965,-0.005456,-0.099487,-0.032535,0.033139,0.040253,-0.092837,-0.036952,-0.000781,0.003804,0.05898,-0.016602,-0.058802,0.043792,-0.006009,0.025345,0.016796,0.0532,-0.031829,0.022719,0.03625,0.02649,0.038504,0.037893,0.038948,...,0.032594,-0.014326,0.055015,0.056786,0.051868,-0.020934,-0.050007,0.01601,0.065273,-0.094662,-0.024044,0.000282,-0.067299,-0.044612,0.021254,-0.033123,-0.020041,0.072763,-0.021518,0.038483,-0.057747,-0.039825,-0.0062,0.005183,0.049154,-0.036276,-0.011988,-0.057912,0.070577,-0.011358
t5_1009a3,t5_1009a3,memesenespanol,t3_y28ica,-0.032772,0.085508,0.05794,0.047873,-0.062763,0.022547,0.049556,0.049403,-0.072958,0.075054,-0.064979,-0.005786,-0.046292,0.042218,-0.019904,0.0619,-0.01256,0.036472,0.028593,0.054423,-0.014246,0.012553,0.017955,-0.027888,-0.016607,0.062211,0.042267,...,0.027762,0.015362,0.012604,0.036481,0.047646,-0.06718,-0.032762,-0.082307,-0.012887,-0.07824,0.053084,-0.00075,-0.004817,0.011674,0.049049,0.026105,0.042094,-0.069384,0.024259,-0.046838,-0.034735,0.002032,0.049646,0.067827,0.069735,-0.021837,0.056587,0.019552,0.082167,-0.018743


## Save as partitioned files (locally)
This can take a long time because we're saving 700k+ individual files, even if each file is only a few MB in size.

dask can be really frustrating because in 14 minutes it wrote nothing... sigh.

In [None]:
# Display the sampled dataframe
df_v_pc_sample

In [24]:
%%time

# Write the (sampled) dask dataframe as parquet, partitioned on `subreddit_id`.
#  The index is not written.
# NOTE: the recorded run of this cell ended with a KeyboardInterrupt
(
    ddf_v_pc
    .to_parquet(
        path_post_comments_partitioned,
        write_index=False,
        partition_on='subreddit_id',
    )
)

KeyboardInterrupt: 

In [28]:
%%time

# Try to write the FULL post+comment df (not only the sample): split it into
#  100 dask partitions and write parquet partitioned on `subreddit_id`.
# NOTE: in the recorded run the progress bar was still at 0% after ~14 minutes
#  and the cell was interrupted (KeyboardInterrupt)
(
    dd.from_pandas(df_v_pc, npartitions=100)
    .to_parquet(
        path_post_comments_partitioned,
        write_index=False,
        partition_on='subreddit_id',
    )
)

[                                        ] | 0% Completed | 14min 40.6s
[                                        ] | 0% Completed | 14min 40.8s


KeyboardInterrupt: 

In [None]:
LEGACY

# Start MLflow & Log base params

In [9]:
# Create the project's MlflowLogger with the tracking URI from the config
mlf = MlflowLogger(tracking_uri=cfg_agg_embeddings.config_dict['mlflow_tracking_uri'])

In [10]:
# Experiment name comes from the config
mlflow_experiment = cfg_agg_embeddings.config_dict['mlflow_experiment']
# 'v0.6.0_mUSE_aggregates', 'v0.6.0_mUSE_aggregates_test'


# Start time (UTC) of the aggregation job
t_start_agg_embed = datetime.utcnow()
info(f"== Start run_aggregation() method ==")


info(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
# Select the experiment first, then start the run so it's created under that experiment
mlf.set_experiment(mlflow_experiment)
mlflow.start_run()
# Record provenance & machine info on the active run:
#  git hash, host name (as tag and as param), CPU count, and RAM stats
mlf.add_git_hash_to_active_run()
mlf.set_tag_hostname(key='host_name')
mlf.log_param_hostname(key='host_name')
mlf.log_cpu_count()
# Last expression: its return value (dict of RAM stats) is displayed as the cell output
mlf.log_ram_stats(param=True, only_memory_used=False)

03:51:17 | INFO | "== Start run_aggregation() method =="
03:51:17 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db"
03:51:17 | INFO | "host_name: djb-100-2021-04-28-djb-eda-german-subs"
03:51:17 | INFO | "cpu_count: 96"
03:51:17 | INFO | "RAM stats:
{'memory_used_percent': '36.28%', 'memory_total': '1,444,961', 'memory_used': '524,198', 'memory_free': '759,036'}"


{'memory_total': 1444961,
 'memory_used_percent': 0.3627765732085503,
 'memory_used': 524198,
 'memory_free': 759036}

In [11]:
# set weights
# Normalize them by dividing by 100
WEIGHT_POST_COMMENT = (
    cfg_agg_embeddings.config_dict['aggregate_params']['agg_post_post_and_comment_weight'] / 100
)
WEIGHT_SUB_META = (
    cfg_agg_embeddings.config_dict['aggregate_params']['agg_post_subreddit_desc_weight'] / 100
)
print(WEIGHT_POST_COMMENT + WEIGHT_SUB_META)
# The weights must add up to 1. Compare with a tolerance because the weights are floats
#  (after dividing by 100) and an exact `==` can fail due to rounding for some valid pairs.
# Raise explicitly instead of `assert` so the check can't be disabled with `python -O`.
if not np.isclose(WEIGHT_POST_COMMENT + WEIGHT_SUB_META, 1.0):
    raise ValueError(
        f"Weights must add up to 1.0, got: {WEIGHT_POST_COMMENT + WEIGHT_SUB_META}"
    )


# Log the embedding locations & the weights used for this run
mlflow.log_params(
    {
        'embeddings_bucket': cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings'],
        'embeddings_subreddit_path': gcs_sub_embeddings,
        'embeddings_post_and_comments_path': gcs_post_comment_embeddings,
        'weight_post_and_comments': WEIGHT_POST_COMMENT,
        'weight_subreddit_meta': WEIGHT_SUB_META,
    }
)
# Also log the top-level config values that are strings.
#  Best effort: if mlflow rejects a param, print the error and keep going.
for k_, v_ in cfg_agg_embeddings.config_dict.items():
    if isinstance(v_, str):
        try:
            mlflow.log_param(k_, v_)
        except Exception as e:
            print(e)

1.0


In [13]:
%%time

pc_v = LoadSubredditsGCS(
    bucket_name=cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings'],
    gcs_path=gcs_post_comment_embeddings,
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='post_id',
    df_format='pandas',
    unique_check=False,
    verbose= True,
    
    n_sample_files=cfg_agg_embeddings.config_dict['n_sample_posts_files'],  # None,
    n_files_slice_start=None,  # None,
    n_files_slice_end=None,  # None, 
)
pc_v.local_cache()

df_v_pc = pc_v.read_as_one_df()
r_pc, c_pc = df_v_pc.shape
mlflow.log_metrics(
    {
        f"df_v_post_comments-rows": r_pc,
        f"df_v_post_comments-cols": c_pc,
    }
)
print(f"{r_pc:,.0f} rows, {c_pc:,.0f} cols")

t_data_load = elapsed_time(start_time=t_start_data_load_, log_label='Data Loading Time', verbose=True)
mlflow.log_metric('time_fxn-data_loading_time',
                  t_data_load / timedelta(minutes=1)
                  )
mlf.log_ram_stats(only_memory_used=True)

03:51:25 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20221107/post_and_comment_text_combined/text_all/embedding/2022-11-07_081017"
03:51:25 | INFO | "  201 <- Files matching prefix"
03:51:25 | INFO | "  201 <- Files to check"
03:51:25 | INFO | "    000000000000-317975_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000001-304046_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000002-234112_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000003-365746_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000004-284006_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000005-344157_by_515.parquet <- File already exists, not downloading"
03:51:25 | INFO | "    000000000006-288452_by_515.parquet <- 

53,597,817 rows, 515 cols


03:55:53 | INFO | "RAM stats:
{'memory_used_percent': '49.77%', 'memory_used': '719,153'}"


CPU times: user 7min 26s, sys: 16min 43s, total: 24min 10s
Wall time: 4min 28s


{'memory_used_percent': 0.49769716968139627, 'memory_used': 719153}

# Set weights & create copy dfs for new weights

In [14]:
# Index columns at the subreddit level & at the post level
l_ix_sub_level = ['subreddit_id', 'subreddit_name']
l_ix_post_level = [*l_ix_sub_level, 'post_id']

# Embedding columns are the ones that share the `embeddings_` prefix
l_embedding_cols = [col_ for col_ in df_v_pc.columns if col_.startswith('embeddings_')]
print(len(l_embedding_cols))

512


In [15]:
%%time
# Work on copies so the raw (unweighted) embeddings stay available for later steps
df_v_pc_weighted = df_v_pc.copy()
df_v_subs_weighted = df_v_subs.copy()

# Spot-check window: first 1,000 rows, positional column slices that skip the
# leading id columns.
# NOTE(review): assumes 3 leading id columns for posts and 2 for subreddits -- confirm
n_rows_check_ = 1000
cols_check_pc_ = slice(3, 515)
cols_check_subs_ = slice(2, 515)

# Before weighting, the copies must match their sources -> both print True
print(np.allclose(
    df_v_pc_weighted.iloc[:n_rows_check_, cols_check_pc_],
    df_v_pc.iloc[:n_rows_check_, cols_check_pc_],
))
print(np.allclose(
    df_v_subs_weighted.iloc[:n_rows_check_, cols_check_subs_],
    df_v_subs.iloc[:n_rows_check_, cols_check_subs_],
))

# Scale every embedding column in one vectorized operation per dataframe
info(f"Initializing weighted SUBS meta")
df_v_subs_weighted[l_embedding_cols] = df_v_subs_weighted[l_embedding_cols].mul(WEIGHT_SUB_META)

info(f"Initializing weighted POSTS embeddings")
df_v_pc_weighted[l_embedding_cols] = df_v_pc_weighted[l_embedding_cols].mul(WEIGHT_POST_COMMENT)

# After weighting, the copies should differ from their sources -> both print False
print(np.allclose(
    df_v_pc_weighted.iloc[:n_rows_check_, cols_check_pc_],
    df_v_pc.iloc[:n_rows_check_, cols_check_pc_],
))
print(np.allclose(
    df_v_subs_weighted.iloc[:n_rows_check_, cols_check_subs_],
    df_v_subs.iloc[:n_rows_check_, cols_check_subs_],
))

03:56:39 | INFO | "Initializing weighted SUBS meta"


True
True


03:56:41 | INFO | "Initializing weighted POSTS embeddings"


False
False
CPU times: user 1min 46s, sys: 1min 44s, total: 3min 31s
Wall time: 3min 30s


In [16]:
# counts_describe(df_v_pc[l_ix_post_level])

# Aggregate to Post-Level: Post&Comments + Subreddit Meta

It's better to let pandas handle the iterations with `.groupby('subreddit_id')`. Otherwise we have to create masks for each subreddit, which can take much longer (17+ hours).


For creating the DAG with 81k subreddits:
- ETA with masks: +17.6 hours
- ETA with groupby: ~2.5 hours

For creating DAG with 700k subreddits:
- 8.5 hours: with dask + groupby

```
# mask:
0%  329/81973 [04:18<17:42:36, 1.28it/s]

# .groupby()
6% 4751/81973 [09:56<2:35:06, 8.30it/s]


# .groupby() + dask.delayed(....to_numpy()) | 700k+ subreddits:
5%  34121/705963 [26:36<8:15:14, 22.61it/s]

5% 38903/711664 [30:03<8:09:53, 22.89it/s]

```

---

Updates using `dask.delayed`:
By combining .groupby() + `dask.delayed` we can process things ~3x faster:

```
# .groupby() + dask.delayed(....to_numpy()) | FASTEST
100% 3467/3467 [02:31<00:00, 23.08it/s]
Wall time: 2min 38s


# masks with dask.delayed():
#  This is 2x faster than serial processing, but .groupby() + dask.delayed() is much faster
100% 3467/3467 [00:11<00:00, 299.85it/s]
05:44:20 | INFO | "Define new C1 df DAG in dask"
05:44:20 | INFO | "COMPUTE new C1 df START"
05:48:20 | INFO | "COMPUTE new C1 df DONE"
05:48:20 | INFO | "  0:04:11.393036 <- Total Agg fxn time time elapsed"
CPU times: user 4min 33s, sys: 24.3 s, total: 4min 57s
Wall time: 4min 12s


# .groupby(), no dask delayed | SLOWEST
100% 3467/3467 [08:20<00:00, 6.97it/s]
  0:08:21.661816 <- Total Agg fxn time time elapsed
```


In [None]:
%%time
# The aggregation strategy comes from the config so either format can be run & timed:
#   'serial'       -> compute each subreddit's frame eagerly, then concat
#   'dask_delayed' -> build one delayed task per subreddit, then compute them all at once
AGG_STYLE = cfg_agg_embeddings.config_dict['agg_style']

info(f"Start C1 - posts + comments + sub descriptions with format: `{AGG_STYLE}`")
t_start_agg_post_c1 = datetime.utcnow()

# One entry per subreddit: a dataframe (serial) or a delayed dataframe (dask_delayed)
l_df_c1_weights = []

# Reject unknown styles before doing any work (same error the fall-through branch raised)
if AGG_STYLE not in ('serial', 'dask_delayed'):
    raise NotImplementedError(f'Other agg style not implemented: {AGG_STYLE}')

if AGG_STYLE == 'serial':
    for s_id, df_ in tqdm(
        df_v_pc_weighted.groupby('subreddit_id'), ascii=True, mininterval=5,
    ):
        # Weighted subreddit-meta embedding for this subreddit, as a plain array
        sub_meta_vec_ = df_v_subs_weighted.loc[
            df_v_subs_weighted['subreddit_id'] == s_id, l_embedding_cols
        ].to_numpy()
        # New post embedding = subreddit meta + post(and_comment), for every post in the group
        df_.loc[:, l_embedding_cols] = np.add(sub_meta_vec_, df_[l_embedding_cols])
        l_df_c1_weights.append(df_)
        del df_

    info(f"Create new C1 df")
    df_posts_agg_c1 = pd.concat(l_df_c1_weights, ignore_index=True)

else:
    # AGG_STYLE == 'dask_delayed'
    for s_id, df_ in tqdm(
        df_v_pc_weighted.groupby('subreddit_id'), ascii=True, mininterval=5,
    ):
        # Wrap the inputs as delayed objects; nothing is added until .compute()
        sub_meta_vec_ = dask.delayed(
            df_v_subs_weighted.loc[
                df_v_subs_weighted['subreddit_id'] == s_id, l_embedding_cols
            ].to_numpy()
        )
        post_vecs_ = dask.delayed(df_[l_embedding_cols])
        post_ids_ = dask.delayed(df_[l_ix_post_level])

        # New post embedding = subreddit meta + post(and_comment)
        df_pc_embeddings_ = dask.delayed(np.add)(sub_meta_vec_, post_vecs_)
        # Re-attach the id columns next to the new embeddings (column-wise concat)
        l_df_c1_weights.append(
            dask.delayed(pd.concat)(
                [post_ids_, df_pc_embeddings_], ignore_index=False, axis=1
            )
        )

    info(f"Define new posts C1 df DAG in dask")
    df_posts_agg_c1_delayed = dask.delayed(pd.concat)(l_df_c1_weights, ignore_index=True)

    info(f"COMPUTE new C1 df START")
    df_posts_agg_c1 = df_posts_agg_c1_delayed.compute()
    info(f"COMPUTE new C1 df DONE")


# Record the shape of the post-level output
n_rows_, n_cols_ = df_posts_agg_c1.shape
mlflow.log_metrics(
    {
        "df_posts_agg_c1-rows": n_rows_,
        "df_posts_agg_c1-cols": n_cols_,
    }
)
print(f"{n_rows_:,.0f} rows, {n_cols_:,.0f} cols")
del n_rows_, n_cols_

# Record how long the post-level aggregation took, in minutes
# NOTE(review): the metric name says `no_delay` but it is logged for both agg styles
t_agg_pc_c1 = elapsed_time(start_time=t_start_agg_post_c1, log_label='Total Agg fxn time', verbose=True)
minutes_agg_pc_c1_ = t_agg_pc_c1 / timedelta(minutes=1)
mlflow.log_metric('time_fxn-df_posts_agg_c1_no_delay', minutes_agg_pc_c1_)
info(f"C1 - post level complete")

03:59:24 | INFO | "Start C1 - posts + comments + sub descriptions with format: `dask_delayed`"


  0%|          | 0/711664 [00:00<?, ?it/s]

In [None]:
# Print a summary of the post-level aggregate (pandas `DataFrame.info`)
df_posts_agg_c1.info()

In [None]:
# Preview the first 5 rows and first 10 columns of the post-level aggregate
df_posts_agg_c1.iloc[:5, :10]

### Save post-level

In [None]:
# Registry of saved outputs: dataframe name -> dict of locations (e.g. the 'local' path).
# `defaultdict(dict)` lets a new name be filled in without creating its inner dict first.
d_dfs_to_save = defaultdict(dict)

In [None]:
%%time
# Timestamped (UTC) local folder for the post-level aggregate; remember it in the registry
ts_save_ = datetime.utcnow().strftime('%Y-%m-%d_%H%M')
path_df_posts_agg_c1_ = path_this_model / f"df_posts_agg_c1_{ts_save_}"
d_dfs_to_save['df_posts_agg_c1']['local'] = path_df_posts_agg_c1_

# Write the dataframe as parquet chunks (index not written)
save_pd_df_to_parquet_in_chunks(
    df_posts_agg_c1,
    path_df_posts_agg_c1_,
    write_index=False
)

# Upload the saved folder as artifacts of the active mlflow run
info(f"  Logging df to mlflow...")
mlflow.log_artifacts(path_df_posts_agg_c1_, artifact_path='df_posts_agg_c1')

# Aggregate to Subreddit Level

In [None]:
%%time

# Subreddit-level aggregation (weighted flow).
# Steps:
#   1. count unique posts per subreddit (0 for subreddits that only have metadata)
#   2. subreddits with >= `n_min_posts_for_regular_mean` posts: plain mean of post embeddings
#   3. all other subreddits: mean of their post embeddings plus the subreddit's own
#      (unweighted) description embedding from `df_v_subs`
#   4. combine both groups with the post counts into `df_subs_agg_c1`

# first, figure out how many posts each subreddit has
info(f"Count posts per subreddit...")
c_post_embedding_count = 'posts_for_embeddings_count'


df_posts_for_embedding_count = (
    df_posts_agg_c1
    .groupby(l_ix_sub_level, as_index=False)
    .agg(**{c_post_embedding_count: ('post_id', 'nunique')})
)
# fill subs that have no posts
df_posts_for_embedding_count = pd.concat(
    [
        df_posts_for_embedding_count,
        df_v_subs[
            ~df_v_subs['subreddit_id'].isin(df_posts_agg_c1['subreddit_id'])
        ][l_ix_sub_level].assign(**{c_post_embedding_count: 0})
    ],
    axis=0
)
mlf.log_ram_stats(only_memory_used=True)

# min_posts >= -> regular mean. If it's less than this, then mix in subreddit_description into average
n_min_posts_for_regular_mean = 3
subreddits_above_n_ = (
    df_posts_for_embedding_count
    [df_posts_for_embedding_count[c_post_embedding_count] >= n_min_posts_for_regular_mean]
    ['subreddit_id']
)
subreddits_below_n_ = set(df_v_subs['subreddit_id']) - set(subreddits_above_n_)
# True for posts that belong to a subreddit at/above the threshold
mask_min_posts_for_reg_mean = df_posts_agg_c1['subreddit_id'].isin(subreddits_above_n_)


info(f"SUBREDDIT-LEVEL C1 - posts + comments + sub descriptions")
t_start_agg_subs_c1 = datetime.utcnow()

# 3+ posts: simple mean()
# `numeric_only=True`: the frame still carries the string `post_id` column. Older pandas
# dropped such columns silently; pandas >= 2.0 raises TypeError unless this is explicit.
info(f"Mean for subs above threshold: {n_min_posts_for_regular_mean}")
df_subs_agg_c1_Nplus = (
    df_posts_agg_c1[mask_min_posts_for_reg_mean]
    .groupby(l_ix_sub_level, as_index=False)
    .mean(numeric_only=True)
)

# calculate mean for all other subs: add UNWEIGHTED subreddit_description into averages
info(f"Calculating mean for subs BELOW post threshold...")
df_subs_agg_c1_Nbelow = (
    pd.concat(
        [
            df_posts_agg_c1[~mask_min_posts_for_reg_mean],
            df_v_subs[df_v_subs['subreddit_id'].isin(subreddits_below_n_)]
        ]
    )
    .groupby(l_ix_sub_level, as_index=False)
    .mean(numeric_only=True)
)
mlf.log_ram_stats(only_memory_used=True)
info(f"Combining all subreddits...")
df_subs_agg_c1 = (
    df_posts_for_embedding_count
    .merge(
        pd.concat([df_subs_agg_c1_Nplus, df_subs_agg_c1_Nbelow]),
        how='outer',
        on=l_ix_sub_level
    )
    .sort_values(by=l_ix_sub_level)
)

# Check for dupes
assert(len(df_subs_agg_c1) == df_subs_agg_c1['subreddit_id'].nunique()), f"Found duplicate subreddit_ids"

# Record the output shape
r_, c_ = df_subs_agg_c1.shape
mlflow.log_metrics(
    {
        f"df_subs_agg_c1-rows": r_,
        f"df_subs_agg_c1-cols": c_,
    }
)
info(f"{r_:,.0f} rows, {c_:,.0f} cols  <- df_subs_agg_c1.shape (posts + comments + sub description)")
del r_, c_

# Record elapsed time for this flow, in minutes
t_agg_subs_c1 = elapsed_time(start_time=t_start_agg_subs_c1, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-df_subs_agg_c1',
                  t_agg_subs_c1 / timedelta(minutes=1)
                  )
mlf.log_ram_stats(only_memory_used=True)

In [None]:
# Preview the last 8 rows and first 10 columns of the subreddit-level aggregate
df_subs_agg_c1.iloc[-8:, :10]

In [None]:
# Preview rows at positions 10-17 and the first 10 columns of the subreddit-level aggregate
df_subs_agg_c1.iloc[10:18, :10]

In [None]:
# RAM checkpoint after the subreddit-level aggregation
# (presumably logs memory-used stats only -- confirm against `MlflowLogger.log_ram_stats`)
mlf.log_ram_stats(only_memory_used=True)

### Save Subreddit level

Save to dask anyway b/c it could require multiple files as we cover 700k+ subreddits

In [None]:
%%time
# Timestamped (UTC) local folder for the subreddit-level aggregate; remember it in the registry
ts_save_ = datetime.utcnow().strftime('%Y-%m-%d_%H%M')
path_df_subs_agg_c1_ = path_this_model / f"df_subs_agg_c1-{ts_save_}"
d_dfs_to_save['df_subs_agg_c1']['local'] = path_df_subs_agg_c1_

# Write the dataframe as parquet chunks (index not written)
save_pd_df_to_parquet_in_chunks(
    df_subs_agg_c1,
    path_df_subs_agg_c1_,
    write_index=False
)

# Upload the saved folder as artifacts of the active mlflow run, then checkpoint RAM usage
info(f"  Logging df to mlflow...")
mlflow.log_artifacts(path_df_subs_agg_c1_, artifact_path='df_subs_agg_c1')
mlf.log_ram_stats(only_memory_used=True)

## 2nd flow for subreddit level -- do not include additional weight from subreddit description
Potentially, we might be skewing the embeddings too much by adding extra weight to subreddit description.

So save embeddings WITHOUT additional weights so that we can compare the two approaches.

We'll still fill subreddits w/o posts with subreddit description.

In [None]:
# Subreddit-level aggregation, 2nd flow ("uw"): no extra subreddit-description weight.
#   - subs at/above the post threshold: reuse `df_subs_agg_c1_Nplus` (plain mean, already computed)
#   - subs below the threshold that have posts: plain mean of their post embeddings only
#   - subs with zero posts: fall back to the subreddit description embedding from `df_v_subs`
info(f"SUBREDDIT-LEVEL C1 no extra sub description weight - posts + comments + sub descriptions")
t_start_agg_subs_c1_uw = datetime.utcnow()

# 3+ posts: simple mean()
info(f"Mean for subs above threshold: {n_min_posts_for_regular_mean} (already calculated)")

# calculate mean for all other subs that have posts: posts only, NO subreddit_description mixed in
# `numeric_only=True`: the frame still carries the string `post_id` column. Older pandas
# dropped such columns silently; pandas >= 2.0 raises TypeError unless this is explicit.
info(f"Calculating mean for subs BELOW post threshold...")
df_subs_agg_c1_Nbelow_uw = (
    df_posts_agg_c1[~mask_min_posts_for_reg_mean]
    .groupby(l_ix_sub_level, as_index=False)
    .mean(numeric_only=True)
)
# get embeddings for subs w/ zero posts
subs_wo_posts = df_posts_for_embedding_count[df_posts_for_embedding_count[c_post_embedding_count] == 0]['subreddit_id']
info(f"{len(subs_wo_posts):,.0f}")

mlf.log_ram_stats(only_memory_used=True)
info(f"Combining all subreddits...")
df_subs_agg_c1_uw = (
    df_posts_for_embedding_count
    .merge(
        pd.concat(
            [
                df_subs_agg_c1_Nplus, df_subs_agg_c1_Nbelow_uw,
                df_v_subs[df_v_subs['subreddit_id'].isin(subs_wo_posts)]
            ]
        ),
        how='outer',
        on=l_ix_sub_level
    )
    .sort_values(by=l_ix_sub_level)
)

# Check for dupes
assert(len(df_subs_agg_c1_uw) == df_subs_agg_c1_uw['subreddit_id'].nunique()), f"Found duplicate subreddit_ids"

# Record the output shape
r_, c_ = df_subs_agg_c1_uw.shape
mlflow.log_metrics(
    {
        f"df_subs_agg_c1_uw-rows": r_,
        f"df_subs_agg_c1_uw-cols": c_,
    }
)
info(f"{r_:,.0f} rows, {c_:,.0f} cols  <- df_subs_agg_c1_uw.shape (posts + comments + sub description)")
del r_, c_

# Time THIS flow: measure from its own start time and log its own elapsed value.
# (Previously this used the weighted flow's start time and logged the weighted flow's
# duration under the `_uw` metric name.)
t_agg_subs_c1_uw = elapsed_time(start_time=t_start_agg_subs_c1_uw, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-df_subs_agg_c1_uw',
                  t_agg_subs_c1_uw / timedelta(minutes=1)
                  )
mlf.log_ram_stats(only_memory_used=True)

### Check equality of unweighted v. weighted

In [None]:
# Subreddits at/above the post threshold use the same plain mean in both flows,
# so their embeddings should match. Spot-check the first 20 ids.
ids_check_above_ = subreddits_above_n_.head(20)

emb_uw_above_ = (
    df_subs_agg_c1_uw[df_subs_agg_c1_uw['subreddit_id'].isin(ids_check_above_)]
    .sort_values(by=['subreddit_id'])
    [l_embedding_cols]
)
emb_weighted_above_ = (
    df_subs_agg_c1[df_subs_agg_c1['subreddit_id'].isin(ids_check_above_)]
    .sort_values(by=['subreddit_id'])
    [l_embedding_cols]
)

# should be True
assert np.allclose(emb_uw_above_, emb_weighted_above_)

In [None]:
# Subreddits below the post threshold are aggregated differently in the two flows,
# so a sample of them is expected NOT to match.
l_sample_subs_below_n_ = list(subreddits_below_n_)[:20]

emb_uw_below_ = (
    df_subs_agg_c1_uw[df_subs_agg_c1_uw['subreddit_id'].isin(l_sample_subs_below_n_)]
    .sort_values(by=['subreddit_id'])
    [l_embedding_cols]
)
emb_weighted_below_ = (
    df_subs_agg_c1[df_subs_agg_c1['subreddit_id'].isin(l_sample_subs_below_n_)]
    .sort_values(by=['subreddit_id'])
    [l_embedding_cols]
)

# should be False
assert not np.allclose(emb_uw_below_, emb_weighted_below_)

### Save Subreddit level

Use dask because a single file gets too big once we model over 200k subreddits

In [None]:
%%time
# Registry key, artifact folder name, and file-name prefix for the unweighted aggregate
name_sub_agg_unweighted = 'df_subs_agg_c1_unweighted'

# Timestamped (UTC) local folder; remember it in the registry
ts_save_ = datetime.utcnow().strftime('%Y-%m-%d_%H%M')
path_sub_agg_unweighted_ = path_this_model / f"{name_sub_agg_unweighted}-{ts_save_}"
d_dfs_to_save[name_sub_agg_unweighted]['local'] = path_sub_agg_unweighted_

# Write the dataframe as parquet chunks (index not written)
save_pd_df_to_parquet_in_chunks(
    df_subs_agg_c1_uw,
    path_sub_agg_unweighted_,
    write_index=False
)

# Upload the saved folder as artifacts of the active mlflow run, then checkpoint RAM usage
info(f"  Logging df to mlflow...")
mlflow.log_artifacts(path_sub_agg_unweighted_, artifact_path=name_sub_agg_unweighted)
mlf.log_ram_stats(only_memory_used=True)

# End run

In [None]:
# Close out the run: log the total elapsed time (in minutes) since
# `t_start_agg_embed`, then end the active mlflow run.
total_fxn_time = elapsed_time(
    start_time=t_start_agg_embed,
    log_label='Total Agg fxn time',
    verbose=True,
)
total_fxn_minutes_ = total_fxn_time / timedelta(minutes=1)
mlflow.log_metric('time_fxn-full_aggregation_fxn_minutes', total_fxn_minutes_)
mlflow.end_run()

In [None]:
# mlflow.end_run("FAILED")