# Purpose

**2022-08-15: v0.6.0**
<br>Test `dask.delayed` to run aggregation of multiple subreddits in parallel.
With the new project we expect to aggregate posts for over 300k subreddits. For most of the process, each subreddit can be processed independently of other subreddits, so it makes sense to try to split up the work so we can speed things up.

**2022-06-29: v0.5.0**
<br>Because we embedded post & text as a single embedding and we didn't use MLflow to create those embeddings, it's easier to  run the embeddings in this notebook rather than to re-use or re-write the old `AggregateEmbeddings` class.

Provenance:
* `v0.4.1 / djb_03.01-2021-12-aggregate_v041_posts_and_comments_pandas.ipynb`

# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from datetime import datetime, timedelta
import gc
import os
import logging
from logging import info
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import dask
from dask import dataframe as dd
from tqdm.auto import tqdm

import mlflow
import hydra

import subclu
from subclu.utils.hydra_config_loader import LoadHydraConfig
from subclu.models.aggregate_embeddings import (
    AggregateEmbeddings, AggregateEmbeddingsConfig,
    load_config_agg_jupyter, get_dask_df_shape,
)
from subclu.models import aggregate_embeddings_pd

from subclu.utils import set_working_directory, get_project_subfolder
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric,
    elapsed_time,
)
from subclu.utils.mlflow_logger import MlflowLogger, save_pd_df_to_parquet_in_chunks
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)

from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_gcs import LoadSubredditsGCS


print_lib_versions([dask, hydra, mlflow, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
dask		v: 2021.06.0
hydra		v: 1.1.0
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.6.0


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Set Local model paths

In [4]:
manual_model_timestamp = datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')
path_this_model = get_project_subfolder(
    f"data/models/aggregate_embeddings/manual_v060_{manual_model_timestamp}"
)
Path.mkdir(path_this_model, parents=True, exist_ok=True)
path_this_model

PosixPath('/home/jupyter/subreddit_clustering_i18n/data/models/aggregate_embeddings/manual_v060_2022-08-16_084151')

# Load config for embeddings aggregation

For v0.6.0 embeddings I didn't use mlflow to track the embeddings inference. We'll need to get them from these folders in GCS:

- [Subreddit metadata](https://console.cloud.google.com/storage/browser/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555)
    - `i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555`
- [Post + Comment Text (already combined)](https://console.cloud.google.com/storage/browser/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925)
    - `i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925`



In [5]:
cfg_agg_embeddings = LoadHydraConfig(
    config_name='aggregate_embeddings_v0.6.0.yaml',
    config_path="../config",
    overrides=[
        f"agg_style=dask_delayed",
    ],
)
print(cfg_agg_embeddings.config_dict.keys())

dict_keys(['data_text_and_metadata', 'data_embeddings_to_aggregate', 'aggregate_params', 'bucket_output', 'mlflow_tracking_uri', 'mlflow_experiment', 'n_sample_subreddits', 'n_sample_posts_files', 'n_sample_comments_files', 'agg_style'])


In [6]:
# cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']

In [7]:
for k_, v_ in cfg_agg_embeddings.config_dict.items():
    if isinstance(v_, dict):
        print(f"{k_}:")
        for k2_, v2_ in v_.items():
            print(f"    {k2_}: {v2_}")
    else:
        print(f"{k_}: {v_}")

data_text_and_metadata:
    dataset_name: v0.6.0 inputs. ~110k seed subreddits, ~340k with 3+ posts, ~700k total subreddits
    bucket_name: i18n-subreddit-clustering
    folder_subreddits_text_and_meta: i18n_topic_model_batch/runs/20220811/subreddits/text
    folder_posts_text_and_meta: i18n_topic_model_batch/runs/20220811/posts
    folder_comments_text_and_meta: i18n_topic_model_batch/runs/20220811/comments
    folder_post_and_comment_text_and_meta: i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all
data_embeddings_to_aggregate:
    bucket_embeddings: i18n-subreddit-clustering
    post_and_comments_folder_embeddings: i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all/embedding/2022-08-11_084218
    subreddit_desc_folder_embeddings: i18n_topic_model_batch/runs/20220811/subreddits/text/embedding/2022-08-11_082859
    col_subreddit_id: subreddit_id
aggregate_params:
    min_post_and_comment_text_len: 3
    agg_post_post_and_comment_wei

# Start MLflow & Log base params

In [8]:
mlf = MlflowLogger(tracking_uri=cfg_agg_embeddings.config_dict['mlflow_tracking_uri'])

In [9]:
mlflow_experiment = cfg_agg_embeddings.config_dict['mlflow_experiment']
# 'v0.6.0_mUSE_aggregates', 'v0.6.0_mUSE_aggregates_test'


t_start_agg_embed = datetime.utcnow()
info(f"== Start run_aggregation() method ==")


info(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
mlf.set_experiment(mlflow_experiment)
mlflow.start_run()
mlf.add_git_hash_to_active_run()
mlf.set_tag_hostname(key='host_name')
mlf.log_param_hostname(key='host_name')
mlf.log_cpu_count()
mlf.log_ram_stats(param=True, only_memory_used=False)

08:41:52 | INFO | "== Start run_aggregation() method =="
08:41:52 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db"
08:41:53 | INFO | "host_name: djb-100-2021-04-28-djb-eda-german-subs"
08:41:53 | INFO | "cpu_count: 96"
08:41:53 | INFO | "RAM stats:
{'memory_used_percent': '5.98%', 'memory_total': '1,444,961', 'memory_used': '86,467', 'memory_free': '1,197,814'}"


{'memory_total': 1444961,
 'memory_used_percent': 0.05984036939405285,
 'memory_used': 86467,
 'memory_free': 1197814}

In [10]:
# set weights
# Normalize them by dividing by 100
WEIGHT_POST_COMMENT = (
    cfg_agg_embeddings.config_dict['aggregate_params']['agg_post_post_and_comment_weight'] / 100
)
WEIGHT_SUB_META = (
    cfg_agg_embeddings.config_dict['aggregate_params']['agg_post_subreddit_desc_weight'] / 100
)
print(WEIGHT_POST_COMMENT + WEIGHT_SUB_META)
assert(1.0 == WEIGHT_POST_COMMENT + WEIGHT_SUB_META)


gcs_sub_embeddings = cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['subreddit_desc_folder_embeddings']
print(gcs_sub_embeddings)
gcs_post_comment_embeddings = cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['post_and_comments_folder_embeddings']
print(gcs_post_comment_embeddings)

mlflow.log_params(
    {
        'embeddings_bucket': cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings'],
        'embeddings_subreddit_path': gcs_sub_embeddings,
        'embeddings_post_and_comments_path': gcs_post_comment_embeddings,
        'weight_post_and_comments': WEIGHT_POST_COMMENT,
        'weight_subreddit_meta': WEIGHT_SUB_META,
    }
)
for k_, v_ in cfg_agg_embeddings.config_dict.items():
    if isinstance(v_, str):
        try:
            mlflow.log_param(k_, v_)
        except Exception as e:
            print(e)

1.0
i18n_topic_model_batch/runs/20220811/subreddits/text/embedding/2022-08-11_082859
i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all/embedding/2022-08-11_084218


# Load data

In [11]:
%%time
t_start_data_load_ = datetime.utcnow()

subs_v = LoadSubredditsGCS(
    bucket_name=cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings'],
    gcs_path=gcs_sub_embeddings,
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='subreddit_id',
    df_format='pandas',
    unique_check=True,
    verbose= True,
    
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
)
subs_v.local_cache()

df_v_subs = subs_v.read_as_one_df()
r_subs, c_subs = df_v_subs.shape
mlflow.log_metrics(
    {
        f"df_v_subs-rows": r_subs,
        f"df_v_subs-cols": c_subs,
    }
)
print(f"{r_subs:,.0f} rows, {c_subs:,.0f} cols")

08:41:55 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220811/subreddits/text/embedding/2022-08-11_082859"
08:41:55 | INFO | "  7 <- Files matching prefix"
08:41:55 | INFO | "  7 <- Files to check"
08:41:55 | INFO | "    000000000000-131971_by_514.parquet <- File already exists, not downloading"
08:41:55 | INFO | "    000000000001-198630_by_514.parquet <- File already exists, not downloading"
08:41:55 | INFO | "    000000000002-441159_by_514.parquet <- File already exists, not downloading"
08:41:55 | INFO | "    2022-08-11_08-28-59_vectorize_text.log <- File already exists, not downloading"
08:41:55 | INFO | "  Files already cached: 4"
08:41:55 | INFO | "  Files already downloaded."
08:41:55 | INFO | "  df format: pandas"
08:42:00 | INFO | "  Checking ID uniqueness..."


771,760 rows, 514 cols
CPU times: user 5.38 s, sys: 5.19 s, total: 10.6 s
Wall time: 7.41 s


In [12]:
# gsutil is usually faster than the python library.
remote_bucket_and_key = 'i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all/embedding/2022-08-11_084218'
remote_gs_path = f'gs://{remote_bucket_and_key}'

# Need to remove the last part of the local path otherwise we'll get duplicate subfolders:
#. top/2021-12-14/2021-12-14 instead of top/2021-12-14
local_f = f"/home/jupyter/subreddit_clustering_i18n/data/local_cache/{'/'.join(remote_bucket_and_key.split('/')[:-1])}"
Path(local_f).mkdir(parents=True, exist_ok=True)

# print(f"Remote path:\n  {remote_gs_path}")
# print(f"Local path:\n  {local_f}")

print(
    f"gsutil -m cp -r -n {remote_gs_path} {local_f}"
)
# gsutil -m cp -r -n $remote_gs_path $local_f

gsutil -m cp -r -n gs://i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all/embedding/2022-08-11_084218 /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all/embedding


In [13]:
%%time

pc_v = LoadSubredditsGCS(
    bucket_name=cfg_agg_embeddings.config_dict['data_embeddings_to_aggregate']['bucket_embeddings'],
    gcs_path=gcs_post_comment_embeddings,
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='post_id',
    df_format='pandas',
    unique_check=False,
    verbose= True,
    
    n_sample_files=cfg_agg_embeddings.config_dict['n_sample_posts_files'],  # None,
    n_files_slice_start=None,  # None,
    n_files_slice_end=None,  # None, 
)
pc_v.local_cache()

df_v_pc = pc_v.read_as_one_df()
r_pc, c_pc = df_v_pc.shape
mlflow.log_metrics(
    {
        f"df_v_post_comments-rows": r_pc,
        f"df_v_post_comments-cols": c_pc,
    }
)
print(f"{r_pc:,.0f} rows, {c_pc:,.0f} cols")

t_data_load = elapsed_time(start_time=t_start_data_load_, log_label='Data Loading Time', verbose=True)
mlflow.log_metric('time_fxn-data_loading_time',
                  t_data_load / timedelta(minutes=1)
                  )
mlf.log_ram_stats(only_memory_used=True)

08:42:03 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all/embedding/2022-08-11_084218"
08:42:03 | INFO | "  197 <- Files matching prefix"
08:42:03 | INFO | "  197 <- Files to check"
08:42:03 | INFO | "    000000000000-264431_by_515.parquet <- File already exists, not downloading"
08:42:03 | INFO | "    000000000001-249532_by_515.parquet <- File already exists, not downloading"
08:42:03 | INFO | "    000000000002-308094_by_515.parquet <- File already exists, not downloading"
08:42:03 | INFO | "    000000000003-331082_by_515.parquet <- File already exists, not downloading"
08:42:03 | INFO | "    000000000004-356401_by_515.parquet <- File already exists, not downloading"
08:42:03 | INFO | "    000000000005-331679_by_515.parquet <- File already exists, not downloading"
08:42:03 | INFO | "    000000000006-253861_by_515.parquet <- 

51,906,348 rows, 515 cols


08:45:38 | INFO | "RAM stats:
{'memory_used_percent': '29.48%', 'memory_used': '425,923'}"


CPU times: user 6min 43s, sys: 18min 49s, total: 25min 32s
Wall time: 3min 37s


{'memory_used_percent': 0.2947643569618834, 'memory_used': 425923}

# Set weights & create copy dfs for new weights

In [14]:
l_ix_sub_level = ['subreddit_id', 'subreddit_name']
l_ix_post_level = l_ix_sub_level + ['post_id']

l_embedding_cols = [c for c in df_v_pc if c.startswith('embeddings_')]
print(len(l_embedding_cols))

512


In [15]:
%%time
df_v_pc_weighted = df_v_pc.copy()

df_v_subs_weighted = df_v_subs.copy()

# should be True b/c they're copies
print(np.allclose(df_v_pc_weighted.iloc[:1000,3:515], df_v_pc.iloc[:1000,3:515]))
print(np.allclose(df_v_subs_weighted.iloc[:1000,2:515], df_v_subs.iloc[:1000,2:515]))

# apply weight to all posts & subreddit meta at once (vectorized)
info(f"Initializing weighted SUBS meta")
df_v_subs_weighted[l_embedding_cols] = df_v_subs_weighted[l_embedding_cols] * WEIGHT_SUB_META

info(f"Initializing weighted POSTS embeddings")
df_v_pc_weighted[l_embedding_cols] = df_v_pc_weighted[l_embedding_cols] * WEIGHT_POST_COMMENT

# NOW they shouldn't be equal (Should be False)
print(np.allclose(df_v_pc_weighted.iloc[:1000,3:515], df_v_pc.iloc[:1000,3:515]))
print(np.allclose(df_v_subs_weighted.iloc[:1000,2:515], df_v_subs.iloc[:1000,2:515]))

08:46:27 | INFO | "Initializing weighted SUBS meta"


True
True


08:46:30 | INFO | "Initializing weighted POSTS embeddings"


False
False
CPU times: user 1min 55s, sys: 2min 6s, total: 4min 2s
Wall time: 4min 1s


In [16]:
# counts_describe(df_v_pc[l_ix_post_level])

# Aggregate to Post-Level: Post&Comments + Subreddit Meta

It's better to let pandas handle the interations with `.groupby('subreddit_id')`. Otherwise we have to create masks for each subreddit that can take much longer (17+ hours).

- ETA with masks: +17.6 hours
- ETA with groupby ~2.5 hours

```
# mask:
0%  329/81973 [04:18<17:42:36, 1.28it/s]

# .groupby()
6% 4751/81973 [09:56<2:35:06, 8.30it/s]
```

---

Updates using `dask.delayed`:
By combining .groupby() + `dask.delayed` we can process things ~3x faster:

```
# .groupby() + dask.delayed(....to_numpy())
5%  34121/705963 [26:36<8:15:14, 22.61it/s]

```

```
# .groupby(), no dask delayed
100% 3467/3467 [08:20<00:00, 6.97it/s]
  0:08:21.661816 <- Total Agg fxn time time elapsed


# .groupby() + dask.delayed(....to_numpy())
100% 3467/3467 [02:31<00:00, 23.08it/s]
  Wall time: 2min 38s


# masks with dask.delayed():
#  This is 2x faster than serial processing, but .groupby() + dask.delayed() is much faster
100% 3467/3467 [00:11<00:00, 299.85it/s]
05:44:20 | INFO | "Define new C1 df DAG in dask"
05:44:20 | INFO | "COMPUTE new C1 df START"
05:48:20 | INFO | "COMPUTE new C1 df DONE"
05:48:20 | INFO | "  0:04:11.393036 <- Total Agg fxn time time elapsed"
CPU times: user 4min 33s, sys: 24.3 s, total: 4min 57s
Wall time: 4min 12s


```


In [17]:
%%time
# set style so that we can try output & time in either format
AGG_STYLE = cfg_agg_embeddings.config_dict['agg_style']  # serial v. dask.delayed

info(f"Start C1 - posts + comments + sub descriptions with format: `{AGG_STYLE}`")
t_start_agg_post_c1 = datetime.utcnow()

l_df_c1_weights = list()

if AGG_STYLE == 'serial':
    for s_id, df_ in tqdm(
        df_v_pc_weighted.groupby('subreddit_id'),
        ascii=True, mininterval=5,
    ):
        df_.loc[:, l_embedding_cols] = np.add(
            df_v_subs_weighted[df_v_subs_weighted['subreddit_id'] == s_id][l_embedding_cols].to_numpy(),
            df_[l_embedding_cols]
        )
        l_df_c1_weights.append(df_)
        del df_

    info(f"Create new C1 df")
    df_posts_agg_c1 = pd.concat(l_df_c1_weights, ignore_index=True)

elif AGG_STYLE == 'dask_delayed':
    for s_id, df_ in tqdm(
        df_v_pc_weighted.groupby('subreddit_id'),
        ascii=True, mininterval=5,
    ):
        df_pc_embeddings_ = dask.delayed(np.add)(
            # df_v_subs_weighted[df_v_subs_weighted['subreddit_id'] == s_id][l_embedding_cols].to_numpy(),
            dask.delayed(df_v_subs_weighted[df_v_subs_weighted['subreddit_id'] == s_id][l_embedding_cols].to_numpy()),
            dask.delayed(df_[l_embedding_cols])
        )
        l_df_c1_weights.append(
            dask.delayed(pd.concat)([dask.delayed(df_[l_ix_post_level]), df_pc_embeddings_], ignore_index=False, axis=1)
        )

    info(f"Define new posts C1 df DAG in dask")
    df_posts_agg_c1_delayed = dask.delayed(pd.concat)(l_df_c1_weights, ignore_index=True)

    info(f"COMPUTE new C1 df START")
    df_posts_agg_c1 = df_posts_agg_c1_delayed.compute()
    info(f"COMPUTE new C1 df DONE")
    
else:
    raise NotImplementedError(f'Other agg style not implemented: {AGG_STYLE}')

    
r_, c_ = df_posts_agg_c1.shape
mlflow.log_metrics(
    {
        f"df_posts_agg_c1-rows": r_,
        f"df_posts_agg_c1-cols": c_,
    }
)
print(f"{r_:,.0f} rows, {c_:,.0f} cols")
del r_, c_

t_agg_pc_c1 = elapsed_time(start_time=t_start_agg_post_c1, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-df_posts_agg_c1_no_delay',
                  t_agg_pc_c1 / timedelta(minutes=1)
                  )
info(f"C1 - post level complete")

08:49:40 | INFO | "Start C1 - posts + comments + sub descriptions with format: `dask_delayed`"


  0%|          | 0/705963 [00:00<?, ?it/s]

17:29:25 | INFO | "Define new posts C1 df DAG in dask"
17:29:38 | INFO | "COMPUTE new C1 df START"
17:53:58 | INFO | "COMPUTE new C1 df DONE"
17:53:58 | INFO | "  9:04:17.319290 <- Total Agg fxn time time elapsed"
17:53:58 | INFO | "C1 - post level complete"


51,906,348 rows, 515 cols
CPU times: user 8h 57min 19s, sys: 11min 58s, total: 9h 9min 17s
Wall time: 9h 4min 17s


In [18]:
df_posts_agg_c1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51906348 entries, 0 to 51906347
Columns: 515 entries, subreddit_id to embeddings_511
dtypes: float32(512), object(3)
memory usage: 100.2+ GB


In [19]:
df_posts_agg_c1.iloc[:5, :10]

Unnamed: 0,subreddit_id,subreddit_name,post_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6
0,t5_1001tl,jewel_xo,t3_w6lnkt,-0.011266,0.001246,0.035281,0.040452,-0.065908,0.009004,-0.0009
1,t5_10029e,milkyhentai,t3_wacyh8,-0.039492,0.007736,0.038307,0.045457,-0.027101,0.033553,0.040519
2,t5_1006k8,badwouldyourather,t3_v9i9a9,-0.008159,0.035251,-0.000912,0.036374,0.046088,0.031007,0.003537
3,t5_100806,jojojosiah,t3_v49gii,0.029087,0.004141,0.031094,-0.019099,-0.041052,0.058712,-0.029871
4,t5_100806,jojojosiah,t3_v49tw9,-0.035303,0.040265,0.045831,0.049636,0.06057,-0.014866,0.047486


### Save post-level

In [23]:
d_dfs_to_save = defaultdict(dict)

In [24]:
%%time
d_dfs_to_save['df_posts_agg_c1']['local'] = (
    path_this_model / f"df_posts_agg_c1_{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}"
)

save_pd_df_to_parquet_in_chunks(
    df_posts_agg_c1,
    d_dfs_to_save['df_posts_agg_c1']['local'],
    write_index=False
)

info(f"  Logging df to mlflow...")
mlflow.log_artifacts(d_dfs_to_save['df_posts_agg_c1']['local'], artifact_path='df_posts_agg_c1')

20:59:30 | INFO | "Converting pandas to dask..."
20:59:41 | INFO | "  111,275.8 MB <- Memory usage"
20:59:41 | INFO | "     203	<- target Dask partitions	  550.0 <- target MB partition size"
21:10:14 | INFO | "  Logging df to mlflow..."


CPU times: user 1h 43min 7s, sys: 22min 41s, total: 2h 5min 48s
Wall time: 33min 26s


# Aggregate to Subreddit Level

In [26]:
%%time

# first, figure out how many posts each subreddit has
info(f"Count posts per subreddit...")
c_post_embedding_count = 'posts_for_embeddings_count'


df_posts_for_embedding_count = (
    df_posts_agg_c1
    .groupby(l_ix_sub_level, as_index=False)
    .agg(**{c_post_embedding_count: ('post_id', 'nunique')})
)
# fill subs that have no posts
df_posts_for_embedding_count = pd.concat(
    [
        df_posts_for_embedding_count, 
        df_v_subs[
            ~df_v_subs['subreddit_id'].isin(df_posts_agg_c1['subreddit_id'])
        ][l_ix_sub_level].assign(**{c_post_embedding_count: 0})
    ],
    axis=0
)
mlf.log_ram_stats(only_memory_used=True)

# min_posts >= -> regular mean. If it's less than this, then mix in subreddit_description into average
n_min_posts_for_regular_mean = 3
subreddits_above_n_ = (
    df_posts_for_embedding_count
    [df_posts_for_embedding_count[c_post_embedding_count] >= n_min_posts_for_regular_mean]
    ['subreddit_id']
)
subreddits_below_n_ = set(df_v_subs['subreddit_id']) - set(subreddits_above_n_)
mask_min_posts_for_reg_mean = df_posts_agg_c1['subreddit_id'].isin(subreddits_above_n_)


info(f"SUBREDDIT-LEVEL C1 - posts + comments + sub descriptions")
t_start_agg_subs_c1 = datetime.utcnow()

# 3+ posts: simple mean()
info(f"Mean for subs above threshold: {n_min_posts_for_regular_mean}")
df_subs_agg_c1_Nplus = (
    df_posts_agg_c1[mask_min_posts_for_reg_mean]
    .groupby(l_ix_sub_level, as_index=False)
    .mean()
)

# calculate mean for all other subs: add UNWEIGHTED subreddit_description into averages
info(f"Calculating mean for subs BELOW post threshold...")
df_subs_agg_c1_Nbelow = (
    pd.concat(
        [
            df_posts_agg_c1[~mask_min_posts_for_reg_mean],
            df_v_subs[df_v_subs['subreddit_id'].isin(subreddits_below_n_)]
        ]
    )
    .groupby(l_ix_sub_level, as_index=False)
    .mean()
)
mlf.log_ram_stats(only_memory_used=True)
info(f"Combining all subreddits...")
df_subs_agg_c1 = (
    df_posts_for_embedding_count
    .merge(
        pd.concat([df_subs_agg_c1_Nplus, df_subs_agg_c1_Nbelow]),
        how='outer',
        on=l_ix_sub_level
    )
    .sort_values(by=l_ix_sub_level)
)

# Check for dupes
assert(len(df_subs_agg_c1) == df_subs_agg_c1['subreddit_id'].nunique()), f"Found duplicate subreddit_ids"

r_, c_ = df_subs_agg_c1.shape
mlflow.log_metrics(
    {
        f"df_subs_agg_c1-rows": r_,
        f"df_subs_agg_c1-cols": c_,
    }
)
info(f"{r_:,.0f} rows, {c_:,.0f} cols  <- df_subs_agg_c1.shape (posts + comments + sub description)")
del r_, c_

t_agg_subs_c1 = elapsed_time(start_time=t_start_agg_subs_c1, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-df_subs_agg_c1',
                  t_agg_subs_c1 / timedelta(minutes=1)
                  )
mlf.log_ram_stats(only_memory_used=True)

21:36:17 | INFO | "Count posts per subreddit..."
21:37:40 | INFO | "SUBREDDIT-LEVEL C1 - posts + comments + sub descriptions"
21:37:40 | INFO | "Mean for subs above threshold: 3"
21:53:11 | INFO | "Calculating mean for subs BELOW post threshold..."
21:53:27 | INFO | "Combining all subreddits..."
21:53:35 | INFO | "  0:15:55.600336 <- Total Agg fxn time time elapsed"
21:53:35 | INFO | "  <- df_subs_agg_c1.shape (posts + comments + sub description)"


771,760 rows, 515 cols


21:53:43 | INFO | "RAM stats:
{'memory_used_percent': '59.21%', 'memory_used': '855,520'}"


CPU times: user 6min 27s, sys: 10min 54s, total: 17min 22s
Wall time: 17min 26s


{'memory_used_percent': 0.5920713431019937, 'memory_used': 855520}

In [27]:
df_subs_agg_c1.iloc[-8:, :10]

Unnamed: 0,subreddit_id,subreddit_name,posts_for_embeddings_count,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6
705955,t5_zzsmq,floridamanatheart,1,0.02713,0.00231,-0.000681,0.059026,0.050773,0.016815,0.003628
705956,t5_zzszh,circumcisiongrief,275,-0.011574,0.028891,0.004747,0.004758,-0.015716,0.034211,0.015709
705957,t5_zzw6f,missourisingles,32,0.009504,0.006715,-0.004402,0.019836,-0.045115,0.047648,0.030778
705958,t5_zzw7y,geofssim,1,-0.038012,0.043682,-0.032269,-0.086736,-0.057276,-0.00981,0.032813
705959,t5_zzwrs,hypnosisisbs,2,0.017816,-0.022911,-0.020059,-0.014588,0.004333,0.031112,-0.024723
705960,t5_zzyg0,creepyscarystories,2,0.030785,-0.005454,0.021258,0.017853,0.008426,0.0505,0.039826
705961,t5_zzze9,demonmemes,1,-0.04772,-0.064439,-0.059378,0.002949,-0.015243,0.047832,-0.014066
705962,t5_zzzyw,rachelnicki,5,0.003122,0.038606,0.004465,-0.022144,0.022109,0.029157,-0.011033


In [28]:
df_subs_agg_c1.iloc[10:18, :10]

Unnamed: 0,subreddit_id,subreddit_name,posts_for_embeddings_count,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6
10,t5_100mht,thelongestgameever2,34,-0.004382,-0.019218,0.028417,-0.003276,-0.037487,0.038918,0.001788
11,t5_100mqi,shoelacecult,2,-0.017796,0.003162,0.031859,0.003582,0.002934,0.044953,0.008811
12,t5_100pg0,ru_animemes,1,-0.052363,0.019555,0.006061,-0.080538,-0.020567,0.021408,0.026688
714150,t5_100q7e,alahly,0,-0.002044,0.064038,0.019453,-0.046617,0.039276,0.060532,-0.028324
714836,t5_100r30,comss,0,0.066905,0.038777,-0.000221,0.01271,0.050384,-0.077518,-0.062838
13,t5_100sns,wapt,6,-0.03823,0.004211,0.022296,0.012222,0.009444,-0.042922,-0.044318
14,t5_100tjt,jodi_huisentruit_case,6,0.008531,0.032689,0.033053,0.023155,-0.012744,0.020924,-0.016081
15,t5_100uqq,thethinkingfox,5,0.009767,-0.021833,0.016461,-0.038427,0.026392,0.044391,0.041226


In [29]:
mlf.log_ram_stats(only_memory_used=True)

21:53:53 | INFO | "RAM stats:
{'memory_used_percent': '59.21%', 'memory_used': '855,526'}"


{'memory_used_percent': 0.5920754954631994, 'memory_used': 855526}

### Save Subreddit level

This one we can save as a pandas df, no need to split it into multiple files

In [30]:
%%time
d_dfs_to_save['df_subs_agg_c1']['local'] = (
    path_this_model / f"df_subs_agg_c1-{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}"
)

save_pd_df_to_parquet_in_chunks(
    df_subs_agg_c1,
    d_dfs_to_save['df_subs_agg_c1']['local'],
    write_index=False
)


info(f"  Logging df to mlflow...")
mlflow.log_artifacts(d_dfs_to_save['df_subs_agg_c1']['local'], artifact_path='df_subs_agg_c1')
mlf.log_ram_stats(only_memory_used=True)

21:53:53 | INFO | "Converting pandas to dask..."
21:53:54 | INFO | "  1,618.8 MB <- Memory usage"
21:53:54 | INFO | "       4	<- target Dask partitions	  450.0 <- target MB partition size"
21:54:08 | INFO | "  Logging df to mlflow..."
21:54:56 | INFO | "RAM stats:
{'memory_used_percent': '59.21%', 'memory_used': '855,595'}"


CPU times: user 44.3 s, sys: 18.6 s, total: 1min 2s
Wall time: 1min 2s


{'memory_used_percent': 0.5921232476170637, 'memory_used': 855595}

## 2nd flow for subreddit level -- do not include additional weight from subreddit description
Potentially, we might be skewing the embeddings too much by adding extra weight to subreddit description.

So save embeddings WITHOUT additional weights so that we can compare the two approaches.

We'll still fill subreddits w/o posts with subreddit description.

In [39]:
info(f"SUBREDDIT-LEVEL C1 no extra sub description weight - posts + comments + sub descriptions")
t_start_agg_subs_c1_uw = datetime.utcnow()

# 3+ posts: simple mean()
info(f"Mean for subs above threshold: {n_min_posts_for_regular_mean} (already calculated)")

# calculate mean for all other subs: add UNWEIGHTED subreddit_description into averages
info(f"Calculating mean for subs BELOW post threshold...")
df_subs_agg_c1_Nbelow_uw = (
    df_posts_agg_c1[~mask_min_posts_for_reg_mean]
    .groupby(l_ix_sub_level, as_index=False)
    .mean()
)
# get embeddings for subs w/ zero posts
subs_wo_posts = df_posts_for_embedding_count[df_posts_for_embedding_count[c_post_embedding_count] == 0]['subreddit_id']
info(f"{len(subs_wo_posts):,.0f}")

mlf.log_ram_stats(only_memory_used=True)
info(f"Combining all subreddits...")
df_subs_agg_c1_uw = (
    df_posts_for_embedding_count
    .merge(
        pd.concat(
            [
                df_subs_agg_c1_Nplus, df_subs_agg_c1_Nbelow_uw, 
                df_v_subs[df_v_subs['subreddit_id'].isin(subs_wo_posts)]
            ]
        ),
        how='outer',
        on=l_ix_sub_level
    )
    .sort_values(by=l_ix_sub_level)
)

# Check for dupes
assert(len(df_subs_agg_c1_uw) == df_subs_agg_c1_uw['subreddit_id'].nunique()), f"Found duplicate subreddit_ids"

r_, c_ = df_subs_agg_c1_uw.shape
mlflow.log_metrics(
    {
        f"df_subs_agg_c1_uw-rows": r_,
        f"df_subs_agg_c1_uw-cols": c_,
    }
)
info(f"{r_:,.0f} rows, {c_:,.0f} cols  <- df_subs_agg_c1_uw.shape (posts + comments + sub description)")
del r_, c_

t_agg_subs_c1_uw = elapsed_time(start_time=t_start_agg_subs_c1, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-df_subs_agg_c1_uw',
                  t_agg_subs_c1 / timedelta(minutes=1)
                  )
mlf.log_ram_stats(only_memory_used=True)

22:08:30 | INFO | "SUBREDDIT-LEVEL C1 no extra sub description weight - posts + comments + sub descriptions"
22:08:30 | INFO | "Mean for subs above threshold: 3 (already calculated)"
22:08:30 | INFO | "Calculating mean for subs BELOW post threshold..."
22:08:40 | INFO | "65,797"
22:08:49 | INFO | "RAM stats:
{'memory_used_percent': '53.52%', 'memory_used': '773,338'}"
22:08:49 | INFO | "Combining all subreddits..."
22:08:58 | INFO | "771,760 rows, 515 cols  <- df_subs_agg_c1_uw.shape (posts + comments + sub description)"
22:08:58 | INFO | "  0:31:18.225451 <- Total Agg fxn time time elapsed"
22:09:07 | INFO | "RAM stats:
{'memory_used_percent': '53.62%', 'memory_used': '774,849'}"


{'memory_used_percent': 0.5362421546325472, 'memory_used': 774849}

### Check equality of unweighted v. weighted

In [47]:
# should be True
assert True == np.allclose(
    (
        df_subs_agg_c1_uw
        [df_subs_agg_c1_uw['subreddit_id'].isin(subreddits_above_n_.head(20))]
        .sort_values(by=['subreddit_id'])
        [l_embedding_cols]
    ),
    (
        df_subs_agg_c1
        [df_subs_agg_c1['subreddit_id'].isin(subreddits_above_n_.head(20))]
        .sort_values(by=['subreddit_id'])
        [l_embedding_cols]
    )
)

In [48]:
# should be False
l_sample_subs_below_n_ = list(subreddits_below_n_)[:20]
assert False == np.allclose(
    (
        df_subs_agg_c1_uw
        [df_subs_agg_c1_uw['subreddit_id'].isin(l_sample_subs_below_n_)]
        .sort_values(by=['subreddit_id'])
        [l_embedding_cols]
    ),
    (
        df_subs_agg_c1
        [df_subs_agg_c1['subreddit_id'].isin(l_sample_subs_below_n_)]
        .sort_values(by=['subreddit_id'])
        [l_embedding_cols]
    )
)

### Save Subreddit level

Use dask b/c as we model over 200k subreddits a single file gets too big

In [49]:
%%time
name_sub_agg_unweighted = 'df_subs_agg_c1_unweighted'
d_dfs_to_save[name_sub_agg_unweighted]['local'] = (
    path_this_model / f"{name_sub_agg_unweighted}-{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}"
)

save_pd_df_to_parquet_in_chunks(
    df_subs_agg_c1_uw,
    d_dfs_to_save[name_sub_agg_unweighted]['local'],
    write_index=False
)


info(f"  Logging df to mlflow...")
mlflow.log_artifacts(d_dfs_to_save[name_sub_agg_unweighted]['local'], artifact_path=name_sub_agg_unweighted)
mlf.log_ram_stats(only_memory_used=True)

22:20:07 | INFO | "Converting pandas to dask..."
22:20:07 | INFO | "  1,618.8 MB <- Memory usage"
22:20:07 | INFO | "       4	<- target Dask partitions	  450.0 <- target MB partition size"
22:20:22 | INFO | "  Logging df to mlflow..."
22:21:10 | INFO | "RAM stats:
{'memory_used_percent': '53.62%', 'memory_used': '774,858'}"


CPU times: user 45.7 s, sys: 18 s, total: 1min 3s
Wall time: 1min 2s


{'memory_used_percent': 0.5362483831743555, 'memory_used': 774858}

# End run

In [50]:
# finish logging total time + end mlflow run
total_fxn_time = elapsed_time(start_time=t_start_agg_embed, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-full_aggregation_fxn_minutes',
                  total_fxn_time / timedelta(minutes=1)
                  )
mlflow.end_run()

22:22:33 | INFO | "  13:40:40.488306 <- Total Agg fxn time time elapsed"


In [51]:
# mlflow.end_run("FAILED")