# Purpose

Run inference on posts for v0.4.1 Subreddit meta + POSTS.

Difference from before: instead of only using `text` (post title + post body), we'll concatenate multiple text columns and compute the embeddings on the combined text.

---

This notebook runs the `vectorize_text_to_embeddings` function to:
- load the USE-multilingual model
- load post text
- convert the text into embeddings (at the post or comment level)


# Notebook setup

In [73]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project's `subclu` code) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [74]:
from datetime import datetime
import gc
# from functools import partial
# import os
import logging
# from pathlib import Path
# from pprint import pprint

import mlflow

# from tqdm.auto import tqdm
# from tqdm import tqdm
import numpy as np
import pandas as pd

from google.cloud import storage

# TF libraries... I've been getting errors when these aren't loaded
import tensorflow_text
import tensorflow as tf

import subclu
from subclu.utils.hydra_config_loader import LoadHydraConfig
from subclu.models import vectorize_text_tf

from subclu.utils import set_working_directory
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


# Print the version of each key library once (mlflow was previously listed twice,
# which only produced a duplicate line in the version report).
print_lib_versions([mlflow, np, pd, tensorflow_text, tf, subclu])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.18.5
mlflow		v: 1.16.0
pandas		v: 1.2.5
tensorflow_text	v: 2.3.0
tensorflow	v: 2.3.3
subclu		v: 0.4.1


In [75]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's 'default' style sheet for any plots in this notebook
plt.style.use('default')

# Project helpers from subclu.utils.eda — presumably they configure the log
# format seen in the cell outputs and the notebook display options; confirm in subclu.utils.eda.
setup_logging()
notebook_display_config()

# Initialize mlflow logging with sqlite database

In [None]:
# use new class to initialize mlflow
# NOTE(review): MlflowLogger presumably resolves 'sqlite' to the local sqlite
# tracking database and sets it as the active tracking URI — confirm in subclu.utils.mlflow_logger.
mlf = MlflowLogger(tracking_uri='sqlite')
# Display the active tracking URI as the cell output for a quick sanity check
mlflow.get_tracking_uri()

## Get list of experiments with new function

In [None]:
# Show the experiments known to the tracking server; output_format='pandas'
# presumably returns a DataFrame that is rendered as the cell output.
mlf.list_experiment_meta(output_format='pandas')

# Check whether we have access to a GPU

In [56]:
# Ask TensorFlow which physical GPU devices it can see
l_phys_gpus = tf.config.list_physical_devices('GPU')

# Assemble the report line by line; the leading empty entry reproduces the
# blank line printed before the report.
gpu_report_lines = [
    "",
    f"Built with CUDA? {tf.test.is_built_with_cuda()}",
    "GPUs",
    "===",
    f"Num GPUs Available: {len(l_phys_gpus)}",
    "GPU details:",
    f"{l_phys_gpus}",
]
print("\n".join(gpu_report_lines))

# Optional: list every local device (CPU included) for deeper debugging
# from tensorflow.python.client import device_lib
# print(f"\n\nAll devices:\n===\n{device_lib.list_local_devices()}")


Built with CUDA? True
GPUs
===
Num GPUs Available: 1
GPU details:
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Load config with data to process

In [70]:
# Load the Hydra config that describes the v0.4.1 input data
# (bucket name and the folders for subreddit / post / comment text & metadata).
config_data_v041 = LoadHydraConfig(
    config_path="../config/data_text_and_metadata",
    config_name='v0.4.1_2021_12',
)
# Display the loaded config as a dict to verify the paths before running inference
config_data_v041.config_dict

{'dataset_name': 'v0.4.1 inputs - 50k subreddits - Active Subreddits (no Geo) + Geo-relevant users_l28>=100 & posts_l28>=4',
 'bucket_name': 'i18n-subreddit-clustering',
 'folder_subreddits_text_and_meta': 'subreddits/top/2021-12-14',
 'folder_posts_text_and_meta': 'posts/top/2021-12-14',
 'folder_comments_text_and_meta': 'comments/top/2021-12-14',
 'folder_subreddits_text_and_meta_filter': 'subreddits/top/2021-09-24'}

In [71]:
# MLflow experiment names: one for sampled test runs, one for the full inference runs
mlflow_experiment_test = 'v0.4.1_mUSE_inference_test'
mlflow_experiment_full = 'v0.4.1_mUSE_inference'

# Read the data locations out of the loaded config once
data_locations = config_data_v041.config_dict

bucket_name = data_locations['bucket_name']
subreddits_path = data_locations['folder_subreddits_text_and_meta']
posts_path = data_locations['folder_posts_text_and_meta']
subreddits_path_exclude = data_locations['folder_subreddits_text_and_meta_filter']

# Not used by the calls below: they pass `posts_path` as `comments_path` instead
comments_path = None

# Test new batching function

Most inputs will be the same.
However, some things will change:
- Add new parameter to sample only first N files (we'll process each file individually)

# Run full with `lower_case=False`
Let's see if the current refactor is good enough or if I really need to manually batch files...

**Answer**: no, it wasn't good enough -- 60 GB of RAM wasn't enough for 19 million comments _lol_.

```
...
12:02:14 | INFO | "  (19168154, 6) <- updated df_comments shape"
12:02:14 | INFO | "Vectorizing COMMENTS..."
12:02:14 | INFO | "Getting embeddings in batches of size: 2100"
100%
9128/9128 [1:32:26<00:00, 1.97it/s]

<__array_function__ internals> in concatenate(*args, **kwargs)

MemoryError: Unable to allocate 36.6 GiB for an array with shape (512, 19168154) and data type float32
```


In [63]:
# Display the name of the test experiment that the sampled run below logs to
mlflow_experiment_test

'v0.4.1_mUSE_inference_test'

In [81]:
# Mark any run left active by a previous (interrupted) cell as KILLED before starting
mlflow.end_run(status='KILLED')

# Timestamp suffix so each re-run gets a distinct run name
run_timestamp = datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')

# Hack: Rename cols so that I can process posts as a batch of comments
posts_as_comments_cols = dict(
    col_post_id=None,
    col_comment_id='post_id',
    col_text_comment='text',
    col_text_comment_word_count='text_word_count',
    cols_index_comment=['subreddit_name', 'subreddit_id', 'post_id'],
    local_comms_subfolder_relative='df_vect_posts',
    mlflow_comments_folder='df_vect_posts_extra_text',
    cols_comment_text_to_concat=[
        'flair_text',
        'post_url_for_embeddings',
        'text',
        'ocr_inferred_text_agg_clean',
    ],
)

# Test run: sample a couple of files / a subset of rows instead of the full data
sampling_kwargs = dict(
    n_sample_comment_files=2,
    n_sample_comments=80000,
)

vectorize_text_tf.vectorize_text_to_embeddings(
    model_name='use_multilingual',
    run_name=f"posts_as_comments_full_text-{run_timestamp}",
    mlflow_experiment=mlflow_experiment_test,
    tokenize_lowercase=False,
    bucket_name=bucket_name,
    # Posts are fed through the comments code path; the other inputs are disabled
    subreddits_path=None,  # subreddits_path,
    posts_path=None,  # posts_path
    comments_path=posts_path,
    tf_batch_inference_rows=2400,
    tf_limit_first_n_chars=900,
    get_embeddings_verbose=True,
    **posts_as_comments_cols,
    **sampling_kwargs,
)

23:34:02 | INFO | "Start vectorize function"
23:34:02 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-12-20_233402"


new function loaded
new function loaded


23:34:03 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db"
23:34:03 | INFO | "host_name: djb-subclu-inference-tf-2-3-20210630"
23:34:03 | INFO | "  Saving config to local path..."
23:34:03 | INFO | "  Logging config to mlflow with joblib..."
23:34:04 | INFO | "  Logging config to mlflow with YAML..."
23:34:04 | INFO | "Loading model use_multilingual..."
23:34:06 | INFO | "  0:00:02.260601 <- Load TF HUB model time elapsed"
23:34:06 | INFO | "** Procesing Comments files one at a time ***"
23:34:06 | INFO | "-- Loading & vectorizing COMMENTS in files: 2 --
Expected batch size: 2400"
local variable 'df_posts' referenced before assignment"
23:34:07 | INFO | "Processing: posts/top/2021-12-14/000000000000.parquet"
23:34:10 | INFO | "  Sampling COMMENTS down to: 80,000     Samples PER FILE: 40,001"
23:34:10 | INFO | "  (40001, 8) <- df_comments.shape AFTER sampling"
23:34:10 | INFO | "Create merged text

In [82]:
# Close the MLflow run left open by the cell above (recorded with status KILLED),
# then force a garbage-collection pass; the number shown as the cell output is
# gc.collect()'s return value (count of unreachable objects found).
mlflow.end_run(status='KILLED')
gc.collect()

51851

## Re-do with new batching logic
Trying to do all 19 million comments at once broke, sigh, so need to batch one file at a time.

### Re-run comments and log to non-test mlflow experiment


Besides file-batching, this job increased the row-batches from 2,000 to 6,100... unclear if this is having a negative impact. Maybe smaller batches are somehow more efficient?
Now that I'm reading one file at a time, it looks like speed is taking a big hit

Baseline when running it all in memory. It took `1:32:26`, but it ran out of memory (RAM).
The current ETA is around `2 hours`

```
# single file, all in memory (results in OOM)
12:02:14 | INFO | "Vectorizing COMMENTS..."
12:02:14 | INFO | "Getting embeddings in batches of size: 2100"
100%
9128/9128 [1:32:26<00:00, 1.97it/s]


# one file at a time... slower, but we get results one file at a time...
16%
6/37 [21:11<1:49:46, 212.45s/it]
```


In [83]:
mlflow.end_run(status='KILLED')

vectorize_text_tf.vectorize_text_to_embeddings(
    model_name='use_multilingual',
    run_name=f"posts_as_comments_batch_concat_text-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
    mlflow_experiment=mlflow_experiment_full,
    
    tokenize_lowercase=False,
    
    bucket_name=bucket_name,
    subreddits_path=subreddits_path,
    posts_path=None,  # posts_path
    comments_path=posts_path,

    # Hack: Rename cols so that I can process posts as a batch of comments
    col_post_id=None,
    col_comment_id='post_id',
    col_text_comment='text',
    col_text_comment_word_count='text_word_count',
    cols_index_comment=['subreddit_name', 'subreddit_id', 'post_id'],
    local_comms_subfolder_relative='df_vect_posts',
    mlflow_comments_folder='df_vect_posts_extra_text',
    cols_comment_text_to_concat=['flair_text', 'post_url_for_embeddings', 'text', 'ocr_inferred_text_agg_clean'],
    
    tf_batch_inference_rows=2450,
    tf_limit_first_n_chars=900,
    
    n_sample_comment_files=None,
    
#     n_sample_posts=9500,
#     n_sample_comments=19100,
)

23:35:19 | INFO | "Start vectorize function"
23:35:19 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-12-20_233519"


new function loaded
new function loaded


23:35:19 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db"
23:35:19 | INFO | "host_name: djb-subclu-inference-tf-2-3-20210630"
23:35:19 | INFO | "  Saving config to local path..."
23:35:19 | INFO | "  Logging config to mlflow with joblib..."
23:35:20 | INFO | "  Logging config to mlflow with YAML..."
23:35:20 | INFO | "Loading model use_multilingual..."
23:35:22 | INFO | "  0:00:02.309061 <- Load TF HUB model time elapsed"
23:35:22 | INFO | "Load subreddits df..."
23:35:24 | INFO | "  0:00:01.509299 <- df_subs loading time elapsed"
23:35:24 | INFO | "  (49705, 4) <- df_subs shape"
23:35:24 | INFO | "Vectorizing subreddit descriptions..."
23:35:25 | INFO | "Getting embeddings in batches of size: 2450"
23:35:36 | INFO | "progress:  29%|#########7                        | 6/21 [00:11<00:27,  1.86s/it]"
23:35:48 | INFO | "progress:  71%|#######################5         | 15/21 [00:23<00:09,  1.52s/it

In [84]:
# Force a garbage-collection pass after the long-running job; the cell output
# is gc.collect()'s return value (count of unreachable objects found).
gc.collect()

52174

# Run full with `lower_case=True`

This one is expected to be a little slower because it'll call `.str.lower()` on each batch of text.

---

TODO: unsure if it's worth running this job in parallel while I do work on a separate VM... might be a big pain to manually sync the rows from metrics & params happening at the same time in two different VMs.



In [None]:
# Mark any run left active by a previous (interrupted) cell as KILLED before starting
mlflow.end_run(status='KILLED')

# Timestamp suffix so each re-run gets a distinct run name
run_timestamp = datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')

# Hack: Rename cols so that I can process posts as a batch of comments
posts_as_comments_cols = dict(
    col_post_id=None,
    col_comment_id='post_id',
    col_text_comment='text',
    col_text_comment_word_count='text_word_count',
    cols_index_comment=['subreddit_name', 'subreddit_id', 'post_id'],
    local_comms_subfolder_relative='df_vect_posts',
    mlflow_comments_folder='df_vect_posts_extra_text',
    cols_comment_text_to_concat=[
        'flair_text',
        'post_url_for_embeddings',
        'text',
        'ocr_inferred_text_agg_clean',
    ],
)

vectorize_text_tf.vectorize_text_to_embeddings(
    model_name='use_multilingual',
    run_name=f"posts_as_comments_batch_concat_text_lowercase-{run_timestamp}",
    mlflow_experiment=mlflow_experiment_full,
    # Same job as the previous full run, but with lowercasing turned on
    tokenize_lowercase=True,
    bucket_name=bucket_name,
    subreddits_path=subreddits_path,
    posts_path=None,  # posts_path
    comments_path=posts_path,
    tf_batch_inference_rows=2450,
    tf_limit_first_n_chars=900,
    # Full run: no file sampling
    n_sample_comment_files=None,
    # n_sample_posts=9500,
    # n_sample_comments=19100,
    **posts_as_comments_cols,
)

# Debug - TQDM bar not showing up
Not sure why, but the progress bar is no longer showing up in the cell output.


In [47]:
from subclu.utils.tqdm_logger import LogTQDM
from time import sleep

In [48]:
# Debug attempt 1: ASCII bar with a short mininterval and no explicit logger.
# The saved output for this cell is empty, i.e. no progress lines were shown.
for i in LogTQDM(range(20), ascii=True, mininterval=.05):
    sleep(.05)




In [49]:
# Debug attempt 2: LogTQDM with all defaults (no logger passed).
# The saved output for this cell is also empty.
for i in LogTQDM(range(20)):
    sleep(.05)




In [50]:
# Debug attempt 3: pass an explicit logger (plus ascii=True).
# With a logger supplied, progress lines appear in the saved output.
for i in LogTQDM(range(20), ascii=True, logger=logging.getLogger(__name__)):
    sleep(.07)

22:06:01 | INFO | "progress:  60%|##########7       | 12/20 [00:00<00:00, 14.25it/s]"
22:06:01 | INFO | "progress: 100%|##################| 20/20 [00:01<00:00, 14.23it/s]"



In [40]:
# Debug attempt 4: explicit logger only (no ascii flag); progress lines are
# still logged, so the logger argument appears to be what matters.
for i in LogTQDM(range(20), logger=logging.getLogger(__name__)):
    sleep(.07)

22:03:31 | INFO | "progress:  60%|##########7       | 12/20 [00:00<00:00, 14.25it/s]"
22:03:31 | INFO | "progress: 100%|##################| 20/20 [00:01<00:00, 14.23it/s]"



In [41]:
# Debug attempt 5: explicit logger plus bar-layout options (ncols/position/leave)
# to check whether they change what gets logged.
for i in LogTQDM(range(20), logger=logging.getLogger(__name__),
                 ncols=80, position=0, leave=True,
                 ):
    sleep(.07)

22:03:32 | INFO | "progress:  60%|###################8             | 12/20 [00:00<00:00, 14.26it/s]"
22:03:33 | INFO | "progress: 100%|#################################| 20/20 [00:01<00:00, 14.24it/s]"



In [45]:
# Debug attempt 6: same as above but with mininterval=.05 (shorter than the
# .07s sleep); the saved output shows a progress line logged for each iteration.
for i in LogTQDM(range(20), logger=logging.getLogger(__name__),
                 ncols=80, position=0, leave=True,
                 mininterval=.05
                 ):
    sleep(.07)

22:05:14 | INFO | "progress:   5%|#7                                | 1/20 [00:00<00:01, 14.26it/s]"
22:05:14 | INFO | "progress:  10%|###4                              | 2/20 [00:00<00:01, 14.08it/s]"
22:05:14 | INFO | "progress:  15%|#####1                            | 3/20 [00:00<00:01, 14.02it/s]"
22:05:14 | INFO | "progress:  20%|######8                           | 4/20 [00:00<00:01, 14.00it/s]"
22:05:15 | INFO | "progress:  25%|########5                         | 5/20 [00:00<00:01, 13.98it/s]"
22:05:15 | INFO | "progress:  30%|##########2                       | 6/20 [00:00<00:01, 13.97it/s]"
22:05:15 | INFO | "progress:  35%|###########8                      | 7/20 [00:00<00:00, 13.96it/s]"
22:05:15 | INFO | "progress:  40%|#############6                    | 8/20 [00:00<00:00, 13.96it/s]"
22:05:15 | INFO | "progress:  45%|###############3                  | 9/20 [00:00<00:00, 13.95it/s]"
22:05:15 | INFO | "progress:  50%|################5                | 10/20 [00:00<00:00, 13

In [65]:
# `tqdm` is not imported by any other cell (the imports in the top import cell
# are commented out), so import it here to keep this cell runnable on a fresh kernel.
from tqdm import tqdm

# Baseline: plain tqdm bar (no logging wrapper) for comparison with LogTQDM
for i in tqdm(range(20)):
    sleep(.05)

100%|██████████| 20/20 [00:01<00:00, 19.75it/s]


In [66]:
# `tqdm` is not imported by any other cell (the imports in the top import cell
# are commented out), so import it here to keep this cell runnable on a fresh kernel.
from tqdm import tqdm

# Baseline: plain tqdm bar rendered with ASCII characters
for i in tqdm(range(20), ascii=True):
    sleep(.05)

100%|##########| 20/20 [00:01<00:00, 19.79it/s]
