# Purpose

Run Vectorizing for all NEW subreddits in v0.4.1 

---

This notebook runs the `vectorize_text_to_embeddings` function to:
- loading USE-multilingual model
- load post & comment text
- convert the text into embeddings (at post or comment level)


# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import gc
# from functools import partial
# import os
import logging
# from pathlib import Path
# from pprint import pprint

import mlflow

# from tqdm.auto import tqdm
from tqdm import tqdm
import numpy as np
import pandas as pd

from google.cloud import storage

# TF libraries... I've been getting errors when these aren't loaded
import tensorflow_text
import tensorflow as tf

import subclu
from subclu.utils.hydra_config_loader import LoadHydraConfig
from subclu.models.vectorize_text import (
    vectorize_text_to_embeddings,
)
from subclu.models import vectorize_text_tf

from subclu.utils import set_working_directory
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


print_lib_versions([mlflow, np, mlflow, pd, tensorflow_text, tf, subclu])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.18.5
mlflow		v: 1.16.0
pandas		v: 1.2.5
tensorflow_text	v: 2.3.0
tensorflow	v: 2.3.3
subclu		v: 0.4.1


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Initialize mlflow logging with sqlite database

In [6]:
# use new class to initialize mlflow
mlf = MlflowLogger(tracking_uri='sqlite')
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db'

## Get list of experiments with new function

In [7]:
mlf.list_experiment_meta(output_format='pandas')

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
0,0,Default,./mlruns/0,active
1,1,fse_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/1,active
2,2,fse_vectorize_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/2,active
3,3,subreddit_description_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/3,active
4,4,fse_vectorize_v1.1,gs://i18n-subreddit-clustering/mlflow/mlruns/4,active
5,5,use_multilingual_v0.1_test,gs://i18n-subreddit-clustering/mlflow/mlruns/5,active
6,6,use_multilingual_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/6,active
7,7,use_multilingual_v1_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/7,active
8,8,use_multilingual_v1_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/8,active
9,9,v0.3.2_use_multi_inference_test,gs://i18n-subreddit-clustering/mlflow/mlruns/9,active


# Check whether we have access to a GPU

In [8]:
l_phys_gpus = tf.config.list_physical_devices('GPU')
# from tensorflow.python.client import device_lib

print(
    f"\nBuilt with CUDA? {tf.test.is_built_with_cuda()}"
    f"\nGPUs\n==="
    f"\nNum GPUs Available: {len(l_phys_gpus)}"
    f"\nGPU details:\n{l_phys_gpus}"
#     f"\n\nAll devices:\n===\n"
#     f"{device_lib.list_local_devices()}"
)


Built with CUDA? True
GPUs
===
Num GPUs Available: 1
GPU details:
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Load config with data to process

In [39]:
config_data_v041 = LoadHydraConfig(
    config_path="../config/data_text_and_metadata",
    config_name='v0.4.1_2021_12',
#     config_name='top_subreddits_2021_07_16',
)
config_data_v041.config_dict

{'dataset_name': 'v0.4.1 inputs - 50k subreddits - Active Subreddits (no Geo) + Geo-relevant users_l28>=100 & posts_l28>=4',
 'bucket_name': 'i18n-subreddit-clustering',
 'folder_subreddits_text_and_meta': 'subreddits/top/2021-12-14',
 'folder_posts_text_and_meta': 'posts/top/2021-12-14',
 'folder_comments_text_and_meta': 'comments/top/2021-12-14',
 'folder_subreddits_text_and_meta_filter': 'subreddits/top/2021-09-24'}

In [42]:
mlflow_experiment_test = 'v0.4.1_mUSE_inference_test'
mlflow_experiment_full = 'v0.4.1_mUSE_inference'

bucket_name = config_data_v041.config_dict['bucket_name']
subreddits_path = config_data_v041.config_dict['folder_subreddits_text_and_meta']
posts_path = config_data_v041.config_dict['folder_posts_text_and_meta']

subreddits_path_exclude = config_data_v041.config_dict['folder_subreddits_text_and_meta_filter']
# comments_path = None

# Side bar - had to move files 
## from subfolder `2021-12-14` to: `2021-12-24`...
dislexia strikes again

```python
# check list of files
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
l_comment_files_to_process = list(bucket.list_blobs(prefix=subreddits_path))
total_comms_file_count = len(l_comment_files_to_process)
print(total_comms_file_count)

new_folder = "gs://i18n-subreddit-clustering/posts/top/2021-12-14/"
for blob_ in list(bucket.list_blobs(prefix='posts/top/2021-12-24')):
    current_name = f"gs://i18n-subreddit-clustering/{blob_.name}"
    
    `!gsutil mv $current_name   $new_folder`  # remove tick marks, in actual code, only used them to fix display issue
    
# > Copying gs://i18n-subreddit-clustering/posts/top/2021-12-24/000000000001.parquet [Content-Type=application/octet-stream]...
# > Removing gs://i18n-subreddit-clustering/posts/top/2021-12-24/000000000001.parquet...

```

# Run full with `lower_case=False`
Let's see if the current refactor is good enough or if I really need to manually batch files...

**answer**: no it wasn't good enough -- 60GB of RAM wasn't good enough for 19Million comments _lol_.

```
...
12:02:14 | INFO | "  (19168154, 6) <- updated df_comments shape"
12:02:14 | INFO | "Vectorizing COMMENTS..."
12:02:14 | INFO | "Getting embeddings in batches of size: 2100"
100%
9128/9128 [1:32:26<00:00, 1.97it/s]

<__array_function__ internals> in concatenate(*args, **kwargs)

MemoryError: Unable to allocate 36.6 GiB for an array with shape (512, 19168154) and data type float32
```


In [17]:
comments_path = None
print(mlflow_experiment_test)
print(posts_path)

v0.4.1_mUSE_inference_test
posts/top/2021-12-14


In [None]:
mlflow.end_run(status='KILLED')

vectorize_text_tf.vectorize_text_to_embeddings(
    model_name='use_multilingual',
    run_name=f"posts_as_comments_new_fxn{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
    mlflow_experiment=mlflow_experiment_test,
    
    tokenize_lowercase=False,
    
    bucket_name=bucket_name,
    subreddits_path=None,  # subreddits_path,
    posts_path=None,  # posts_path
    comments_path=posts_path,
    subreddits_path_exclude=subreddits_path_exclude,  # New param to exclude embeddings in these subs

    # Hack: Rename cols so that I can process posts as a batch of comments
    col_post_id=None,
    col_comment_id='post_id',
    col_text_comment='text',
    col_text_comment_word_count='text_word_count',
    cols_index_comment=['subreddit_name', 'subreddit_id', 'post_id'],
    local_comms_subfolder_relative='df_vect_posts',
    mlflow_comments_folder='df_vect_posts',
    
    tf_batch_inference_rows=2500,
    tf_limit_first_n_chars=850,
    
    n_sample_comment_files=2,
    batch_comment_files=True,
    
    # try slicing later files?
    
#     n_sample_posts=9500,
#     n_sample_comments=19100,
)

In [45]:
mlflow.end_run(status='KILLED')

## Test - Re-do comments with new batching logic
Trying to do all 19 million comments at once broke, sigh, so need to batch one file at a time.

### Re-run comments and log to non-test mlflow experiment


Besides file-batching, this job increased the row-batches from 2,000 to 6,100... unclear if this is having a negative impact. Maybe smaller batches are somehow more efficient?
Now that I'm reading one file at a time, it looks like speed is taking a big hit

Baseline when running it all in memory. It took `1:32:26`, but it ran out of memory (RAM).
The current ETA is around `2 hours`

```
# singe file, all in memory (results in OOM)
12:02:14 | INFO | "Vectorizing COMMENTS..."
12:02:14 | INFO | "Getting embeddings in batches of size: 2100"
100%
9128/9128 [1:32:26<00:00, 1.97it/s]


# one file at a time... slower, but we get results one file at a time...
16%
6/37 [21:11<1:49:46, 212.45s/it]
```


In [None]:
mlflow.end_run(status='KILLED')

vectorize_text_tf.vectorize_text_to_embeddings(
    model_name='use_multilingual',
    run_name=f"posts_as_comments_batch_fxn-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
    mlflow_experiment=mlflow_experiment_full,
    
    tokenize_lowercase=False,
    subreddits_path_exclude=subreddits_path_exclude,  # New param to exclude embeddings in these subs
    batch_comment_files=True,
    
    bucket_name=bucket_name,
    subreddits_path=subreddits_path,
    posts_path=None,  # posts_path
    comments_path=posts_path,

    # Hack: Rename cols so that I can process posts as a batch of comments
    col_post_id=None,
    col_comment_id='post_id',
    col_text_comment='text',
    col_text_comment_word_count='text_word_count',
    cols_index_comment=['subreddit_name', 'subreddit_id', 'post_id'],
    local_comms_subfolder_relative='df_vect_posts',
    mlflow_comments_folder='df_vect_posts',
    
    tf_batch_inference_rows=2600,
    tf_limit_first_n_chars=850,
    
    n_sample_comment_files=None,
    
#     n_sample_posts=9500,
#     n_sample_comments=19100,
)

20:58:41 | INFO | "Start vectorize function"
20:58:41 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-12-17_205841"
20:58:41 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db"
20:58:44 | INFO | "host_name: djb-subclu-inference-tf-2-3-20210630"
20:58:44 | INFO | "  Saving config to local path..."
20:58:44 | INFO | "  Logging config to mlflow with joblib..."
20:58:44 | INFO | "  Logging config to mlflow with YAML..."
20:58:45 | INFO | "Loading model use_multilingual..."
20:58:47 | INFO | "  0:00:02.292857 <- Load TF HUB model time elapsed"
20:58:47 | INFO | "Load subreddits df..."
20:58:48 | INFO | "  0:00:00.790027 <- df_subs_exclude loading time elapsed"
20:58:48 | INFO | "  (19262, 4) <- df_subs_exclude shape"
20:58:48 | INFO | "Load subreddits df..."
20:58:49 | INFO | "  0:00:01.292345 <- df_subs loading time elapsed"
20:58:49 |

In [None]:
gc.collect()

# Run full with `lower_case=True`

This one is expected to be a little slower because it'll call `.str.lower()` on each batch of text.

---

TODO: unsure if it's worth running this job in parallel while I do work on a separate VM... might be a big pain to manually sync the rows from metrics & params happening at the same time in two different VMs.



In [26]:
mlflow.end_run(status='KILLED')

vectorize_text_tf.vectorize_text_to_embeddings(
    model_name='use_multilingual',
    run_name=f"posts_as_comments_batch_fxn-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
    mlflow_experiment=mlflow_experiment_full,
    
    tokenize_lowercase=True,
    
    bucket_name=bucket_name,
    subreddits_path=subreddits_path,
    posts_path=None,  # posts_path
    comments_path=posts_path,

    # Hack: Rename cols so that I can process posts as a batch of comments
    col_post_id=None,
    col_comment_id='post_id',
    col_text_comment='text',
    col_text_comment_word_count='text_word_count',
    cols_index_comment=['subreddit_name', 'subreddit_id', 'post_id'],
    local_comms_subfolder_relative='df_vect_posts',
    mlflow_comments_folder='df_vect_posts',
    
    tf_batch_inference_rows=2600,
    tf_limit_first_n_chars=850,
    
    n_sample_comment_files=None,
    
#     n_sample_posts=9500,
#     n_sample_comments=19100,
)

10:05:02 | INFO | "Start vectorize function"
10:05:02 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-09-28_100502"
10:05:02 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db"
10:05:03 | INFO | "  Saving config to local path..."
10:05:03 | INFO | "  Logging config to mlflow..."
10:05:03 | INFO | "Loading model use_multilingual..."
10:05:05 | INFO | "  0:00:02.265257 <- Load TF HUB model time elapsed"
10:05:05 | INFO | "Load subreddits df..."
10:05:06 | INFO | "  0:00:00.683829 <- df_subs loading time elapsed"
10:05:06 | INFO | "  (19262, 4) <- df_subs shape"
10:05:06 | INFO | "Vectorizing subreddit descriptions..."
10:05:06 | INFO | "Getting embeddings in batches of size: 2600"
ResourceExhausted, lowering character limit
 OOM when allocating tensor with shape[568066,1280] and type float on /job:localhost/replica:0/task:0/device:GP

In [None]:
LEGACY

# Run full with lower_case=False

Time on CPU, only comments + subs:
```
13:29:07 | INFO | "  (1108757, 6) <- df_comments shape"
13:29:08 | INFO | "  (629, 4) <- df_subs shape"

13:45:11 | INFO | "  0:16:21.475036 <- Total vectorize fxn time elapsed"
```

In [18]:
mlflow.end_run(status='KILLED')

model, df_vect, df_vect_comments, df_vect_subs = vectorize_text_to_embeddings(
    model_name='use_multilingual',
    run_name='full_data-lowercase_false',
    mlflow_experiment=mlflow_experiment_full,
    
    tokenize_lowercase=False,
    subreddits_path='subreddits/de/2021-06-16',
    posts_path=None,  # 'posts/de/2021-06-16',
    comments_path='comments/de/2021-06-16',
    tf_batch_inference_rows=1500,
    tf_limit_first_n_chars=1100,
    n_sample_posts=None,
    n_sample_comments=None,
)

13:28:50 | INFO | "Start vectorize function"
13:28:50 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-07-01_1328"
13:28:50 | INFO | "Load comments df..."
13:29:07 | INFO | "  (1108757, 6) <- df_comments shape"
13:29:07 | INFO | "Keep only comments that match posts IDs in df_posts..."
13:29:07 | INFO | "df_posts missing, so we can't filter comments..."
13:29:07 | INFO | "Load subreddits df..."
13:29:08 | INFO | "  (629, 4) <- df_subs shape"
13:29:08 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
13:29:09 | INFO | "Loading model use_multilingual...
  with kwargs: None"




13:29:11 | INFO | "  0:00:02.282361 <- Load TF HUB model time elapsed"
13:29:11 | INFO | "Vectorizing subreddit descriptions..."




13:29:13 | INFO | "  Saving to local... df_vect_subreddits_description..."
13:29:13 | INFO | "  Logging to mlflow..."




13:29:14 | INFO | "Vectorizing COMMENTS..."
13:29:14 | INFO | "Getting embeddings in batches of size: 1500"


  0%|          | 0/740 [00:00<?, ?it/s]

13:44:30 | INFO | "  Saving to local... df_vect_comments..."
13:44:49 | INFO | "  Logging to mlflow..."
13:45:11 | INFO | "  0:16:21.475036 <- Total vectorize fxn time elapsed"
