# Purpose

We have a new batch of COMMENTS for v0.4.1 that we need to vectorize.

Diff from before: Instead of trying to process all comments in a single job, split up into 4 batches (slices of files) so that it's easier to recover in case any of the jobs fails.

---

This notebook runs the `vectorize_text_to_embeddings` function to:
- load the USE-multilingual model
- load post & comment text
- convert the text into embeddings (at post or comment level)


# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import gc
# from functools import partial
# import os
import logging
# from pathlib import Path
# from pprint import pprint

import mlflow

# from tqdm.auto import tqdm
from tqdm import tqdm
import numpy as np
import pandas as pd

from google.cloud import storage

# TF libraries... I've been getting errors when these aren't loaded
import tensorflow_text
import tensorflow as tf

import subclu
from subclu.utils.hydra_config_loader import LoadHydraConfig
from subclu.models.vectorize_text import (
    vectorize_text_to_embeddings,
)
from subclu.models import vectorize_text_tf

from subclu.utils import set_working_directory
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


# Record the versions of the key libraries used for this run.
# Each library is listed once (mlflow was previously passed twice, duplicating its line).
print_lib_versions([mlflow, np, pd, tensorflow_text, tf, subclu])

python		v 3.7.10
===
mlflow		v: 1.16.0
numpy		v: 1.18.5
mlflow		v: 1.16.0
pandas		v: 1.2.5
tensorflow_text	v: 2.3.0
tensorflow	v: 2.3.3
subclu		v: 0.4.1


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet for any plots
plt.style.use('default')

# Project helpers imported from subclu.utils.eda.
# NOTE(review): presumably these configure the log format seen in the cell outputs and
# the pandas/notebook display options — confirm against subclu.utils.eda.
setup_logging()
notebook_display_config()

# Initialize mlflow logging with sqlite database

In [4]:
# use new class to initialize mlflow
# Create the project's mlflow helper, asking for the sqlite tracking backend
mlf = MlflowLogger(tracking_uri='sqlite')
# Display the tracking URI mlflow is currently using, as a sanity check
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db'

## Get list of experiments with new function

In [9]:
# Show the last 10 rows of the experiment listing to find the experiment names to log to
mlf.list_experiment_meta(output_format='pandas').tail(10)

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
15,15,v0.4.0_use_multi_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/15,active
16,16,v0.4.0_use_multi_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/16,active
17,17,v0.4.0_use_multi_clustering_test,gs://i18n-subreddit-clustering/mlflow/mlruns/17,active
18,18,v0.4.0_use_multi_clustering,gs://i18n-subreddit-clustering/mlflow/mlruns/18,active
19,19,v0.4.1_mUSE_inference_test,gs://i18n-subreddit-clustering/mlflow/mlruns/19,active
20,20,v0.4.1_mUSE_inference,gs://i18n-subreddit-clustering/mlflow/mlruns/20,active
21,21,v0.4.1_mUSE_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/21,active
22,22,v0.4.1_mUSE_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/22,active
23,23,v0.4.1_mUSE_clustering_test,gs://i18n-subreddit-clustering/mlflow/mlruns/23,active
24,24,v0.4.1_mUSE_clustering,gs://i18n-subreddit-clustering/mlflow/mlruns/24,active


# Check whether we have access to a GPU

In [6]:
# Ask TensorFlow which physical GPU devices it can see
l_phys_gpus = tf.config.list_physical_devices('GPU')
# from tensorflow.python.client import device_lib

# Assemble the report one line at a time; the leading empty entry
# reproduces the blank line printed before the report.
gpu_report_lines = [
    "",
    f"Built with CUDA? {tf.test.is_built_with_cuda()}",
    "GPUs",
    "===",
    f"Num GPUs Available: {len(l_phys_gpus)}",
    "GPU details:",
    f"{l_phys_gpus}",
    # Optional extra section (kept disabled):
    # "", "All devices:", "===", f"{device_lib.list_local_devices()}",
]
print("\n".join(gpu_report_lines))


Built with CUDA? True
GPUs
===
Num GPUs Available: 1
GPU details:
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Load config with data to process

In [10]:
# Load the config that names the bucket and folders holding the v0.4.1 text & metadata inputs
config_data_v041 = LoadHydraConfig(
    config_path="../config/data_text_and_metadata",
    config_name='v0.4.1_2021_12',
)
# Display the loaded values to confirm which folders will be read
config_data_v041.config_dict

{'dataset_name': 'v0.4.1 inputs - 50k subreddits - Active Subreddits (no Geo) + Geo-relevant users_l28>=100 & posts_l28>=4',
 'bucket_name': 'i18n-subreddit-clustering',
 'folder_subreddits_text_and_meta': 'subreddits/top/2021-12-14',
 'folder_posts_text_and_meta': 'posts/top/2021-12-14',
 'folder_comments_text_and_meta': 'comments/top/2021-12-14',
 'folder_subreddits_text_and_meta_filter': 'subreddits/top/2021-09-24'}

In [11]:
# mlflow experiment names: the "_test" experiment is used for trial runs,
# the other one for the full runs further down
mlflow_experiment_test = 'v0.4.1_mUSE_inference_test'
mlflow_experiment_full = 'v0.4.1_mUSE_inference'

# Add or over-ride configs values
bucket_name = config_data_v041.config_dict['bucket_name']
# Subreddit & post paths are deliberately None: this notebook only vectorizes comments.
# (The original config lookups are kept in the trailing comments for reference.)
subreddits_path = None  # config_data_v041.config_dict['folder_subreddits_text_and_meta']
posts_path = None  # config_data_v041.config_dict['folder_posts_text_and_meta']

comments_path = config_data_v041.config_dict['folder_comments_text_and_meta']

# subreddits_path_exclude = config_data_v041.config_dict['folder_subreddits_text_and_meta_filter']

### list total number of comment files

In [17]:
# comments_full_path = f"gs://{bucket_name}/{comments_path}"
# !gsutil ls $comments_full_path

In [18]:
# check list of files
# Count the objects stored under the comments prefix so the work can be split into slices
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

# Materialize the listing so that it can be counted
comment_blobs_iter = bucket.list_blobs(prefix=comments_path)
l_comment_files_to_process = [blob for blob in comment_blobs_iter]

total_comms_file_count = len(l_comment_files_to_process)
print(total_comms_file_count)

73


In [20]:
import math

# Number of groups of files; each group is later run as its own vectorization job.
# This is the single place to change the number of jobs (it was previously also
# hard-coded as a literal 4 in the two lines that use it).
n_subsets = 4

# Files per group, rounded up so that every file falls into some group
slice_base = math.ceil(total_comms_file_count / n_subsets)

# (start, end) file-index pairs, one per group.
# The last `end` can exceed the number of files because of the rounding up.
# NOTE(review): this assumes the consumer treats them like Python slice bounds — confirm.
slice_indeces = [
    (i_subset * slice_base, (i_subset + 1) * slice_base)
    for i_subset in range(n_subsets)
]

slice_indeces

[(0, 19), (19, 38), (38, 57), (57, 76)]

# Test with batching function


### New function (with batching)
Most inputs will be the same.
However, some things will change:
- Added new parameter to sample only first N files (we'll process each file individually)

For subreddit only, we can expand to more than 1,500 characters.

HOWEVER - when scoring posts &/or comments, we're better off trimming to first ~1,000 characters to speed things up. We can increase the character len if results aren't great... this could be a hyperparameter to tune.

### Notes on batch & characters:

Comments tend to be shorter, so we can usually run larger batches. A batch of `6,000` can still result in `OOM` errors, so go lower than that.
```python
    # TF batches
    tf_batch_inference_rows=5000,
    tf_limit_first_n_chars=900,
```

Posts tend to be longer, so we're better off running smaller batches:
```python
    # TF batches
    tf_batch_inference_rows=2400,
    tf_limit_first_n_chars=900,
```


In [14]:
# Smoke test of the file-batching code path on a small sample, logged to the TEST experiment.
# First close any mlflow run left open by an earlier (possibly interrupted) cell.
mlflow.end_run(status='KILLED')

vectorize_text_tf.vectorize_text_to_embeddings(
    model_name='use_multilingual',
    # Timestamp suffix (UTC) keeps run names unique across re-runs.
    # NOTE(review): the "posts_as_comments" prefix looks like a leftover from another
    # notebook — this call only passes comments_path. Confirm before relying on the name.
    run_name=f"posts_as_comments_full_text-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
    mlflow_experiment=mlflow_experiment_test,
    
    tokenize_lowercase=False,
    batch_comment_files=True,
    
    bucket_name=bucket_name,
    subreddits_path=None,
    posts_path=None,
    comments_path=comments_path,

    # TF batches
    # NOTE(review): the notes above say a batch of 6,000 can still hit OOM; it is used
    # here together with row sampling — confirm this value is intended.
    tf_batch_inference_rows=6000,
    tf_limit_first_n_chars=900,
    
    # Sampling/batching files or rows
    n_sample_comment_files=2,
    n_sample_comments=209100,
    # n_sample_posts=9500,
    get_embeddings_verbose=True,
)

07:06:41 | INFO | "Start vectorize function"
07:06:41 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-12-21_070641"
07:06:41 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db"
07:06:42 | INFO | "host_name: djb-subclu-inference-tf-2-3-20210630"
07:06:42 | INFO | "  Saving config to local path..."
07:06:42 | INFO | "  Logging config to mlflow with joblib..."
07:06:43 | INFO | "  Logging config to mlflow with YAML..."
07:06:43 | INFO | "Loading model use_multilingual..."
07:06:45 | INFO | "  0:00:02.223100 <- Load TF HUB model time elapsed"
07:06:45 | INFO | "** Procesing Comments files one at a time ***"
07:06:45 | INFO | "-- Loading & vectorizing COMMENTS in files: 2 --
Expected batch size: 6000"
local variable 'df_posts' referenced before assignment"
07:06:45 | INFO | "Processing: comments/top/2021-12-14/000000000000.parquet"
07:0

### Test on a slice of files

The previous `n_sample_comment_files` would always sample the first N files, but we didn't check whether file list was sorted.

```
# # TODO(djb): blobs can't be sorted, but I can sort and check the file name, so use that instead.
```
With new refactoring: 
- I sort list to ensure consistency on each run 
- add slice start & end parameters to pick arbitrary files in list

In [25]:
# Smoke test of the slice parameters: process only the files selected by the
# start/end indices below, logged to the TEST experiment.
# First close any mlflow run left open by an earlier (possibly interrupted) cell.
mlflow.end_run(status='KILLED')

vectorize_text_tf.vectorize_text_to_embeddings(
    model_name='use_multilingual',
    # Timestamp suffix (UTC) keeps run names unique across re-runs
    run_name=f"comments_slice-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
    mlflow_experiment=mlflow_experiment_test,
    
    tokenize_lowercase=False,
    batch_comment_files=True,
    
    bucket_name=bucket_name,
    subreddits_path=None,
    posts_path=None,
    comments_path=comments_path,

    # TF batches
    tf_batch_inference_rows=6000,
    tf_limit_first_n_chars=900,

    # slicing FILES 
    # The recorded log for this cell shows files ...0002 and ...0003 being processed
    n_comment_files_slice_start=2,
    n_comment_files_slice_end=4,

    # Sampling FILES 
    #   NOTE: DON'T USE n_sample_*_files and slices! they may create unexpected results!
    # n_sample_comment_files=2, 
     
    # Sampling rows
    n_sample_comments=89100,
    # n_sample_posts=9500,
    get_embeddings_verbose=True,
)

07:42:09 | INFO | "Start vectorize function"
07:42:09 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-12-21_074209"
07:42:10 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db"
07:42:10 | INFO | "host_name: djb-subclu-inference-tf-2-3-20210630"
07:42:10 | INFO | "  Saving config to local path..."
07:42:10 | INFO | "  Logging config to mlflow with joblib..."
07:42:11 | INFO | "  Logging config to mlflow with YAML..."
07:42:11 | INFO | "Loading model use_multilingual..."
07:42:13 | INFO | "  0:00:02.271232 <- Load TF HUB model time elapsed"
07:42:13 | INFO | "** Procesing Comments files one at a time ***"
07:42:13 | INFO | "-- Loading & vectorizing COMMENTS in files: 2 --
Expected batch size: 6000"
local variable 'df_posts' referenced before assignment"




07:42:14 | INFO | "Processing: comments/top/2021-12-14/000000000002.parquet"
07:42:17 | INFO | "  Sampling COMMENTS down to: 39,100     Samples PER FILE: 19,551"
07:42:17 | INFO | "  (19551, 6) <- df_comments.shape AFTER sampling"
07:42:17 | INFO | "cols_index: ['subreddit_name', 'subreddit_id', 'post_id', 'comment_id']"
07:42:17 | INFO | "col_text: comment_body_text"
07:42:17 | INFO | "lowercase_text: False"
07:42:17 | INFO | "limit_first_n_chars: 900"
07:42:17 | INFO | "limit_first_n_chars_retry: 600"
07:42:17 | INFO | "Getting embeddings in batches of size: 6000"




07:42:22 | INFO | "progress: 100%|###################################| 4/4 [00:04<00:00,  1.25s/it]"

07:42:22 | INFO | "  Saving to local: df_vect_comments/000000000002 | 19,551 Rows by 516 Cols"
07:42:24 | INFO | "Processing: comments/top/2021-12-14/000000000003.parquet"
07:42:26 | INFO | "  Sampling COMMENTS down to: 39,100     Samples PER FILE: 19,551"
07:42:26 | INFO | "  (19551, 6) <- df_comments.shape AFTER sampling"
07:42:26 | INFO | "cols_index: ['subreddit_name', 'subreddit_id', 'post_id', 'comment_id']"
07:42:26 | INFO | "col_text: comment_body_text"
07:42:26 | INFO | "lowercase_text: False"
07:42:26 | INFO | "limit_first_n_chars: 900"
07:42:26 | INFO | "limit_first_n_chars_retry: 600"
07:42:27 | INFO | "Getting embeddings in batches of size: 6000"
07:42:31 | INFO | "progress: 100%|###################################| 4/4 [00:04<00:00,  1.01s/it]"

07:42:31 | INFO | "  Saving to local: df_vect_comments/000000000003 | 19,551 Rows by 516 Cols"
07:42:32 | INFO | "progress: 100%

In [26]:
# Make sure no mlflow run is left active after the test above
mlflow.end_run(status='KILLED')

# Run all in slices + using new batching logic

By splitting into 4 jobs (using slices), we decrease the chance of a single failure ruining all of the work, and it should be easier to re-run only the job that failed.

### impact of batch size
by increasing batch size from 3,300 to 3,900 we reduce processing time per file by 15 seconds.

```bash
[07:56:14 | INFO | "Processing: comments/top/2021-12-14/000000000000.parquet"]
07:50:24 | INFO | "Getting embeddings in batches of size: 3300"
07:50:35 | INFO | "progress:   5%|#6                              | 13/245 [00:11<03:22,  1.15it/s]"
07:50:46 | INFO | "progress:  12%|###7                            | 29/245 [00:22<02:43,  1.32it/s]"
07:50:58 | INFO | "progress:  18%|#####8                          | 45/245 [00:34<02:30,  1.33it/s]"
07:51:09 | INFO | "progress:  24%|#######8                        | 60/245 [00:45<02:18,  1.34it/s]"
07:51:22 | INFO | "progress:  31%|#########7                      | 75/245 [00:58<02:14,  1.26it/s]"
07:51:34 | INFO | "progress:  36%|###########4                    | 88/245 [01:09<02:07,  1.23it/s]"
07:51:45 | INFO | "progress:  36%|###########4                    | 88/245 [01:21<02:07,  1.23it/s]"
07:51:45 | INFO | "progress:  42%|#############                  | 103/245 [01:21<01:52,  1.26it/s]"
07:51:55 | INFO | "progress:  42%|#############                  | 103/245 [01:31<01:52,  1.26it/s]"
07:51:56 | INFO | "progress:  48%|##############9                | 118/245 [01:32<01:38,  1.28it/s]"
07:52:08 | INFO | "progress:  54%|################8              | 133/245 [01:44<01:27,  1.28it/s]"
07:52:19 | INFO | "progress:  60%|##################5            | 147/245 [01:55<01:17,  1.27it/s]"
07:52:31 | INFO | "progress:  66%|####################3          | 161/245 [02:06<01:07,  1.25it/s]"
07:52:44 | INFO | "progress:  71%|######################1        | 175/245 [02:19<00:58,  1.20it/s]"
07:52:55 | INFO | "progress:  77%|#######################9       | 189/245 [02:31<00:46,  1.20it/s]"
07:53:06 | INFO | "progress:  77%|#######################9       | 189/245 [02:41<00:46,  1.20it/s]"
07:53:06 | INFO | "progress:  82%|#########################5     | 202/245 [02:42<00:36,  1.19it/s]"
07:53:18 | INFO | "progress:  88%|###########################3   | 216/245 [02:54<00:24,  1.19it/s]"
07:53:30 | INFO | "progress:  94%|#############################1 | 230/245 [03:05<00:12,  1.20it/s]"
07:53:41 | INFO | "progress: 100%|##############################8| 244/245 [03:16<00:00,  1.21it/s]"
07:53:41 | INFO | "progress: 100%|###############################| 245/245 [03:17<00:00,  1.24it/s]"


# higher batch size = faster
07:56:14 | INFO | "Processing: comments/top/2021-12-14/000000000000.parquet"
07:56:17 | INFO | "Getting embeddings in batches of size: 3900"
07:56:29 | INFO | "progress:   6%|#8                              | 12/207 [00:11<03:04,  1.06it/s]"
07:56:40 | INFO | "progress:  13%|####1                           | 27/207 [00:22<02:29,  1.20it/s]"
07:56:52 | INFO | "progress:  20%|######4                         | 42/207 [00:35<02:15,  1.21it/s]"
07:57:04 | INFO | "progress:  27%|########6                       | 56/207 [00:46<02:04,  1.21it/s]"
07:57:17 | INFO | "progress:  34%|##########8                     | 70/207 [00:59<01:58,  1.16it/s]"
07:57:28 | INFO | "progress:  34%|##########8                     | 70/207 [01:10<01:58,  1.16it/s]"
07:57:28 | INFO | "progress:  40%|############8                   | 83/207 [01:11<01:47,  1.15it/s]"
07:57:40 | INFO | "progress:  47%|##############9                 | 97/207 [01:22<01:34,  1.17it/s]"
07:57:52 | INFO | "progress:  54%|################6              | 111/207 [01:34<01:22,  1.17it/s]"
07:58:04 | INFO | "progress:  60%|##################5            | 124/207 [01:46<01:11,  1.16it/s]"
07:58:15 | INFO | "progress:  66%|####################5          | 137/207 [01:58<01:01,  1.14it/s]"
07:58:28 | INFO | "progress:  66%|####################5          | 137/207 [02:11<01:01,  1.14it/s]"
07:58:28 | INFO | "progress:  72%|######################4        | 150/207 [02:11<00:52,  1.10it/s]"
07:58:38 | INFO | "progress:  72%|######################4        | 150/207 [02:21<00:52,  1.10it/s]"
07:58:40 | INFO | "progress:  78%|########################2      | 162/207 [02:22<00:41,  1.08it/s]"
07:58:52 | INFO | "progress:  85%|##########################2    | 175/207 [02:34<00:29,  1.09it/s]"
07:59:04 | INFO | "progress:  91%|############################1  | 188/207 [02:46<00:17,  1.08it/s]"
07:59:15 | INFO | "progress:  97%|##############################1| 201/207 [02:58<00:05,  1.09it/s]"
07:59:21 | INFO | "progress: 100%|###############################| 207/207 [03:03<00:00,  1.13it/s]"
```

In [30]:
# Re-display the (start, end) file slices computed earlier
slice_indeces

[(0, 19), (19, 38), (38, 57), (57, 76)]

In [35]:
# Preview the slices that remain when the first one is skipped
slice_indeces[1:]

[(19, 38), (38, 57), (57, 76)]

In [37]:
# Look-up from each (start, end) slice tuple to its position in the list;
# the position is used to label the mlflow run of each slice.
d_slice_nums = dict(zip(slice_indeces, range(len(slice_indeces))))
d_slice_nums

{(0, 19): 0, (19, 38): 1, (38, 57): 2, (57, 76): 3}

## Re-run comments and log to "full" (non-test) mlflow experiment

batch of 3,800 is too large - 3 of 4 jobs failed, reduce down to 3,200 or lower b/c it's better to take a while and save than keep getting stuck.


In [45]:
# Scratch calculation: 65% of a 3,800-row batch, truncated to an integer
int(3800 * .65)

2470

In [None]:
# Launch one vectorization job per file slice and log each one to the "full" experiment.
# A failure in one slice must not stop the remaining slices, but it must stay visible:
# the full traceback is logged and the failed slices are collected so that they can be
# re-run individually.
# NOTE(review): the first slice is skipped via [1:], presumably because it already
# completed in an earlier run — confirm before re-running this cell.
l_failed_slices = []

for slice_ in slice_indeces[1:]:
    try:
        # Close any mlflow run left open by an earlier (possibly interrupted) job
        mlflow.end_run(status='KILLED')

        vectorize_text_tf.vectorize_text_to_embeddings(
            model_name='use_multilingual',
            # Slice number + UTC timestamp keep the run names unique and traceable
            run_name=f"comments_slice_{d_slice_nums[slice_]}-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
            mlflow_experiment=mlflow_experiment_full,

            tokenize_lowercase=False,
            batch_comment_files=True,

            bucket_name=bucket_name,
            subreddits_path=None,
            posts_path=None,
            comments_path=comments_path,

            # TF batches
            # NOTE(review): the notes above this cell suggest 3,200 or lower after the
            # failures at 3,800 — confirm that 3600 is the intended value.
            tf_batch_inference_rows=3600,
            tf_limit_first_n_chars=900,

            # slicing FILES
            n_comment_files_slice_start=slice_[0],
            n_comment_files_slice_end=slice_[1],
        )
    except Exception:
        # logging.exception records the message AND the traceback (print(e) lost it)
        logging.exception(f"Vectorizing comment files in slice {slice_} failed")
        l_failed_slices.append(slice_)
        # Close the interrupted run as FAILED so it is not left open (no-op if no run is active)
        mlflow.end_run(status='FAILED')

if l_failed_slices:
    logging.error(f"Slices that failed and need to be re-run: {l_failed_slices}")

10:08:27 | INFO | "Start vectorize function"
10:08:27 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-12-21_100827"
10:08:28 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db"
10:08:28 | INFO | "host_name: djb-subclu-inference-tf-2-3-20210630"
10:08:28 | INFO | "  Saving config to local path..."
10:08:28 | INFO | "  Logging config to mlflow with joblib..."
10:08:29 | INFO | "  Logging config to mlflow with YAML..."
10:08:29 | INFO | "Loading model use_multilingual..."




10:08:31 | INFO | "  0:00:02.250651 <- Load TF HUB model time elapsed"
10:08:31 | INFO | "** Procesing Comments files one at a time ***"
10:08:31 | INFO | "-- Loading & vectorizing COMMENTS in files: 19 --
Expected batch size: 3600"
local variable 'df_posts' referenced before assignment"




10:08:32 | INFO | "Processing: comments/top/2021-12-14/000000000019.parquet"
10:08:36 | INFO | "Getting embeddings in batches of size: 3600"




10:08:47 | INFO | "progress:   5%|#7                              | 12/221 [00:11<03:14,  1.07it/s]"
10:08:59 | INFO | "progress:  11%|###6                            | 25/221 [00:22<02:57,  1.11it/s]"
10:09:11 | INFO | "progress:  17%|#####5                          | 38/221 [00:34<02:48,  1.09it/s]"
10:09:22 | INFO | "progress:  23%|#######3                        | 51/221 [00:46<02:33,  1.11it/s]"
10:09:33 | INFO | "progress:  23%|#######3                        | 51/221 [00:57<02:33,  1.11it/s]"
10:09:34 | INFO | "progress:  29%|#########4                      | 65/221 [00:57<02:15,  1.15it/s]"
10:09:45 | INFO | "progress:  36%|###########4                    | 79/221 [01:08<01:59,  1.18it/s]"
10:09:58 | INFO | "progress:  42%|#############4                  | 93/221 [01:21<01:50,  1.16it/s]"
10:10:09 | INFO | "progress:  48%|###############                | 107/221 [01:33<01:37,  1.17it/s]"
10:10:24 | INFO | "progress:  48%|###############                | 107/221 [01:47<01:37,  1

In [None]:
# Force a garbage-collection pass to release unreferenced Python objects between large jobs
gc.collect()

# Run full with `lower_case=True`

This one is expected to be a little slower because it'll call `.str.lower()` on each batch of text.

---

TODO: unsure if it's worth running this job in parallel while I do work on a separate VM... might be a big pain to manually sync the rows from metrics & params happening at the same time in two different VMs.



In [18]:
# Full run over the comments with lower-casing enabled, logged to the "full" experiment.
# First close any mlflow run left open by an earlier (possibly interrupted) cell.
mlflow.end_run(status='KILLED')

vectorize_text_tf.vectorize_text_to_embeddings(
    model_name='use_multilingual',
    # Timestamp suffix (UTC) keeps run names unique across re-runs
    run_name=f"comments_lower_case-{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}",
    mlflow_experiment=mlflow_experiment_full,
    
    tokenize_lowercase=True,
    # NOTE(review): unlike the other calls in this notebook, batch_comment_files is not
    # passed here, so the function's default applies — confirm that is intended.
    
    bucket_name=bucket_name,
    subreddits_path=None,
    posts_path=None,
    comments_path=comments_path,

    # TF batches
    tf_batch_inference_rows=3200,
    tf_limit_first_n_chars=900,
    
    # Sampling FILES
    # (all disabled: no file sampling or slicing arguments are passed in this call)
    # n_sample_comment_files=15,
    # n_comment_files_slice_start=20,
    # n_comment_files_slice_end=62,
)

06:39:08 | INFO | "Start vectorize function"
06:39:08 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-10-05_063908"
06:39:08 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-subclu-inference-tf-2-3-20210630/mlruns.db"
06:39:08 | INFO | "  Saving config to local path..."
06:39:08 | INFO | "  Logging config to mlflow..."
06:39:09 | INFO | "Loading model use_multilingual..."
06:39:11 | INFO | "  0:00:02.147629 <- Load TF HUB model time elapsed"
06:39:11 | INFO | "** Procesing Comments files one at a time ***"
06:39:11 | INFO | "-- Loading & vectorizing COMMENTS in files: 59 --
Expected batch size: 3200"
local variable 'df_posts' referenced before assignment"
  0%|          | 0/59 [00:00<?, ?it/s]06:39:11 | INFO | "Processing: comments/top/2021-10-04/000000000000.parquet"
06:39:15 | INFO | "Getting embeddings in batches of size: 3200"
100%|####################################

UnboundLocalError: local variable 'count_files_processed' referenced before assignment

In [19]:
# The run above ended with an error; make sure its mlflow run is not left active
mlflow.end_run(status='KILLED')

# Appendix

### Notes on previous function (all in memory):
- 60GB of RAM wasn't good enough for 19Million comments _lol_ (also might've run into memory leaks in the GPU)

```
...
12:02:14 | INFO | "  (19168154, 6) <- updated df_comments shape"
12:02:14 | INFO | "Vectorizing COMMENTS..."
12:02:14 | INFO | "Getting embeddings in batches of size: 2100"
100%
9128/9128 [1:32:26<00:00, 1.97it/s]

<__array_function__ internals> in concatenate(*args, **kwargs)

MemoryError: Unable to allocate 36.6 GiB for an array with shape (512, 19168154) and data type float32

```

### New batching fxn
Besides file-batching, this job increased the row-batches from 2,000 to 6,100... unclear if this is having a negative impact. Maybe smaller batches are somehow more efficient?
Now that I'm reading one file at a time, it looks like speed is taking a big hit

Baseline when running it all in memory. It took `1:32:26`, but it ran out of memory (RAM).
The current ETA is around `2 hours`

```
# single file, all in memory (results in OOM)
12:02:14 | INFO | "Vectorizing COMMENTS..."
12:02:14 | INFO | "Getting embeddings in batches of size: 2100"
100%
9128/9128 [1:32:26<00:00, 1.97it/s]


# one file at a time... slower, but we get results one file at a time...
16%
6/37 [21:11<1:49:46, 212.45s/it]
```


Notes on new fxn to batch posts as if they're comments. (Because batching logic is only implemented for comments)

```python
    # Hack: Rename cols so that I can process `posts` as a batch of comments
    bucket_name=bucket_name,
    subreddits_path=None,
    posts_path=None,  # posts_path
    comments_path=comments_path,
    
    col_post_id=None,
    col_comment_id='post_id',
    col_text_comment='text',
    col_text_comment_word_count='text_word_count',
    cols_index_comment=['subreddit_name', 'subreddit_id', 'post_id'],
    local_comms_subfolder_relative='df_vect_posts',
    mlflow_comments_folder='df_vect_posts_extra_text',
    cols_comment_text_to_concat=['flair_text', 'post_url_for_embeddings', 'text', 'ocr_inferred_text_agg_clean'],
```


In [None]:
# NOTE(review): bare name with no definition visible in this notebook. Evaluating it raises
# NameError, which presumably is meant to halt "Run All" before the legacy cells below — confirm.
LEGACY

# Run full with lower_case=False (legacy fse/fasttext)

Time on CPU, only comments + subs:
```
13:29:07 | INFO | "  (1108757, 6) <- df_comments shape"
13:29:08 | INFO | "  (629, 4) <- df_subs shape"

13:45:11 | INFO | "  0:16:21.475036 <- Total vectorize fxn time elapsed"
```

In [18]:
# LEGACY: earlier, all-in-memory version of the job. It calls the function imported from
# subclu.models.vectorize_text (not vectorize_text_tf) and keeps the returned objects.
# First close any mlflow run left open by an earlier (possibly interrupted) cell.
mlflow.end_run(status='KILLED')

model, df_vect, df_vect_comments, df_vect_subs = vectorize_text_to_embeddings(
    model_name='use_multilingual',
    run_name='full_data-lowercase_false',
    mlflow_experiment=mlflow_experiment_full,
    
    tokenize_lowercase=False,
    # Hard-coded paths of an older data pull (2021-06-16); posts are disabled
    subreddits_path='subreddits/de/2021-06-16',
    posts_path=None,  # 'posts/de/2021-06-16',
    comments_path='comments/de/2021-06-16',
    tf_batch_inference_rows=1500,
    tf_limit_first_n_chars=1100,
    # No row sampling: use all posts/comments that get loaded
    n_sample_posts=None,
    n_sample_comments=None,
)

13:28:50 | INFO | "Start vectorize function"
13:28:50 | INFO | "  Local model saving directory: /home/jupyter/subreddit_clustering_i18n/data/models/use_multilingual/2021-07-01_1328"
13:28:50 | INFO | "Load comments df..."
13:29:07 | INFO | "  (1108757, 6) <- df_comments shape"
13:29:07 | INFO | "Keep only comments that match posts IDs in df_posts..."
13:29:07 | INFO | "df_posts missing, so we can't filter comments..."
13:29:07 | INFO | "Load subreddits df..."
13:29:08 | INFO | "  (629, 4) <- df_subs shape"
13:29:08 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/mlflow/mlruns.db"
13:29:09 | INFO | "Loading model use_multilingual...
  with kwargs: None"




13:29:11 | INFO | "  0:00:02.282361 <- Load TF HUB model time elapsed"
13:29:11 | INFO | "Vectorizing subreddit descriptions..."




13:29:13 | INFO | "  Saving to local... df_vect_subreddits_description..."
13:29:13 | INFO | "  Logging to mlflow..."




13:29:14 | INFO | "Vectorizing COMMENTS..."
13:29:14 | INFO | "Getting embeddings in batches of size: 1500"


  0%|          | 0/740 [00:00<?, ?it/s]

13:44:30 | INFO | "  Saving to local... df_vect_comments..."
13:44:49 | INFO | "  Logging to mlflow..."
13:45:11 | INFO | "  0:16:21.475036 <- Total vectorize fxn time elapsed"
