# Purpose

Use this notebook to test the new data-loader (GCS), configs, and embeddings class that we'll be using in kubeflow.

For inference (getting embeddings) it might be better to read from GCS than from SQL.

# Imports & Setup

In [1]:
# Enable IPython's autoreload extension in mode 2: re-import all modules before
#  running each cell, so edits to local packages are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

from tqdm import tqdm

import subclu
# from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)

# new modules to test:
from google.cloud import bigquery
from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_gcs import (
    LoadSubredditsGCS
)
from subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf import(
    get_embeddings_as_df,
    upload_folder_to_gcs,
)



# Record the versions of the key libraries used in this notebook (for reproducibility)
print_lib_versions([bigquery, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
google.cloud.bigquery	v: 2.20.0
numpy		v: 1.18.5
pandas		v: 1.2.5
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.5.0


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's 'default' style sheet for plots
plt.style.use('default')

# Set up logging & notebook display options with the project helpers (subclu.utils.eda),
#  then emit a message to confirm that logging output is visible in the notebook
setup_logging()
notebook_display_config()
logging.info('loggging ready')

17:30:03 | INFO | "loggging ready"


# Auth note
This notebook assumes you have authenticated using the gcloud CLI. Example:<br>
```bash
gcloud auth application-default login
```

# Test embeddings function on plain df

To make sure that the function itself is fine.

#### Update: 2022-06-02
The name for the meta column has changed to: `subreddit_meta_for_embeddings`

In [16]:
%%time
RUN_DATE = '20220602'

subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path=f'i18n_topic_model_batch/runs/{RUN_DATE}/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_meta_for_embeddings', 'subreddit_meta_for_embeddings_len'],
    col_unique_check='subreddit_id',
    df_format='pandas',
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
    unique_check=False,
    verbose= True,
)
subs.local_cache()

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")

df_ = subs.read_as_one_df()
print(df_.shape)

assert 1 == len(subs.local_files_)

18:07:30 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220602/subreddits/text"
18:07:30 | INFO | "  1 <- Files matching prefix"
18:07:30 | INFO | "  1 <- Files to check"
18:07:30 | INFO | "    000000000000.parquet <- File already exists, not downloading"
18:07:30 | INFO | "  Files already cached: 1"
18:07:30 | INFO | "  Files already downloaded."
18:07:30 | INFO | "  df format: pandas"


1 <- Local files
1 <- Local parquet files
(84480, 4)
CPU times: user 168 ms, sys: 173 ms, total: 341 ms
Wall time: 1.62 s


In [5]:
# # load model
# import tensorflow_hub as hub
# import tensorflow_text
# model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

In [6]:
# %%time
# # Get embeddings
# df_vect_ = get_embeddings_as_df(
#     model=model,
#     df=df_,
#     col_text='subreddit_name_title_related_subs_and_clean_descriptions',
#     cols_index=['subreddit_id', 'subreddit_name'],
#     verbose_init=True,
#     verbose=True,
# )

# Test upload folder to GCS

We need this if we're not using mlflow to track a model's output

In [11]:
%%time

upload_folder_to_gcs(
    bucket_name='gazette-models-temp',
    gcs_output_root=f'i18n_topic_model_batch/runs/{RUN_DATE}/subreddits/text/TEST_UPLOAD',
    local_dir=f'/home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/{RUN_DATE}/subreddits/text',
    gcs_new_subfolder=None,
    verbose=True,
    dry_run=True,
)

17:42:30 | INFO | "dry_run=True"
17:42:32 | INFO | "Uploading file
  from: /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220602/subreddits/text/000000000000.parquet
  to: gazette-models-temp/i18n_topic_model_batch/runs/20220602/subreddits/text/TEST_UPLOAD/000000000000.parquet"


/home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220602/subreddits/text
CPU times: user 25.3 ms, sys: 17.8 ms, total: 43.1 ms
Wall time: 1.26 s


In [13]:
%%time

upload_folder_to_gcs(
    bucket_name='gazette-models-temp',
    gcs_output_root=f'i18n_topic_model_batch/runs/{RUN_DATE}/subreddits/text/TEST_UPLOAD',
    local_dir=f'/home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/{RUN_DATE}/subreddits/text',
    gcs_new_subfolder='subfolder_test',
    verbose=True,
    dry_run=True,
)

17:43:35 | INFO | "dry_run=True"
17:43:36 | INFO | "Uploading file
  from: /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220602/subreddits/text/000000000000.parquet
  to: gazette-models-temp/i18n_topic_model_batch/runs/20220602/subreddits/text/TEST_UPLOAD/subfolder_test/000000000000.parquet"


/home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220602/subreddits/text
CPU times: user 28.5 ms, sys: 20.4 ms, total: 48.9 ms
Wall time: 1.29 s


# Check meta text len so that we know the length needed to cover ~90% of subreddits

This length will change based on what text we add. For example, adding taxonomy mature topics should increase text length for some subreddits.

In [35]:
# Percentiles of text length to report (used to pick a character limit for embeddings)
l_percentiles_ = [.1, .25, .5, .70, .75, .85, .90, .95, .99]

df_text_len_summary_ = (
    df_['subreddit_meta_for_embeddings_len']
    .describe(percentiles=l_percentiles_)
    .to_frame()
)
style_df_numeric(df_text_len_summary_)

Unnamed: 0,subreddit_meta_for_embeddings_len
count,84480.0
mean,758.48
std,1112.89
min,9.0
10%,71.0
25%,135.0
50%,298.0
70%,681.0
75%,887.0
85%,1524.0


# Load data AND Vectorize 

When we call the vectorizing function, it calls the data loader under the hood.
See the configs in:
- `subclu2/config`

In [15]:
# Location of the repo checkout and the module + config used to run the vectorizer
path_djb_repo = '/home/david.bermejo/repos/subreddit_clustering_i18n/'
path_djb_models = f'{path_djb_repo}subclu/models'
file_vectorize_py = 'subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf'
config_vectorize = 'vectorize_subreddits_test_local'

# Echo the values that get interpolated into the shell commands below
for val_ in (path_djb_repo, file_vectorize_py, config_vectorize):
    print(val_)

/home/david.bermejo/repos/subreddit_clustering_i18n/
subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf
vectorize_subreddits_test_local


## All files as single DF

For subreddit-metadata, aim for:
- `limit_first_n_chars` ~ 2,000 (~90th percentile of text length)
- `batch_inference_rows` below 1,500 (fewer OOM errors => fewer retries)


*NOTE*: in a previous test we had ~140k subreddits in 2 files:
```bash
`lowercase_text: False`
`limit_first_n_chars: 2200`
`limit_first_n_chars_retry: 700`
`Getting embeddings in batches of size: 2000`
0:03:58.241812 <- df_subs vectorizing time elapsed
# large batch & too many characters makes it slow b/c it runs OOM often, so we spend a lot of time retrying.


`limit_first_n_chars: 2000`
`limit_first_n_chars_retry: 700`
`Getting embeddings in batches of size: 1700`
0:02:21.720878 <- df_subs vectorizing time elapsed

`limit_first_n_chars: 1900`
`limit_first_n_chars_retry: 700`
`Getting embeddings in batches of size: 1600`
0:02:04.692077 <- df_subs vectorizing time elapsed
```

```bash
# single file with 84k subreddits
`Saving df_embeddings to: gcs://gazette-models-temp/i18n_topic_model_batch/runs/20220602/subreddits/text/embedding/2022-06-02_183302/df-84480_by_514.parquet`
`limit_first_n_chars: 2000`
`limit_first_n_chars_retry: 700`
`Getting embeddings in batches of size: 1500`
0:01:15.324874 <- Total vectorize fxn time elapsed
```

In [31]:
# run on sample data, test experiment
# Shell escape: cd into `path_djb_repo` and run the vectorizing module as a script (`python -m`).
# The `key=value` arguments after `--config-name` are command-line overrides for that config
#  (the run's log output mentions hydra).
# `process_individual_files=false` corresponds to the "all files as single DF" case in this section.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=false \
    limit_first_n_chars=2000 \
    batch_inference_rows=1500

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'local_model_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
Data Loader kwags:
  columns: ['subreddit_id', 'subreddit_name', 'subreddit_meta_for_embeddings']
  df_format: pandas
  unique_check: False
  verbose: True
  bucket_name: gazette-models-temp
  gcs_path: i18n_topic_model_batch/runs/20220602/subreddits/text
  local_cache_path: /home/jupyter/subreddit_clustering_i18n/data/local_cache/
  n_sample_files: None
  n_files_slice_start: None
  n_files_slice_end: None
`2022-06-02 18:33:02,080` | `INFO` | `Using hydra's path`
`2022-06-02 18:33:02,080` | `INFO` | `  Log file created at: /home/jupyter/subreddit_clustering_i18n/hydr

## Run in bucket owned by i18n
This bucket retains data longer than the gazette temp bucket

In [54]:
# run on sample data, test experiment
# Same invocation as the previous run, with two extra overrides that point both the
#  input text bucket (`data_text.bucket_name`) and the output bucket (`output_bucket`)
#  at "i18n-subreddit-clustering".

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=false \
    limit_first_n_chars=2000 \
    batch_inference_rows=1500 \
    data_text.bucket_name="i18n-subreddit-clustering" \
    output_bucket="i18n-subreddit-clustering"

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'local_model_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
Data Loader kwags:
  columns: ['subreddit_id', 'subreddit_name', 'subreddit_meta_for_embeddings']
  df_format: pandas
  unique_check: False
  verbose: True
  bucket_name: i18n-subreddit-clustering
  gcs_path: i18n_topic_model_batch/runs/20220602/subreddits/text
  local_cache_path: /home/jupyter/subreddit_clustering_i18n/data/local_cache/
  n_sample_files: None
  n_files_slice_start: None
  n_files_slice_end: None
`2022-06-02 19:21:32,090` | `INFO` | `Using hydra's path`
`2022-06-02 19:21:32,090` | `INFO` | `  Log file created at: /home/jupyter/subreddit_clustering_i18

## Rough time projections
Based on the file(s) processed above, here are some rough projections for how long it might take to process all posts needed for the topic model.

In [55]:
# Projections: linearly extrapolate the vectorizing run above to the full set of rows
time_mins = 1.25  # roughly the 0:01:15 elapsed for the single-file run
rows_embedded_ = 84400  # rounded; the loaded df had 84,480 rows
rows_to_embed_ = int(100e6)

# Total time for a single job, assuming time scales linearly with the number of rows
projected_time_mins = time_mins * (rows_to_embed_ / rows_embedded_)

l_estimates_ = list()
for n_parallel_jobs_ in range(1, 9):
    # Assume the work splits evenly across jobs; n_jobs=1 is the single-job baseline
    proj_mins_parallel = projected_time_mins / n_parallel_jobs_
    l_estimates_.append(
        dict(
            n_rows=rows_to_embed_,
            n_jobs=n_parallel_jobs_,
            projected_hours=proj_mins_parallel / 60,
            projected_days=proj_mins_parallel / (60 * 24),
            projected_mins=proj_mins_parallel,
        )
    )

style_df_numeric(
    pd.DataFrame(l_estimates_)
)

Unnamed: 0,n_rows,n_jobs,projected_hours,projected_days,projected_mins
0,100000000,1,24.68,1.03,1481.04
1,100000000,2,12.34,0.51,740.52
2,100000000,3,8.23,0.34,493.68
3,100000000,4,6.17,0.26,370.26
4,100000000,5,4.94,0.21,296.21
5,100000000,6,4.11,0.17,246.84
6,100000000,7,3.53,0.15,211.58
7,100000000,8,3.09,0.13,185.13


## Files sequentially

For these tests, see the previous notebook (02.2). We can't test multiple files with the latest subreddit descriptions/meta because it all fits in a single file now.

### only 1 file (sampled)

In [None]:
# # run on sample data

# !cd $path_djb_repo && python -m $file_vectorize_py \
#     --config-name $config_vectorize \
#     process_individual_files=true \
#     limit_first_n_chars=500 \
#     batch_inference_rows=2600 \
#     n_sample_files=1

### all files

In [None]:
# # run on sample data

# !cd $path_djb_repo && python -m $file_vectorize_py \
#     --config-name $config_vectorize \
#     process_individual_files=true \
#     limit_first_n_chars=900 \
#     batch_inference_rows=1600

### Only files in slice (first 2)

In [None]:
# # run on sample data

# !cd $path_djb_repo && python -m $file_vectorize_py \
#     --config-name $config_vectorize \
#     process_individual_files=true \
#     limit_first_n_chars=500 \
#     batch_inference_rows=2600 \
#     n_files_slice_end=2

In [None]:
# # run on sample data

# !cd $path_djb_repo && python -m $file_vectorize_py \
#     --config-name $config_vectorize \
#     process_individual_files=true \
#     limit_first_n_chars=500 \
#     batch_inference_rows=2600 \
#     n_files_slice_start=0 \
#     n_files_slice_end=2

### Only last file

In [None]:
# # run on sample data

# !cd $path_djb_repo && python -m $file_vectorize_py \
#     --config-name $config_vectorize \
#     process_individual_files=true \
#     limit_first_n_chars=500 \
#     batch_inference_rows=2600 \
#     n_files_slice_start=1