# Purpose

Use this notebook to test the new data-loader (GCS), configs, and embeddings class that we'll be using in kubeflow.

For inference (getting embeddings) it might be better to read from GCS than from SQL.

# Imports & Setup

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
from datetime import datetime
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

from tqdm import tqdm

import subclu
# from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)

# new modules to test:
from google.cloud import bigquery
from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_gcs import (
    LoadSubredditsGCS
)
from subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf import(
    get_embeddings_as_df,
    upload_folder_to_gcs,
)



# Print the python version and the version of each key library used in this run
print_lib_versions([bigquery, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
google.cloud.bigquery	v: 2.20.0
numpy		v: 1.18.5
pandas		v: 1.2.5
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.5.0


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's default style for all plots in this notebook
plt.style.use('default')

# Project helpers imported from subclu.utils.eda
# (presumably they configure the log format and notebook display options -- confirm in that module)
setup_logging()
notebook_display_config()
# Emit one message to confirm that logging output shows up in the notebook.
# NOTE(review): 'loggging' is a typo in the message; left unchanged here so it matches the recorded output.
logging.info('loggging ready')

16:56:49 | INFO | "loggging ready"


# Auth note
This notebook assumes you have authenticated using the gcloud CLI. Example:<br>
```bash
gcloud auth application-default login
```

# Test embeddings function on plain df

To make sure that the function itself is fine.

In [4]:
%%time

subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path='i18n_topic_model_batch/runs/20220412/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_name_title_related_subs_and_clean_descriptions'],
    col_unique_check='subreddit_id',
    df_format='pandas',
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
    unique_check=False,
    verbose= True,
)
subs.local_cache()

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")
assert 2 == len(subs.local_files_)

df_ = subs.read_as_one_df()
print(df_.shape)

16:57:29 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text"
16:57:29 | INFO | "  20 <- Files matching prefix"
16:57:29 | INFO | "  20 <- Files to check"
16:57:29 | INFO | "    000000000000.parquet <- File already exists, not downloading"
16:57:29 | INFO | "    000000000001_test.parquet <- File already exists, not downloading"
16:57:29 | INFO | "  Files already cached: 2"
16:57:29 | INFO | "  Files already downloaded."
16:57:29 | INFO | "  df format: pandas"


2 <- Local files
2 <- Local parquet files
(159874, 3)
CPU times: user 401 ms, sys: 255 ms, total: 656 ms
Wall time: 3.16 s


In [5]:
# # load model
# import tensorflow_hub as hub
# import tensorflow_text
# model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")


In [6]:
# %%time
# # Get embeddings
# df_vect_ = get_embeddings_as_df(
#     model=model,
#     df=df_,
#     col_text='subreddit_name_title_related_subs_and_clean_descriptions',
#     cols_index=['subreddit_id', 'subreddit_name'],
#     verbose_init=True,
#     verbose=True,
# )

# Test upload folder to GCS

We need this if we're not using mlflow to track a model's output

In [9]:
%%time

# Dry run (dry_run=True) with gcs_new_subfolder=None:
# the logged destination paths should sit directly under `gcs_output_root`.
upload_folder_to_gcs(
    bucket_name='gazette-models-temp',
    gcs_output_root='i18n_topic_model_batch/runs/20220412/subreddits/text/TEST_UPLOAD',
    local_dir='/home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text',
    gcs_new_subfolder=None,
    verbose=True,
    dry_run=True,
)

17:10:56 | INFO | "dry_run=True"
17:10:57 | INFO | "Uploading file
  from: /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text/000000000000.parquet
  to: gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text/TEST_UPLOAD/000000000000.parquet"
17:10:57 | INFO | "Uploading file
  from: /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text/000000000001_test.parquet
  to: gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text/TEST_UPLOAD/000000000001_test.parquet"


CPU times: user 17.4 ms, sys: 24.1 ms, total: 41.5 ms
Wall time: 1.25 s


In [22]:
%%time

# Dry run (dry_run=True) with gcs_new_subfolder='subfolder_test':
# the logged destination paths should include the subfolder under `gcs_output_root`.
upload_folder_to_gcs(
    bucket_name='gazette-models-temp',
    gcs_output_root='i18n_topic_model_batch/runs/20220412/subreddits/text/TEST_UPLOAD',
    local_dir='/home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text',
    gcs_new_subfolder='subfolder_test',
    verbose=True,
    dry_run=True,
)

17:52:08 | INFO | "dry_run=True"
17:52:10 | INFO | "Uploading file
  from: /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text/000000000000.parquet
  to: gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text/TEST_UPLOAD/subfolder_test/000000000000.parquet"
17:52:10 | INFO | "Uploading file
  from: /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text/000000000001_test.parquet
  to: gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text/TEST_UPLOAD/subfolder_test/000000000001_test.parquet"


/home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text
CPU times: user 24 ms, sys: 16.4 ms, total: 40.4 ms
Wall time: 1.23 s


# Load data AND Vectorize 

In [23]:
# Repo root (the shell cells `cd` into it) and the models folder inside it
path_djb_repo = '/home/david.bermejo/repos/subreddit_clustering_i18n/'
path_djb_models = f"{path_djb_repo}subclu/models"

# Module that gets executed with `python -m`, and the config name passed via --config-name
file_vectorize_py = 'subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf'
config_vectorize = 'vectorize_subreddits_test_local'

# Echo the values used by the shell cells, one per line
print(path_djb_repo, file_vectorize_py, config_vectorize, sep='\n')

/home/david.bermejo/repos/subreddit_clustering_i18n/
subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf
vectorize_subreddits_test_local


## All files as single DF

For subreddit-metadata, aim for:
- `limit_first_n_chars` below 1,900
- `batch_inference_rows` below 1,600


```bash
`lowercase_text: False`
`limit_first_n_chars: 2200`
`limit_first_n_chars_retry: 700`
`Getting embeddings in batches of size: 2000`
0:03:58.241812 <- df_subs vectorizing time elapsed
# A large batch combined with too many characters makes it slow because it often runs out of memory (OOM), so we spend a lot of time retrying.


`limit_first_n_chars: 2000`
`limit_first_n_chars_retry: 700`
`Getting embeddings in batches of size: 1700`
0:02:21.720878 <- df_subs vectorizing time elapsed

`limit_first_n_chars: 1900`
`limit_first_n_chars_retry: 700`
`Getting embeddings in batches of size: 1600`
0:02:04.692077 <- df_subs vectorizing time elapsed
```

In [23]:
# Test experiment on sample data: load all files as a single dataframe
# (process_individual_files=false), overriding the character limit and batch size.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=false \
    limit_first_n_chars=500 \
    batch_inference_rows=2600

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 22:04:55,525` | `INFO` | `Start vectorize function`
`2022-04-14 22:04:55,525` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 22:04:57,101` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 22:05:01,105` | `INFO` | `Model loaded`
`2022-04-14 22:05:01,105` | `INFO` | `Loading text...`
`2022-04-14 22:05:02,357` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 22:05:02,430` | `INFO` | `  19 <- Files match

## Files sequentially

### only 1 file (sampled)

In [27]:
# Run on sample data: process files individually (process_individual_files=true)
# and limit the run to a single sampled file (n_sample_files=1).

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=true \
    limit_first_n_chars=500 \
    batch_inference_rows=2600 \
    n_sample_files=1

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'local_model_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-15 18:02:12,129` | `INFO` | `Using hydra's path`
`2022-04-15 18:02:12,129` | `INFO` | `  Log file created at: /home/jupyter/subreddit_clustering_i18n/hydra_runs/outputs/2022-04-15/18-02-12/logs/2022-04-15_18-02-12_vectorize_text.log`
`2022-04-15 18:02:12,129` | `INFO` | `Start vectorize function`
`2022-04-15 18:02:12,130` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-15 18:02:13,638` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-15 18:02:17,654` | `INFO` | `Model loaded`
`2022-04-15 18:02:18,899` | `INFO` | `  Local folder to down

### all files

In [18]:
# Run on the test data: process files individually (process_individual_files=true).
# No n_sample_files / n_files_slice_* overrides are passed, so file selection is left
# to the config (section intent: all files).

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=true \
    limit_first_n_chars=900 \
    batch_inference_rows=1600

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 21:59:22,813` | `INFO` | `Start vectorize function`
`2022-04-14 21:59:22,813` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 21:59:24,383` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 21:59:28,412` | `INFO` | `Model loaded`
`2022-04-14 21:59:29,716` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 21:59:29,801` | `INFO` | `  11 <- Files matching prefix`
`2022-04-14 21:59:29,801` | `INFO` | `  11 

### Only files in slice (first 2)

In [20]:
# Run on sample data: process files individually and set only the slice end
# (n_files_slice_end=2); per the section heading this should select the first 2 files.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=true \
    limit_first_n_chars=500 \
    batch_inference_rows=2600 \
    n_files_slice_end=2

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 22:01:42,913` | `INFO` | `Start vectorize function`
`2022-04-14 22:01:42,913` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 22:01:44,469` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 22:01:48,603` | `INFO` | `Model loaded`
`2022-04-14 22:01:49,889` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 22:01:49,964` | `INFO` | `  14 <- Files matching prefix`
`2022-04-14 22:01:49,964` | `INFO` | `  14 

In [21]:
# Run on sample data: process files individually with both slice bounds given explicitly
# (n_files_slice_start=0, n_files_slice_end=2); expected to select the first 2 files -- confirm in the logs.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=true \
    limit_first_n_chars=500 \
    batch_inference_rows=2600 \
    n_files_slice_start=0 \
    n_files_slice_end=2

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 22:02:58,149` | `INFO` | `Start vectorize function`
`2022-04-14 22:02:58,149` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 22:02:59,672` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 22:03:03,807` | `INFO` | `Model loaded`
`2022-04-14 22:03:05,083` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 22:03:05,157` | `INFO` | `  16 <- Files matching prefix`
`2022-04-14 22:03:05,158` | `INFO` | `  16 

### Only last file

In [22]:
# Run on sample data: process files individually and set only the slice start
# (n_files_slice_start=1). Per the section heading this should leave only the last file
# (presumably because the test folder holds 2 files -- confirm in the logs).

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=true \
    limit_first_n_chars=500 \
    batch_inference_rows=2600 \
    n_files_slice_start=1

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 22:04:12,714` | `INFO` | `Start vectorize function`
`2022-04-14 22:04:12,714` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 22:04:14,256` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 22:04:18,291` | `INFO` | `Model loaded`
`2022-04-14 22:04:19,670` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 22:04:19,741` | `INFO` | `  18 <- Files matching prefix`
`2022-04-14 22:04:19,741` | `INFO` | `  17 