# Purpose

Use this notebook to test the new data-loader (GCS), configs, and embeddings class that we'll be using in kubeflow.

For inference (getting embeddings) it might be better to read from GCS than from SQL.

# Imports & Setup

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# `subclu` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

from tqdm import tqdm

import subclu
# from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)

# new modules to test:
from google.cloud import bigquery
from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_gcs import (
    LoadSubredditsGCS
)
from subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf import(
    get_embeddings_as_df
)



# Print the Python and library versions so results can be matched to the
# environment that produced them.
print_lib_versions([bigquery, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
google.cloud.bigquery	v: 2.20.0
numpy		v: 1.18.5
pandas		v: 1.2.5
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.5.0


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's default style so plots do not inherit a previously applied style.
plt.style.use('default')

# Project helpers imported from subclu.utils.eda.
# NOTE(review): what notebook_display_config() changes is not visible from this
# notebook — confirm in subclu.utils.eda.
setup_logging()
notebook_display_config()
# Smoke test: this message should show up in the cell output if logging is configured.
logging.info('loggging ready')

06:54:05 | INFO | "loggging ready"


# Test embeddings function on plain df

To make sure that the function itself is fine.

In [24]:
%%time

# Load the subreddit text files from GCS (files that already exist in the local
# cache are not downloaded again) and read them into a single pandas DataFrame.
subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path='i18n_topic_model_batch/runs/20220412/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_name_title_related_subs_and_clean_descriptions'],
    col_unique_check='subreddit_id',
    df_format='pandas',
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
    unique_check=False,
    verbose=True,
)
subs.local_cache()

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")

# The test prefix held exactly two files when this notebook was written.
# Report the actual count on failure so a changed prefix is easy to diagnose.
n_expected_local_files = 2
assert len(subs.local_files_) == n_expected_local_files, (
    f"Expected {n_expected_local_files} local files, "
    f"found {len(subs.local_files_)}"
)

df_ = subs.read_as_one_df()
print(df_.shape)

08:13:59 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text"
08:13:59 | INFO | "  2 <- Files matching prefix"
08:13:59 | INFO | "  2 <- Files to check"
08:13:59 | INFO | "    000000000000.parquet <- File already exists, not downloading"
08:13:59 | INFO | "    000000000001_test.parquet <- File already exists, not downloading"
08:13:59 | INFO | "  Files already cached: 2"
08:13:59 | INFO | "0:00:01.398938  <- Downloading files elapsed time"
08:13:59 | INFO | "  Files already downloaded."
08:13:59 | INFO | "  df format: pandas"


2 <- Local files
2 <- Local parquet files
(159874, 3)
CPU times: user 374 ms, sys: 310 ms, total: 685 ms
Wall time: 1.91 s


In [6]:
# NOTE(review): this cell is disabled (everything is commented out). The CLI runs
# further down load the model inside the vectorize module instead — confirm
# whether this in-notebook path is still needed.
# # load model
# import tensorflow_hub as hub
# import tensorflow_text
# model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")


In [7]:
# NOTE(review): this cell is disabled. If re-enabled it needs `model` from the
# (also disabled) model-loading cell above and `df_` from the data-loading cell.
# %%time
# # Get embeddings
# df_vect_ = get_embeddings_as_df(
#     model=model,
#     df=df_,
#     col_text='subreddit_name_title_related_subs_and_clean_descriptions',
#     cols_index=['subreddit_id', 'subreddit_name'],
#     verbose_init=True,
#     verbose=True,
# )

# Load data AND Vectorize 

In [10]:
# Where the repo lives on this machine, and the module that the shell cells
# below run with `python -m`.
path_djb_repo = '/home/david.bermejo/repos/subreddit_clustering_i18n/'
path_djb_models = f'{path_djb_repo}subclu/models'
file_vectorize_py = 'subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf'

# Config selected with `--config-name` in the runs below.
config_vectorize = 'vectorize_subreddits_test_local'

# Echo the settings that the shell cells interpolate.
for setting_ in (path_djb_repo, file_vectorize_py, config_vectorize):
    print(setting_)

/home/david.bermejo/repos/subreddit_clustering_i18n/
subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf
vectorize_subreddits_test_local


## All files as single DF

For subreddit-metadata, aim for:
- `limit_first_n_chars` at or below 1,900
- `batch_inference_rows` at or below 1,600


```bash
`lowercase_text: False`
`limit_first_n_chars: 2200`
`limit_first_n_chars_retry: 700`
`Getting embeddings in batches of size: 2000`
0:03:58.241812 <- df_subs vectorizing time elapsed
# A large batch combined with too many characters makes it slow: it often runs out of memory (OOM), so a lot of time is spent retrying.


`limit_first_n_chars: 2000`
`limit_first_n_chars_retry: 700`
`Getting embeddings in batches of size: 1700`
0:02:21.720878 <- df_subs vectorizing time elapsed

`limit_first_n_chars: 1900`
`limit_first_n_chars_retry: 700`
`Getting embeddings in batches of size: 1600`
0:02:04.692077 <- df_subs vectorizing time elapsed
```

In [23]:
# Run on sample data (test experiment): invoke the vectorize module with
# `python -m` from `path_djb_repo`, selecting the test config by name.
# process_individual_files=false -> per this section's heading, all files are
# handled as a single DF.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=false

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'output_folder', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 07:46:59,786` | `INFO` | `Start vectorize function`
`2022-04-14 07:46:59,786` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 07:47:01,328` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 07:47:05,455` | `INFO` | `Model loaded`
`2022-04-14 07:47:05,455` | `INFO` | `Loading text...`
`2022-04-14 07:47:06,737` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 07:47:06,814` | `INFO` | `  

## Files sequentially

### all files

In [30]:
# Run on sample data with process_individual_files=true (files handled
# sequentially, per the section heading), overriding the config's character
# limit and inference batch size on the command line.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=true \
    limit_first_n_chars=900 \
    batch_inference_rows=1600

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 08:42:48,738` | `INFO` | `Start vectorize function`
`2022-04-14 08:42:48,738` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 08:42:50,309` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 08:42:54,761` | `INFO` | `Model loaded`
`2022-04-14 08:42:56,079` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 08:42:56,146` | `INFO` | `  2 <- Files matching prefix`
`2022-04-14 08:42:56,147` | `INFO` | `  2 <-

### only 1 file (sampled)

In [31]:
# Run on sample data, processing files individually.
# n_sample_files=1 limits the run to one sampled file (the log below reports
# "1 <- Files matching prefix").

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=true \
    limit_first_n_chars=500 \
    batch_inference_rows=2600 \
    n_sample_files=1

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 08:51:52,441` | `INFO` | `Start vectorize function`
`2022-04-14 08:51:52,441` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 08:51:54,641` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 08:51:58,922` | `INFO` | `Model loaded`
`2022-04-14 08:52:00,240` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 08:52:00,315` | `INFO` | `  1 <- Files matching prefix`
`2022-04-14 08:52:00,315` | `INFO` | `  1 <-

### only files in slice (first 2)

In [32]:
# Run on sample data, processing files individually.
# Only n_files_slice_end is set here (no slice start); per the section heading
# this is meant to select the first 2 files.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=true \
    limit_first_n_chars=500 \
    batch_inference_rows=2600 \
    n_files_slice_end=2

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 08:53:53,278` | `INFO` | `Start vectorize function`
`2022-04-14 08:53:53,278` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 08:53:54,815` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 08:53:59,055` | `INFO` | `Model loaded`
`2022-04-14 08:54:00,323` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 08:54:00,394` | `INFO` | `  5 <- Files matching prefix`
`2022-04-14 08:54:00,395` | `INFO` | `  5 <-

In [33]:
# Run on sample data, processing files individually.
# Same slice as the previous run, but with the start given explicitly
# (n_files_slice_start=0, n_files_slice_end=2).

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=true \
    limit_first_n_chars=500 \
    batch_inference_rows=2600 \
    n_files_slice_start=0 \
    n_files_slice_end=2

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 08:59:16,861` | `INFO` | `Start vectorize function`
`2022-04-14 08:59:16,861` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 08:59:18,414` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 08:59:22,773` | `INFO` | `Model loaded`
`2022-04-14 08:59:24,047` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 08:59:24,121` | `INFO` | `  7 <- Files matching prefix`
`2022-04-14 08:59:24,121` | `INFO` | `  7 <-

### Only last file

In [34]:
# Run on sample data, processing files individually.
# Only n_files_slice_start=1 is set (no slice end).
# NOTE(review): presumably this skips the first file and keeps the rest —
# confirm against the data loader.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=true \
    limit_first_n_chars=500 \
    batch_inference_rows=2600 \
    n_files_slice_start=1

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 09:02:53,998` | `INFO` | `Start vectorize function`
`2022-04-14 09:02:53,998` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 09:02:55,638` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 09:02:59,934` | `INFO` | `Model loaded`
`2022-04-14 09:03:01,207` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 09:03:01,277` | `INFO` | `  9 <- Files matching prefix`
`2022-04-14 09:03:01,277` | `INFO` | `  8 <-