# Purpose

Use this notebook to test the new GCS data loader, the configs, and the embeddings class that we'll be using in Kubeflow.

For inference (getting embeddings) it might be better to read from GCS than from SQL.

# Imports & Setup

In [1]:
# Load IPython's autoreload extension in mode 2, which re-imports modules before each cell runs,
# so edits to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

from tqdm import tqdm

import subclu
# from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)

# new modules to test:
from google.cloud import bigquery



# Record the Python version and the version of each key library used by this run.
print_lib_versions([bigquery, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
google.cloud.bigquery	v: 2.20.0
numpy		v: 1.18.5
pandas		v: 1.2.5
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.5.0


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Apply matplotlib's 'default' style sheet.
plt.style.use('default')

# Project helpers imported from subclu.utils.eda; presumably they configure the
# logging format and the notebook display options -- TODO confirm against that module.
setup_logging()
notebook_display_config()
# Emit one INFO record to confirm logging works.
# NOTE(review): 'loggging' is misspelled; left unchanged so it matches the saved cell output.
logging.info('loggging ready')

05:39:41 | INFO | "loggging ready"


# Load data and vectorize

In [4]:
# Locations and names consumed by the shell cell(s) that launch the vectorize script.
# NOTE(review): this is an absolute path on one user's machine; change it to run elsewhere.
path_djb_repo = '/home/david.bermejo/repos/subreddit_clustering_i18n/'
# Derived from the repo root (instead of repeating the literal) so the two paths cannot drift apart.
path_djb_models = str(Path(path_djb_repo) / 'subclu' / 'models')
# Dotted module path, executed with `python -m`.
file_vectorize_py = 'subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf'

# Config name passed to the script through `--config-name`.
config_vectorize = 'vectorize_subreddits_test_local'

# Echo the values so the saved notebook records what the run used.
print(path_djb_repo)
print(file_vectorize_py)
print(config_vectorize)

/home/david.bermejo/repos/subreddit_clustering_i18n/
subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf
vectorize_subreddits_test_local


## All files as single DF

In [9]:
# run on sample data, test experiment

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize \
    process_individual_files=false

CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'output_folder', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'get_embeddings_verbose', 'cols_index'])
`2022-04-14 05:55:50,674` | `INFO` | `Start vectorize function`
`2022-04-14 05:55:50,674` | `INFO` | `Loading model: use_multilingual_3`
`2022-04-14 05:55:52,276` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
`2022-04-14 05:55:56,426` | `INFO` | `Model loaded`
`2022-04-14 05:55:56,426` | `INFO` | `Loading text...`
`2022-04-14 05:55:57,732` | `INFO` | `  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text`
`2022-04-14 05:55:57,795` | `INFO` | `  2 <- Files matching prefix`
`

## Files sequentially