# Purpose

Use this notebook to vectorize the text of combined Post + Comments.

# Imports & Setup

In [1]:
# Load IPython's autoreload extension and set mode 2: modules are re-imported
# before each cell executes, so edits to local packages (e.g. `subclu`) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm import tqdm

import subclu
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


# Print the Python version and the versions of the key libraries so the
# environment that produced this run is recorded alongside the outputs.
print_lib_versions([np, pd, subclu])

python		v 3.7.10
===
numpy		v: 1.18.5
pandas		v: 1.2.5
subclu		v: 0.6.0


In [5]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet for any plots in this notebook.
plt.style.use('default')

# Project helpers imported from subclu.utils.eda; presumably they configure
# logging and notebook/pandas display options -- TODO confirm in subclu.utils.eda.
setup_logging()
notebook_display_config()
# Emit one INFO record as a smoke test that log output reaches the notebook.
# NOTE(review): the message has a typo ('loggging'); left unchanged so it
# matches the recorded cell output.
logging.info('loggging ready')

08:42:12 | INFO | "loggging ready"


# Auth note
This notebook assumes you have authenticated using the gcloud CLI. Example:<br>
```bash
gcloud auth application-default login
```

# Load data AND Vectorize 

When we call the vectorizing function, it calls the data loader under the hood.
See the configs in:
- `subclu2/config`

In [6]:
# Locations and names used to launch the vectorizing job.
# NOTE(review): this is an absolute path on one user's machine; change
# `path_djb_repo` when running from a different checkout.
path_djb_repo = '/home/david.bermejo/repos/subreddit_clustering_i18n/'
# Derived from the repo root (instead of repeating the literal) so the two
# paths cannot drift apart when the repo location changes.
path_djb_models = f"{path_djb_repo.rstrip('/')}/subclu/models"
# Dotted module path that is executed with `python -m` from the repo root.
file_vectorize_py = 'subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf'

# Name of the config passed to the module via `--config-name`.
config_vectorize = 'vectorize_post_and_comments_combined_seed_v0.6.0'

# Echo the values so the run's inputs are visible in the saved notebook.
print(path_djb_repo)
print(file_vectorize_py)
print(config_vectorize)

/home/david.bermejo/repos/subreddit_clustering_i18n/
subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf
vectorize_post_and_comments_combined_seed_v0.6.0


## Run in bucket owned by i18n
This bucket retains data longer than the gazette temp bucket

In [None]:
# run on full data
# Shell escape: change into the repo root, then run the vectorizing module as
# a script (`python -m`), selecting its config by name with `--config-name`.
# The `$name` tokens are expanded by IPython from the notebook variables
# path_djb_repo, file_vectorize_py and config_vectorize before the shell runs.
# NOTE(review): no sampling override is passed here; the recorded output shows
# n_sample_files: None -- presumably that means all files are processed; confirm
# against the config.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize

CFG keys:
  dict_keys(['data_text_and_metadata', 'config_description', 'local_cache_path', 'local_model_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
Data Loader kwags:
  columns: ['subreddit_id', 'subreddit_name', 'post_id', 'post_and_comment_text_clean']
  df_format: pandas
  unique_check: False
  verbose: True
  bucket_name: i18n-subreddit-clustering
  gcs_path: i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all
  local_cache_path: /home/jupyter/subreddit_clustering_i18n/data/local_cache/
  n_sample_files: None
  n_files_slice_start: None
  n_files_slice_end: None
`2022-08-11 08:42:18,382` | `INFO` | `Using hydra's path`
`2022-08-11 08:42:18,382` | `INFO` | `  Log file crea