# Purpose

Use this notebook to vectorize the text for subreddit metadata.

# Imports & Setup

In [1]:
# Load IPython's autoreload extension; mode 2 reloads all modules before each
# cell runs, so edits to imported code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm import tqdm

import subclu
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


# Print the Python and library versions so the environment is recorded in the cell output.
print_lib_versions([np, pd, subclu])

python		v 3.7.10
===
numpy		v: 1.18.5
pandas		v: 1.2.5
subclu		v: 0.6.1


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Start from matplotlib's built-in 'default' style sheet.
plt.style.use('default')

# Project helpers imported from subclu.utils.eda; presumably they configure
# logging and notebook display options — see that module for the exact settings.
setup_logging()
notebook_display_config()
# Emit one log line to confirm that logging output is visible.
logging.info('loggging ready')

06:33:44 | INFO | "loggging ready"


# Auth note
This notebook assumes you have authenticated using the gcloud CLI. Example:<br>
```bash
gcloud auth application-default login
```

# Load data AND Vectorize 

When we call the vectorizing function, it calls the data loader under the hood.
See the configs in:
- `subclu2/config/`
    - `data_text_and_metadata/`
        -  `vX.X.X_model.yaml`
    - `vectorize_subreddit_meta_vX.X.X.yaml`


In [5]:
# Locations and run settings used by the vectorizing job.
# NOTE: the repo path is an absolute, machine-specific path; adjust it when
# running from a different checkout.
path_djb_repo = '/home/david.bermejo/repos/subreddit_clustering_i18n/'
# The models directory lives inside the repo (repo path ends with a slash).
path_djb_models = f"{path_djb_repo}subclu/models"

# Dotted module path passed to `python -m`, and the name of the config to run.
file_vectorize_py = 'subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf'
config_vectorize = 'vectorize_subreddit_meta_v0.6.1'

# Echo the settings, one per line, so they are recorded in the cell output.
print(path_djb_repo, file_vectorize_py, config_vectorize, sep='\n')

/home/david.bermejo/repos/subreddit_clustering_i18n/
subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf
vectorize_subreddit_meta_v0.6.1


## Run in bucket owned by i18n
This bucket retains data longer than the gazette temp bucket.

In [6]:
# run on full data
# Change into the repo root, then run the vectorizing module with `python -m`,
# selecting the config named by `config_vectorize` via --config-name.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize

CFG keys:
  dict_keys(['data_text_and_metadata', 'config_description', 'local_cache_path', 'local_model_path', 'output_bucket', 'gcs_path_text_key', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'limit_first_n_chars_retry', 'get_embeddings_verbose', 'cols_index'])
Data Loader kwags:
  columns: ['subreddit_id', 'subreddit_name', 'subreddit_meta_for_embeddings']
  df_format: pandas
  unique_check: False
  verbose: True
  bucket_name: i18n-subreddit-clustering
  gcs_path: i18n_topic_model_batch/runs/20221107/subreddits/text
  local_cache_path: /home/jupyter/subreddit_clustering_i18n/data/local_cache/
  n_sample_files: None
  n_files_slice_start: None
  n_files_slice_end: None
`2022-11-07 07:46:32,968` | `INFO` | `Using hydra's path`
`2022-11-07 07:46:32,968` | `INFO` | `  Log file created at: /home/jupyter/subreddit_c

## Rough time projections
Based on the file(s) processed above, here are some rough projections for how long it might take to process all posts needed for the topic model.

In [7]:
# Projections
# Extrapolate the observed embedding rate to the full row count, then show how
# the wall-clock time shrinks when the work is split evenly across 1 to 8
# parallel jobs.
l_estimates_ = []

# Reference run: number of rows embedded and the minutes it took (10 min 10 s).
rows_embedded_ = 781653
time_mins = 10.0 + (1/6)

# Total rows to embed and the single-job time, assuming time scales linearly with rows.
rows_to_embed_ = int(53663530)
projected_time_mins = time_mins * (rows_to_embed_ / rows_embedded_)

# n_jobs=1 is the unsplit baseline; dividing by 1 leaves the projection unchanged.
for n_parallel_jobs_ in range(1, 9):
    proj_mins_parallel = projected_time_mins / n_parallel_jobs_
    l_estimates_.append(
        dict(
            n_rows=rows_to_embed_,
            n_jobs=n_parallel_jobs_,
            projected_hours=proj_mins_parallel / 60,
            projected_days=proj_mins_parallel / (60 * 24),
            projected_mins=proj_mins_parallel,
        )
    )

# Show the estimates as a table, one row per job count, via the subclu styling helper.
style_df_numeric(pd.DataFrame(l_estimates_))

Unnamed: 0,n_rows,n_jobs,projected_hours,projected_days,projected_mins
0,53663530,1,11.63,0.48,697.98
1,53663530,2,5.82,0.24,348.99
2,53663530,3,3.88,0.16,232.66
3,53663530,4,2.91,0.12,174.5
4,53663530,5,2.33,0.1,139.6
5,53663530,6,1.94,0.08,116.33
6,53663530,7,1.66,0.07,99.71
7,53663530,8,1.45,0.06,87.25
