# Purpose

Use this notebook to test the new data loader (GCS), configs, and embeddings class that we'll be using in Kubeflow.

For inference (getting embeddings) it might be better to read from GCS than from SQL.

# Imports & Setup

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to library code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

from tqdm import tqdm

import subclu
# from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)

# new modules to test:
from google.cloud import bigquery

from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_sql import (
    convert_iter_to_sql_str,
    LoadSubredditsSQL
)



# Print the versions of the key libraries so the recorded outputs can be tied to an environment.
print_lib_versions([bigquery, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
google.cloud.bigquery	v: 2.20.0
numpy		v: 1.18.5
pandas		v: 1.2.5
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.5.0


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet for figures.
plt.style.use('default')

# Project helpers imported from subclu.utils.eda. Judging by their names they configure
# logging and notebook display options -- their implementation is not visible here.
setup_logging()
notebook_display_config()
# Smoke test: this message should show up in the cell output once logging is configured.
logging.info('loggging ready')

03:39:54 | INFO | "loggging ready"


# Load data with new class

Load data from a test folder where we have 2 parquet files. This way we can test slicing & sampling.

In [4]:
from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_gcs import (
    LoadSubredditsGCS
)

## Cache files

### All files, read as 1 pandas df

In [5]:
%%time

subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path='i18n_topic_model_batch/runs/20220412/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_name_title_related_subs_and_clean_descriptions'],
    col_unique_check='subreddit_id',
    df_format='pandas',
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
    unique_check=False,
    verbose= True,
)
subs._local_cache()

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")
assert 2 == len(subs.local_files_)

df_ = subs.read_as_one_df()
print(df_.shape)

03:39:56 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text"
03:39:56 | INFO | "  2 <- Files matching prefix"
03:39:56 | INFO | "  2 <- Files to check"
03:39:56 | INFO | "    000000000000.parquet <- File already exists, not downloading"
03:39:56 | INFO | "    000000000001_test.parquet <- File already exists, not downloading"
03:39:56 | INFO | "  Files already cached: 2"
03:39:56 | INFO | "0:00:01.380230  <- Downloading files elapsed time"
03:39:56 | INFO | "  Files already downloaded."
03:39:56 | INFO | "  df format: pandas"


2 <- Local files
2 <- Local parquet files
(159874, 3)
CPU times: user 364 ms, sys: 275 ms, total: 639 ms
Wall time: 1.85 s


In [6]:
# Preview the first rows of the combined pandas DataFrame to sanity-check the loaded columns.
df_.head()

Unnamed: 0,subreddit_id,subreddit_name,subreddit_name_title_related_subs_and_clean_descriptions
0,t5_2sxhs,place,"place. r/place. \nplace\nplace\n\nSome have visited a canvas before. A place where togetherness created more. Now in numbers far greater, taking more space, It falls upon you to create a better place.\n\nThere is an empty canvas.\n\nYou..."
1,t5_2qh1i,askreddit,"AskReddit. r/AskReddit. \nAsk Reddit...\naskreddit, AskReddit\n\nAskReddit is the place to ask and answer thought provoking questions.\n\nSERIOUS askreddit true Serious \n\nRules AskReddit wikiwiki rules:\n1. You must post a clear and d..."
2,t5_2qhsa,interestingasfuck,"interestingasfuck. r/interestingasfuck. \nInteresting As Fuck\ninterestingasfuck, TodayILearned, notinteresting, mildlyinteresting, offbeat, oddlysatisfying, damnthatsinteresting, Unexpected, wtf\n\nFor anything that is InterestingAsFuc..."
3,t5_2y77d,antiwork,"antiwork. r/antiwork. \nAntiwork: Unemployment for all, not just the rich!\nantiwork, antitrampo, AntiTaff, antiarbeit, antiworkItaly, tegenwerken, antiwork_slovenija, Antiwork_UK, Anarchism, Anarchy101, IWW, LateStageCapitalism, lostge..."
4,t5_2qh13,worldnews,"worldnews. r/worldnews. \nWorld News\nNews, PoliticalDiscussion, WorldEvents, GeoPolitics, IntheNews, GlobalTalk, Breakingnews, Business, Economics, Environment, History, HumanRights, Features, UpliftingNews, NewsOfTheWeird, FakeNews, I..."


### All files, read as 1 `dask` df

In [7]:
%%time

subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path='i18n_topic_model_batch/runs/20220412/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_name_title_related_subs_and_clean_descriptions'],
    col_unique_check='subreddit_id',
    df_format='dask',
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
    unique_check=False,
    verbose= True,
)
subs._local_cache()

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")
assert 2 == len(subs.local_files_)

df_ = subs.read_as_one_df()
print(df_.shape)

03:39:58 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text"
03:39:58 | INFO | "  2 <- Files matching prefix"
03:39:58 | INFO | "  2 <- Files to check"
03:39:58 | INFO | "    000000000000.parquet <- File already exists, not downloading"
03:39:58 | INFO | "    000000000001_test.parquet <- File already exists, not downloading"
03:39:58 | INFO | "  Files already cached: 2"
03:39:58 | INFO | "0:00:01.337023  <- Downloading files elapsed time"
03:39:58 | INFO | "  Files already downloaded."
03:39:58 | INFO | "  df format: dask"


2 <- Local files
2 <- Local parquet files
(Delayed('int-7a927216-326b-4f38-ad3c-7ec261434fe9'), 3)
CPU times: user 39.6 ms, sys: 35.1 ms, total: 74.7 ms
Wall time: 1.35 s


In [8]:
# Peek at the last rows of the dask DataFrame.
# NOTE(review): in the recorded output the index stops at 79936 even though the two files
# hold 159874 rows in total, which suggests the index restarts in each file -- confirm
# before treating the index as a unique row key.
df_.tail()

Unnamed: 0,subreddit_id,subreddit_name,subreddit_name_title_related_subs_and_clean_descriptions
79932,t5_s3j8c,antianimearmy,"AntiAnimeArmy. r/AntiAnimeArmy. \nJoin the Army Today!\nAnimeAddictsAnonymous, NoAnimePolice, AnimeHate\n\nThe Anti Anime Army is to destroy the evil that is Anime.\n\nSeeking help with your addiction?\ntry our spin off subreddit:\nAnim..."
79933,t5_3stsd4,poorlymadepolicememes,poorlymadepolicememes. r/poorlymadepolicememes. \npoorlymadepolicememes\nThe sub about the very poorly made memer and podcaster\n\nFeel free to discuss the podcast episodes and memes\n\nA bunch of police memes that aren’t really that gr...
79934,t5_48te5o,svampedyrkning,"Svampedyrkning. r/Svampedyrkning. \nSvampedyrkning\nEt dansk subreddit dedikeret til svampedyrkning. Her kan man diskutere dyrkning af gourmet , psilocybin og medicinsvampe, med henblik på at hjælpe hinanden, dele sine erfaringer og udv..."
79935,t5_4ua7kp,meditieren,"meditieren. r/meditieren. \nMeditieren - Entspannung von Kopf bis Fuß.\nmeditieren\n\nWillkommen auf meditieren! Dieser Sub dient dem Austausch von Erfahrungen, Geschichten und Anleitungen in Bezug auf die Praxis der Meditation.\n\nWILK..."
79936,t5_4cefbk,handball_de,handball_de. r/handball_de. \nHandball\nFür alles rund um den Profi und Amateurhandball!


### All files, yield each file as separate df

In [9]:
%%time

subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path='i18n_topic_model_batch/runs/20220412/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_name_title_related_subs_and_clean_descriptions'],
    col_unique_check='subreddit_id',
    df_format='pandas',
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
    unique_check=False,
    verbose= True,
)
subs._local_cache()
for df_y_ in tqdm(subs.yield_each_file_as_df(), total=subs.n_local_parquet_files_):
    print(df_y_.shape)

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")
assert 2 == len(subs.local_files_)

03:39:59 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text"
03:39:59 | INFO | "  2 <- Files matching prefix"
03:39:59 | INFO | "  2 <- Files to check"
03:39:59 | INFO | "    000000000000.parquet <- File already exists, not downloading"
03:39:59 | INFO | "    000000000001_test.parquet <- File already exists, not downloading"
03:39:59 | INFO | "  Files already cached: 2"
03:39:59 | INFO | "0:00:01.351215  <- Downloading files elapsed time"
  0%|          | 0/2 [00:00<?, ?it/s]03:39:59 | INFO | "  Files already downloaded."
 50%|█████     | 1/2 [00:00<00:00,  3.98it/s]

(79937, 3)


100%|██████████| 2/2 [00:00<00:00,  4.04it/s]

(79937, 3)
2 <- Local files
2 <- Local parquet files
CPU times: user 323 ms, sys: 270 ms, total: 593 ms
Wall time: 1.85 s





### Sample file (1st file)

In [10]:
%%time

subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path='i18n_topic_model_batch/runs/20220412/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_description'],
    col_unique_check='subreddit_id',
    df_format='pandas',
    n_sample_files=1,
    n_files_slice_start=None,
    n_files_slice_end=None,
    unique_check=True,
    verbose= True,
)
df_ = subs.read_as_one_df()
print(df_.shape)

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")

assert(1 == len(subs.local_files_)), "Expected to sample only 1 file"

03:40:01 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text"
03:40:01 | INFO | "  1 <- Files matching prefix"
03:40:01 | INFO | "  1 <- Files to check"
03:40:01 | INFO | "    000000000000.parquet <- File already exists, not downloading"
03:40:01 | INFO | "  Files already cached: 1"
03:40:01 | INFO | "0:00:01.394998  <- Downloading files elapsed time"
03:40:01 | INFO | "  df format: pandas"
03:40:01 | INFO | "  Checking ID uniqueness..."


(79937, 3)
1 <- Local files
1 <- Local parquet files
CPU times: user 174 ms, sys: 77.9 ms, total: 252 ms
Wall time: 1.57 s


### Slice -- last file

In [11]:
%%time

subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path='i18n_topic_model_batch/runs/20220412/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_description'],
    col_unique_check='subreddit_id',
    df_format='pandas',
    n_sample_files=None,
    n_files_slice_start=-1,
    n_files_slice_end=None,
    unique_check=True,
    verbose= True,
)
df_ = subs.read_as_one_df()
print(df_.shape)

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")

assert(1 == len(subs.local_files_)), "Expected slice with only 1 file"

03:40:03 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text"
03:40:03 | INFO | "  2 <- Files matching prefix"
03:40:03 | INFO | "  1 <- Files to check"
03:40:03 | INFO | "    000000000001_test.parquet <- File already exists, not downloading"
03:40:03 | INFO | "  Files already cached: 1"
03:40:03 | INFO | "0:00:01.361892  <- Downloading files elapsed time"
03:40:03 | INFO | "  df format: pandas"
03:40:03 | INFO | "  Checking ID uniqueness..."


(79937, 3)
1 <- Local files
1 <- Local parquet files
CPU times: user 184 ms, sys: 136 ms, total: 321 ms
Wall time: 1.6 s


In [12]:
%%time

subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path='i18n_topic_model_batch/runs/20220412/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_description'],
    col_unique_check='subreddit_id',
    df_format='pandas',
  
    n_sample_files=None,
    n_files_slice_start=1,
    n_files_slice_end=2,
    unique_check=True,
    verbose= True,
)
df_ = subs.read_as_one_df()
print(df_.shape)

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")

assert(1 == len(subs.local_files_)), "Expected slice with only 1 file"

03:40:04 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text"
03:40:04 | INFO | "  2 <- Files matching prefix"
03:40:04 | INFO | "  1 <- Files to check"
03:40:04 | INFO | "    000000000001_test.parquet <- File already exists, not downloading"
03:40:04 | INFO | "  Files already cached: 1"
03:40:04 | INFO | "0:00:01.336908  <- Downloading files elapsed time"
03:40:04 | INFO | "  df format: pandas"
03:40:05 | INFO | "  Checking ID uniqueness..."


(79937, 3)
1 <- Local files
1 <- Local parquet files
CPU times: user 184 ms, sys: 141 ms, total: 326 ms
Wall time: 1.59 s


### Slice first file

In [13]:
%%time

subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path='i18n_topic_model_batch/runs/20220412/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_description'],
    col_unique_check='subreddit_id',
    df_format='pandas',
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=1,
    unique_check=True,
    verbose= True,
)
df_ = subs.read_as_one_df()
print(df_.shape)

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")

assert(1 == len(subs.local_files_)), "Expected slice with 1 file"

03:40:06 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text"
03:40:06 | INFO | "  2 <- Files matching prefix"
03:40:06 | INFO | "  1 <- Files to check"
03:40:06 | INFO | "    000000000000.parquet <- File already exists, not downloading"
03:40:06 | INFO | "  Files already cached: 1"
03:40:06 | INFO | "0:00:01.384844  <- Downloading files elapsed time"
03:40:06 | INFO | "  df format: pandas"
03:40:06 | INFO | "  Checking ID uniqueness..."


(79937, 3)
1 <- Local files
1 <- Local parquet files
CPU times: user 191 ms, sys: 126 ms, total: 317 ms
Wall time: 1.62 s


### Slice first 2 files

In [14]:
%%time

subs = LoadSubredditsGCS(
    bucket_name='gazette-models-temp',
    gcs_path='i18n_topic_model_batch/runs/20220412/subreddits/text',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=['subreddit_id', 'subreddit_name', 'subreddit_description'],
    col_unique_check='subreddit_id',
    df_format='pandas',
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=2,
    unique_check=False,
    verbose= True,
)
df_ = subs.read_as_one_df()
print(df_.shape)

print(f"{len(subs.local_files_)} <- Local files")
print(f"{len(subs.local_parquet_files_)} <- Local parquet files")

assert(2 == len(subs.local_files_)), "Expected 2 files"

03:40:08 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/gazette-models-temp/i18n_topic_model_batch/runs/20220412/subreddits/text"
03:40:08 | INFO | "  2 <- Files matching prefix"
03:40:08 | INFO | "  2 <- Files to check"
03:40:08 | INFO | "    000000000000.parquet <- File already exists, not downloading"
03:40:08 | INFO | "    000000000001_test.parquet <- File already exists, not downloading"
03:40:08 | INFO | "  Files already cached: 2"
03:40:08 | INFO | "0:00:01.365880  <- Downloading files elapsed time"
03:40:08 | INFO | "  df format: pandas"


(159874, 3)
2 <- Local files
2 <- Local parquet files
CPU times: user 321 ms, sys: 240 ms, total: 561 ms
Wall time: 1.77 s


# Load data AND Vectorize 

In [15]:
# Settings for launching the vectorize module from a shell command.
# NOTE(review): both paths are absolute and tied to one user's home directory,
# so they only resolve on that machine.
path_djb_repo = '/home/david.bermejo/repos/subreddit_clustering_i18n/'
path_djb_models = '/home/david.bermejo/repos/subreddit_clustering_i18n/subclu/models'

# Dotted module path (for `python -m`) and the name of the config to run it with.
file_vectorize_py = 'subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf'
config_vectorize = 'vectorize_subreddits_test_local'

# Echo the values that the shell command will receive, one per line.
print(path_djb_repo, file_vectorize_py, config_vectorize, sep='\n')

/home/david.bermejo/repos/subreddit_clustering_i18n/
subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf
vectorize_subreddits_test_local


## All files as single DF

In [17]:
# run on sample data, test experiment

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_vectorize

2022-04-14 04:19:25.399262: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0
CFG keys:
  dict_keys(['data_text', 'config_description', 'local_cache_path', 'output_bucket', 'output_folder', 'gcs_path', 'data_loader_name', 'data_loader_kwargs', 'n_sample_files', 'n_files_slice_start', 'n_files_slice_end', 'process_individual_files', 'col_text_for_embeddings', 'model_name', 'batch_inference_rows', 'limit_first_n_chars', 'get_embeddings_verbose', 'cols_index'])
Error executing job with overrides: []
Traceback (most recent call last):
  File "/home/david.bermejo/repos/subreddit_clustering_i18n/subclu/i18n_topic_model_batch/subclu2/get_embeddings/vectorize_text_tf.py", line 78, in vectorize_text
    **{k: v for k, v in cfg.items() if k not in ['data_test', 'data_loader_kwargs']}
  File "/home/david.bermejo/repos/subreddit_clustering_i18n/subclu/i18n_topic_model_batch/subclu2/get_embeddings/vectorize_text_tf.py", line 135, i

## Files sequentially