# Purpose

Use this notebook to test the new SQL data loader (`LoadSubredditsSQL`) and the embeddings class that we'll be using in Kubeflow.

# Imports & Setup

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the project modules under test are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from datetime import datetime
import logging

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

# import mlflow

import subclu
# from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)

# new modules to test:
from google.cloud import bigquery

from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_sql import (
    convert_iter_to_sql_str,
    LoadSubredditsSQL
)



# Print the Python version and the versions of the key libraries used in this notebook.
print_lib_versions([bigquery, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
google.cloud.bigquery	v: 2.20.0
numpy		v: 1.18.5
pandas		v: 1.2.5
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.5.0


In [4]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet.
plt.style.use('default')

# Project helpers imported from subclu.utils.eda (their behaviour is defined
# there, not in this notebook).
setup_logging()
notebook_display_config()
# Emit one INFO line to confirm that logging output reaches the notebook.
logging.info('loggging ready')

03:19:26 | INFO | "loggging ready"


# Test SQL query speeds

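Which one is faster: `pandas-gbq` (`pd.read_gbq`) or the default BigQuery client (`google.cloud.bigquery`)? Since we'll be loading lots of text data, speed will matter.
Turns out the default client wins (roughly 1.4x–1.8x faster in the runs below), so there's no need to add `pandas-gbq` to the full requirements file.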

```
# 200k subreddits

## pandas-gbq
15.7 s ± 555 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## bigquery client
8.51 s ± 1.34 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

# 100k subreddits
(run order swapped relative to the 200k test, in case one of them was benefiting from cached results)

## bigquery client
6.75 s ± 265 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

## pandas-gbq
9.43 s ± 369 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

```

In [44]:
# Benchmark query (up to 200k rows): subreddit id, name, title & description
# from `subreddit_lookup` where `dt` is two days before the current date.
# Filters out admin-removed, spam and deleted subreddits, keeps only the
# public/private/restricted types, and drops names starting with `u_`.
# Identical to `test_sql_100` except for the LIMIT.
test_sql_200 = """
SELECT
            
        subreddit_id
        , name
        , title
        , description

FROM data-prod-165221.ds_v2_postgres_tables.subreddit_lookup
WHERE 1=1
    AND dt = (CURRENT_DATE() - 2)  -- subreddit_lookup
    -- Exclude user-profiles + spam & sketchy subs
    AND COALESCE(verdict, 'f') <> 'admin_removed'
    AND COALESCE(is_spam, FALSE) = FALSE
    AND COALESCE(is_deleted, FALSE) = FALSE
    AND deleted IS NULL
    AND type IN ('public', 'private', 'restricted')
    AND NOT REGEXP_CONTAINS(LOWER(name), r'^u_.*')
LIMIT 200000
"""

### Pandas-gbq

In [7]:
%%timeit
# Benchmark: run the 200k-row query through pandas-gbq (`pd.read_gbq`),
# with the progress bar disabled.
df_pd = pd.read_gbq(
    query=test_sql_200,
    progress_bar_type=None,
)

23:55:12 | INFO | "Total time taken 15.97 s.
Finished at 2022-03-30 23:55:12."
23:55:28 | INFO | "Total time taken 16.13 s.
Finished at 2022-03-30 23:55:28."
23:55:44 | INFO | "Total time taken 15.84 s.
Finished at 2022-03-30 23:55:44."
23:55:59 | INFO | "Total time taken 15.51 s.
Finished at 2022-03-30 23:55:59."
23:56:15 | INFO | "Total time taken 15.42 s.
Finished at 2022-03-30 23:56:15."
23:56:29 | INFO | "Total time taken 14.91 s.
Finished at 2022-03-30 23:56:29."
23:56:46 | INFO | "Total time taken 16.8 s.
Finished at 2022-03-30 23:56:46."
23:57:02 | INFO | "Total time taken 15.6 s.
Finished at 2022-03-30 23:57:02."


15.7 s ± 555 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### GCP's client

In [8]:
%%timeit
# Benchmark: run the 200k-row query through the google-cloud-bigquery client.
# Note: the client is created inside the timed body, so its construction is
# included in every measurement.
bigquery_client = bigquery.Client()
df_gcp = bigquery_client.query(test_sql_200).to_dataframe()

8.51 s ± 1.34 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Benchmark query (up to 100k rows): same columns and filters as
# `test_sql_200` (subreddit id, name, title & description from
# `subreddit_lookup` where `dt` is two days before the current date);
# only the LIMIT differs.
test_sql_100 = """
SELECT
            
        subreddit_id
        , name
        , title
        , description

FROM data-prod-165221.ds_v2_postgres_tables.subreddit_lookup
WHERE 1=1
    AND dt = (CURRENT_DATE() - 2)  -- subreddit_lookup
    -- Exclude user-profiles + spam & sketchy subs
    AND COALESCE(verdict, 'f') <> 'admin_removed'
    AND COALESCE(is_spam, FALSE) = FALSE
    AND COALESCE(is_deleted, FALSE) = FALSE
    AND deleted IS NULL
    AND type IN ('public', 'private', 'restricted')
    AND NOT REGEXP_CONTAINS(LOWER(name), r'^u_.*')
LIMIT 100000
"""

### GCP's client

In [10]:
%%timeit
# Benchmark: run the 100k-row query through the google-cloud-bigquery client.
# Note: the client is created inside the timed body, so its construction is
# included in every measurement.
bigquery_client = bigquery.Client()
df_gcp = bigquery_client.query(test_sql_100).to_dataframe()

6.75 s ± 265 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Pandas-gbq

In [11]:
%%timeit
# Benchmark: run the 100k-row query through pandas-gbq (`pd.read_gbq`),
# with the progress bar disabled.
df_pd = pd.read_gbq(
    query=test_sql_100,
    progress_bar_type=None,
)

00:01:07 | INFO | "Total time taken 9.53 s.
Finished at 2022-03-31 00:01:07."
00:01:16 | INFO | "Total time taken 9.33 s.
Finished at 2022-03-31 00:01:16."
00:01:26 | INFO | "Total time taken 9.63 s.
Finished at 2022-03-31 00:01:26."
00:01:35 | INFO | "Total time taken 9.67 s.
Finished at 2022-03-31 00:01:35."
00:01:45 | INFO | "Total time taken 9.81 s.
Finished at 2022-03-31 00:01:45."
00:01:54 | INFO | "Total time taken 8.68 s.
Finished at 2022-03-31 00:01:54."
00:02:04 | INFO | "Total time taken 9.71 s.
Finished at 2022-03-31 00:02:04."
00:02:13 | INFO | "Total time taken 9.19 s.
Finished at 2022-03-31 00:02:13."


9.43 s ± 369 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Run query to get subreddits

In [12]:
# Disabled: alternative loader configuration that points at the
# `all_reddit.all_reddit_subreddits` table. Left commented out; the active
# configuration (`subreddit_cls2`) is in the following cell.
# subreddit_cls = LoadSubredditsSQL(
#     table='all_reddit_subreddits',
#     dataset='all_reddit',
#     columns='subreddit_name',
#     project_name='data-prod-165221',
#     col_unique_check='subreddit_name',
#     sql_template='all_reddit_subreddits',
#     log_query=True,
# )
# df_subs = subreddit_cls.get_as_dataframe()

In [16]:
# SQL expression that merges name, title and description into one text column;
# title and description fall back to '' when NULL.
sql_concat_text = "CONCAT(name, '. ', COALESCE(title, ''), '. ', COALESCE(description, ''))"

# Loader for the `subreddit_lookup` table, capped at 50k rows for this test.
# `log_query=True` makes the loader log the rendered SQL (see the output below).
subreddit_cls2 = LoadSubredditsSQL(
    # where to read from
    project_name='data-prod-165221',
    dataset='ds_v2_postgres_tables',
    table='subreddit_lookup',
    # how to build the query
    sql_template='subreddit_lookup',
    concat_text_cols=sql_concat_text,
    limit_clause='LIMIT 50000',
    log_query=True,
)

# First call: runs the query and returns the result as a dataframe.
df_subs_slo = subreddit_cls2.get_as_dataframe()

03:21:50 | INFO | "# Connecting to BigQuery... #"
03:21:51 | INFO | "# Running query... #"
03:21:51 | INFO | "
        SELECT
            
                subreddit_id
                # , name
                # , title
                # , description
            
, CONCAT(name, '. ', COALESCE(title, ''), '. ', COALESCE(description, '')) AS concat_text
        FROM data-prod-165221.ds_v2_postgres_tables.subreddit_lookup
        WHERE 1=1
            AND dt = (CURRENT_DATE() - 2)  -- subreddit_lookup
            -- Exclude user-profiles + spam & sketchy subs
            AND COALESCE(verdict, 'f') <> 'admin_removed'
            AND COALESCE(is_spam, FALSE) = FALSE
            AND COALESCE(is_deleted, FALSE) = FALSE
            AND deleted IS NULL
            AND type IN ('public', 'private', 'restricted')
            AND NOT REGEXP_CONTAINS(LOWER(name), r'^u_.*')
        LIMIT 50000
        "
03:21:51 | INFO | "  2022-03-31 03:21:51.257244 | query START time"
03:21:55 | INFO | "  2022-03-

In [17]:
# Peek at the first five rows returned by the loader.
df_subs_slo.head(n=5)

Unnamed: 0,subreddit_id,concat_text
0,t5_zo9in,a:t5_zo9in. Subreddit to use the in app reddit browser.
1,t5_gdrsz,"agentpanda. a testbed for agentpanda stuff. there's nothing that matters here, go away"
2,t5_62eaoz,furryeslove. furryeslove.
3,t5_5742f0,AvengeCopperGolem. AvengeCopperGolem.
4,t5_5zz2v4,Technologymiles. Technologymiles.


In [18]:
# Call the loader a second time; per the log output below it reports
# "Query already cached" and returns the (50000, 2) frame right away.
df_subs = subreddit_cls2.get_as_dataframe()

03:21:55 | INFO | "  Query already cached"
03:21:55 | INFO | "  (50000, 2) <- df.shape"


In [19]:
# Count missing values per column.
df_subs.isna().sum()

subreddit_id    0
concat_text     0
dtype: int64

In [20]:
# Show the rows (if any) whose concatenated text is missing.
is_text_missing = df_subs['concat_text'].isna()
df_subs.loc[is_text_missing]

Unnamed: 0,subreddit_id,concat_text


# Load data from within new class

In [51]:
# Locations and names used to launch the vectorizing job from this notebook.
import os

# Repo checkout that the job is launched from. The original hard-coded location
# is kept as the default; set the SUBCLU_REPO_PATH environment variable to run
# this notebook from a different checkout/user without editing the cell.
path_djb_repo = os.environ.get(
    'SUBCLU_REPO_PATH',
    '/home/david.bermejo/repos/subreddit_clustering_i18n/',
)
# Derived from the repo path so the two locations cannot drift apart.
path_djb_models = os.path.join(path_djb_repo, 'subclu/models')
# Dotted module path that is executed with `python -m` in the next cell.
file_vectorize_py = 'subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf'

# Name passed to the job via `--config-name`.
config_name = 'vectorize_subreddits_sql_test_local'

print(path_djb_repo)
print(file_vectorize_py)
print(config_name)

/home/david.bermejo/repos/subreddit_clustering_i18n/
subclu.i18n_topic_model_batch.subclu2.get_embeddings.vectorize_text_tf
vectorize_subreddits_sql_test_local


In [52]:
# Run the vectorizing module on sample data (test experiment): change into the
# repo directory and execute the module with `python -m`, using `config_name`.
# NOTE(review): the last argument looks like a Hydra-style override that sets
# the loader's limit clause to "LIMIT 50000" — confirm against the config.

!cd $path_djb_repo && python -m $file_vectorize_py \
    --config-name $config_name \
    data_text.subreddit_meta.data_loader_kwargs.limit_clause="LIMIT 50000"

2022-03-31 07:53:36.630460: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0
CFG keys: dict_keys(['data_text', 'config_description', 'model_name', 'thing_to_vectorize', 'col_text_for_embeddings', 'batch_inference_rows', 'limit_first_n_chars', 'get_embeddings_verbose', 'cols_index', 'output_bucket', 'output_folder'])
`2022-03-31 07:53:38,826` | `INFO` | `Creating vectorizing class for subreddit_meta...`
`2022-03-31 07:53:38,827` | `INFO` | `Start vectorize function`
`2022-03-31 07:53:38,827` | `INFO` | `Lodaing model: use_multilingual_3`
`2022-03-31 07:53:38,857` | `INFO` | `Using /tmp/tfhub_modules to cache modules.`
2022-03-31 07:53:39.009061: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2022-03-31 07:53:39.948886: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must 

In [47]:
# !pip list