# Purpose

### 2022-09-09
Test running queries in parallel with `dask`. Now that we'll run ANN for 250+ subreddits, running in a single thread could take a loooong time.


### 2022-08-01
Calculating precise nearest neighbors has become too expensive as we go over 40k subreddits. So instead let's calculate approx nearest neighbors (ANN). 

In this notebook we use [ANNOY](https://github.com/spotify/annoy).  Main reason for using annoy over FAISS is that annoy has official wheels in pypi, but FAISS only officially supports installation from conda. For now we don't want to depend on third-party wheels for FAISS b/c that can be messy to install & replicate in a VM. Maybe when we switch to kubeflow we can try FAISS.


# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import gc
import os
import logging
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import dask
from dask import dataframe as dd
from tqdm import tqdm

import mlflow
import hydra
import annoy

import subclu
from subclu.models.aggregate_embeddings import (
    AggregateEmbeddings, AggregateEmbeddingsConfig,
    load_config_agg_jupyter, get_dask_df_shape,
)
from subclu.models import aggregate_embeddings_pd

from subclu.utils import set_working_directory, get_project_subfolder
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.utils.mlflow_logger import MlflowLogger, save_pd_df_to_parquet_in_chunks
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)
from subclu.models.reshape_clusters_v050 import save_fpr_json


print_lib_versions([annoy, dask, hydra, mlflow, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
annoy		v: 1.17.0
dask		v: 2021.06.0
hydra		v: 1.1.0
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.6.0


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Set sqlite database as MLflow URI

In [4]:
# use new class to initialize mlflow
mlf = MlflowLogger(tracking_uri='sqlite')
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db'

## Get list of experiments with new function

In [5]:
mlf.list_experiment_meta(output_format='pandas').tail(9)

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
30,30,v0.5.0_mUSE_clustering_test,gs://i18n-subreddit-clustering/mlflow/mlruns/30,active
31,31,v0.5.0_mUSE_clustering,gs://i18n-subreddit-clustering/mlflow/mlruns/31,active
32,32,v0.5.0_nearest_neighbors_test,gs://i18n-subreddit-clustering/mlflow/mlruns/32,active
33,33,v0.5.0_nearest_neighbors,gs://i18n-subreddit-clustering/mlflow/mlruns/33,active
34,34,v0.6.0_mUSE_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/34,active
35,35,v0.6.0_mUSE_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/35,active
36,36,v0.6.0_mUSE_clustering_test,gs://i18n-subreddit-clustering/mlflow/mlruns/36,active
37,37,v0.6.0_mUSE_clustering,gs://i18n-subreddit-clustering/mlflow/mlruns/37,active
38,38,v0.6.0_nearest_neighbors,gs://i18n-subreddit-clustering/mlflow/mlruns/38,active


## Get runs from embeddings aggregation jobs

Want to make sure we can load these artifacts for other jobs

In [6]:
%%time

df_mlf_runs =  mlf.search_all_runs(experiment_ids=[35])
df_mlf_runs.shape

CPU times: user 56.5 ms, sys: 8.27 ms, total: 64.8 ms
Wall time: 64 ms


(4, 43)

In [7]:
df_mlf_runs.head()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.df_subs_agg_c1_uw-cols,metrics.time_fxn-full_aggregation_fxn_minutes,metrics.time_fxn-data_loading_time,metrics.memory_free,metrics.time_fxn-df_posts_agg_c1_no_delay,metrics.time_fxn-df_subs_agg_c1_uw,metrics.memory_total,metrics.df_subs_agg_c1-cols,metrics.df_v_subs-rows,metrics.df_v_subs-cols,metrics.df_subs_agg_c1_uw-rows,metrics.memory_used,metrics.df_v_post_comments-cols,metrics.cpu_count,metrics.df_posts_agg_c1-cols,metrics.df_v_post_comments-rows,metrics.df_posts_agg_c1-rows,metrics.df_subs_agg_c1-rows,metrics.memory_used_percent,metrics.time_fxn-df_subs_agg_c1,params.mlflow_experiment,params.weight_subreddit_meta,params.mlflow_tracking_uri,params.embeddings_post_and_comments_path,params.cpu_count,params.host_name,params.weight_post_and_comments,params.bucket_output,params.embeddings_subreddit_path,params.memory_total,params.agg_style,params.embeddings_bucket,tags.mlflow.source.git.commit,tags.mlflow.user,tags.mlflow.source.name,tags.mlflow.source.type,tags.host_name
0,badc44b0e5ac467da14f710da0b410c6,35,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts,2022-08-16 08:41:53.006000+00:00,2022-08-31 04:01:44.575000+00:00,515.0,820.674805,3.698822,1197814.0,544.288655,15.926672,1444961.0,515.0,771760.0,514.0,771760.0,774858.0,515.0,96.0,515.0,51906348.0,51906348.0,771760.0,0.536248,15.926672,v0.6.0_mUSE_aggregates,0.15,sqlite,i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all/embedding/2022-08-11_084218,96,djb-100-2021-04-28-djb-eda-german-subs,0.85,i18n-subreddit-clustering,i18n_topic_model_batch/runs/20220811/subreddits/text/embedding/2022-08-11_082859,1444961,dask_delayed,i18n-subreddit-clustering,df6a30d80cfe36c1badb1531c7cbae7dd1046f21,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,LOCAL,djb-100-2021-04-28-djb-eda-german-subs
1,ca79765b72c5428395b02926612d85fd,35,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/35/ca79765b72c5428395b02926612d85fd/artifacts,2022-08-16 08:41:31.162000+00:00,2022-08-31 03:13:27.187000+00:00,,,3.719202,1175788.0,,,1444961.0,,771760.0,514.0,,442282.0,515.0,96.0,,51906348.0,,,0.306086,,v0.6.0_mUSE_aggregates,0.15,sqlite,i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all/embedding/2022-08-11_084218,96,djb-100-2021-04-28-djb-eda-german-subs,0.85,i18n-subreddit-clustering,i18n_topic_model_batch/runs/20220811/subreddits/text/embedding/2022-08-11_082859,1444961,serial,i18n-subreddit-clustering,df6a30d80cfe36c1badb1531c7cbae7dd1046f21,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,LOCAL,djb-100-2021-04-28-djb-eda-german-subs
2,7552abcd785d4e229c6272aebf1beaf3,35,FAILED,gs://i18n-subreddit-clustering/mlflow/mlruns/35/7552abcd785d4e229c6272aebf1beaf3/artifacts,2022-08-16 08:35:25.596000+00:00,2022-08-16 08:40:12.627000+00:00,,,0.734633,1281046.0,,,1444961.0,,771760.0,514.0,,59844.0,515.0,96.0,,11644466.0,,,0.041416,,v0.6.0_mUSE_aggregates,0.15,sqlite,i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all/embedding/2022-08-11_084218,96,djb-100-2021-04-28-djb-eda-german-subs,0.85,i18n-subreddit-clustering,i18n_topic_model_batch/runs/20220811/subreddits/text/embedding/2022-08-11_082859,1444961,dask_delayed,i18n-subreddit-clustering,df6a30d80cfe36c1badb1531c7cbae7dd1046f21,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,LOCAL,djb-100-2021-04-28-djb-eda-german-subs
3,15b78241482e492cae90644e9733b50c,35,FAILED,gs://i18n-subreddit-clustering/mlflow/mlruns/35/15b78241482e492cae90644e9733b50c/artifacts,2022-08-16 07:37:37.787000+00:00,2022-08-16 07:43:09.168000+00:00,,,,1430384.0,,,1444961.0,,771760.0,514.0,,2822.0,,96.0,,,,,0.001953,,,0.15,,i18n_topic_model_batch/runs/20220811/post_and_comment_text_combined/text_all/embedding/2022-08-11_084218,96,djb-100-2021-04-28-djb-eda-german-subs,0.85,,i18n_topic_model_batch/runs/20220811/subreddits/text/embedding/2022-08-11_082859,1444961,,i18n-subreddit-clustering,df6a30d80cfe36c1badb1531c7cbae7dd1046f21,jupyter,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,LOCAL,djb-100-2021-04-28-djb-eda-german-subs


In [8]:
run_uuid = 'badc44b0e5ac467da14f710da0b410c6'

# Check run artifacts

In [9]:
l_artifacts_top_level = mlf.list_run_artifacts(
    run_id=run_uuid,
    only_top_level=True,
    verbose=True,
)
len(l_artifacts_top_level)

08:11:54 | INFO | "   219 <- Artifacts to check count"
08:11:54 | INFO | "   219 <- Artifacts clean count"
08:11:54 | INFO | "     5 <- Artifacts & folders at TOP LEVEL clean count"


5

In [10]:
l_artifacts_all = mlf.list_run_artifacts(
    run_id=run_uuid,
    only_top_level=False,
    verbose=False,
)
len(l_artifacts_all)

08:12:01 | INFO | "   219 <- Artifacts clean count"
08:12:01 | INFO | "     5 <- Artifacts & folders at TOP LEVEL clean count"


219

In [11]:
l_artifacts_top_level

['df_posts_agg_c1',
 'df_subs_agg_c1',
 'df_subs_agg_c1_ndjson',
 'df_subs_agg_c1_unweighted',
 'df_subs_agg_c1_unweighted_ndjson']

In [12]:
l_sub_c = [i for i in l_artifacts_all if 'df_subs_agg_c1' in i]
print(len(l_sub_c))
l_sub_c[:6]

14


['mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_subs_agg_c1/_common_metadata',
 'mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_subs_agg_c1/_metadata',
 'mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_subs_agg_c1/part.0.parquet',
 'mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_subs_agg_c1/part.1.parquet',
 'mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_subs_agg_c1/part.2.parquet',
 'mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_subs_agg_c1/part.3.parquet']

In [13]:
l_post_c = [i for i in l_artifacts_all if 'df_posts_agg_c1' in i]
print(len(l_post_c))
l_post_c[:6]

205


['mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_posts_agg_c1/_common_metadata',
 'mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_posts_agg_c1/_metadata',
 'mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_posts_agg_c1/part.0.parquet',
 'mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_posts_agg_c1/part.1.parquet',
 'mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_posts_agg_c1/part.10.parquet',
 'mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_posts_agg_c1/part.100.parquet']

# Load aggregated embeddings

use `gsutil` to download embeddings for posts b/c that can take a LONG time to download sequentially. `gsutil` makes parallel downloaidng much faster and reports download speeds above 500MB / s:
```bash
ents_sub_desc/part.67.parquet...
/ [2/197 files][ 61.7 GiB/ 75.4 GiB]  81% Done 632.0 MiB/s ETA 00:00:22
```

In [14]:
%%time

df_agg_sub_c = mlf.read_run_artifact(
    run_id=run_uuid,
    artifact_folder='df_subs_agg_c1',
    read_function='pd_parquet',
    verbose=False,
)
print(df_agg_sub_c.shape)

08:12:07 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/35/badc44b0e5ac467da14f710da0b410c6/artifacts/df_subs_agg_c1"
100%|########################################| 14/14 [00:00<00:00, 51554.22it/s]
08:12:07 | INFO | "  Parquet files found:     4"
08:12:07 | INFO | "  Parquet files to use:     4"


(771760, 515)
CPU times: user 9.88 s, sys: 3.61 s, total: 13.5 s
Wall time: 7.63 s


In [15]:
df_agg_sub_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 771760 entries, 0 to 771759
Columns: 515 entries, subreddit_id to embeddings_511
dtypes: float32(512), int64(1), object(2)
memory usage: 1.5+ GB


In [16]:
df_agg_sub_c.iloc[:5, :25]

Unnamed: 0,subreddit_id,subreddit_name,posts_for_embeddings_count,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21
0,t5_1001tl,jewel_xo,1,-0.013827,-0.022239,0.049441,0.04947,-0.003573,0.0403,-0.017904,0.008067,-0.037719,-0.002597,0.008067,0.063312,0.014693,-0.042116,0.039357,-0.001491,0.068194,-0.0228,0.044375,0.017662,0.055988,-0.039419
1,t5_10029e,milkyhentai,1,-0.023227,-0.002677,0.031942,-0.010885,-0.02366,0.031216,0.044461,-0.024292,0.015413,0.047858,0.066368,0.076699,-0.040204,-0.004243,-0.048563,0.01012,0.024157,-0.020109,0.05592,0.015352,0.033357,0.010606
2,t5_1006k8,badwouldyourather,1,-0.032487,0.024979,-0.021948,0.039006,0.053261,0.037567,0.036113,0.011514,0.012002,0.020055,0.052276,0.02545,0.050676,-8e-06,-0.005012,0.000559,0.058759,-0.002293,0.010347,0.009482,0.024522,-0.02372
3,t5_100806,jojojosiah,2,0.004711,0.005103,0.037912,0.023591,0.02459,0.029586,-0.012185,-0.031729,-0.016308,0.063303,-0.015289,0.008682,0.008985,0.006243,-0.000484,0.015052,0.003276,0.002508,0.009955,-0.004335,0.001302,-0.016699
4,t5_1009a3,memesenespanol,380,0.003731,-0.013876,-0.003987,0.002683,-0.010202,0.038552,0.012759,0.016535,-0.056693,0.001183,0.009329,-0.005247,0.00963,-0.001513,0.000606,-0.004258,-0.005582,-0.002777,-0.001939,0.002463,-0.003895,-0.006126


In [17]:
df_agg_sub_c.iloc[-10:, :25]

Unnamed: 0,subreddit_id,subreddit_name,posts_for_embeddings_count,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21
771750,t5_6qo047,scienceno488,0,0.049827,0.008324,0.033611,0.032743,0.025569,0.08428,-0.016352,-0.054408,-0.030566,-0.041512,-0.073808,0.016681,-0.042703,0.033103,-0.078955,-0.012576,-0.057177,-0.055155,-0.023454,-0.001195,-0.053379,0.017656
771751,t5_6phqtd,jessivann,0,-0.03014,-0.02967,0.020029,0.082169,0.044654,0.040252,-0.059091,-0.060705,-0.063691,-0.000536,0.060795,0.091693,-0.052438,-0.0421,-0.031989,0.03837,-0.0137,-0.020486,0.059314,0.009449,0.082631,-0.055906
771752,t5_6ttq8w,emiii,0,-0.014652,-0.025286,0.043648,0.031067,0.015594,0.04937,-0.0435,-0.038448,-0.079371,0.045315,0.05531,0.089274,-0.030252,0.011965,-0.010014,-0.002786,0.049631,0.012006,0.05387,0.002287,0.051208,0.021202
771753,t5_6hetta,tinomantana,0,-0.039023,0.00349,0.019,0.030172,0.045525,0.027864,-0.06264,-0.071491,-0.060166,0.022435,0.082675,0.084102,-0.073745,0.023707,-0.021793,-0.006323,0.015783,-0.057229,0.059388,0.006303,0.070736,-0.043325
771754,t5_6t85bh,twistedspun,0,-0.041944,0.011073,0.008569,0.025391,0.04935,0.047871,-0.018863,0.007396,-0.059508,0.070213,0.088125,0.106481,-0.048567,0.022983,-0.02309,-0.011047,0.04233,-0.022907,0.015026,0.017319,0.03738,0.012689
771755,t5_wi2zh,autocadmemes,0,-0.031634,-0.03242,-0.076026,-0.027858,0.026071,-0.07847,0.050454,0.065865,-0.03423,-0.037524,0.073746,-0.062604,-0.046956,-0.072842,-0.072843,0.040903,0.066408,0.024241,-0.073683,0.017086,-0.057149,0.00301
771756,t5_6rhnln,gamestopcensorship,0,-0.03322,-0.046152,0.040127,-0.071822,0.037267,0.045015,-0.014639,0.002809,-0.014747,-0.03038,0.042313,-0.050563,-0.068255,-0.057389,0.008784,-0.008465,0.00622,-0.029101,-0.070836,-0.01327,0.021226,0.000765
771757,t5_6pg8i2,coffeepoblog,0,-0.023961,-0.00217,0.044053,-0.000543,0.048995,0.034303,-0.014573,0.051975,-0.068334,-0.032103,-0.026937,-0.056592,-0.054808,0.002123,-0.082676,0.071959,-0.061371,-0.02176,0.016584,-0.02959,0.019676,-0.072884
771758,t5_6cv9bz,threadtreatment,0,-0.020639,0.030801,-0.007754,0.075224,0.028055,0.026198,0.027853,0.057297,-0.058218,0.045436,0.031521,0.02764,-0.053154,0.035169,-0.021854,-0.052122,0.019887,-0.027726,-0.095366,-0.017542,-0.018356,-0.046369
771759,t5_4la7jo,spongebobcringememes,0,-0.004059,0.013185,-0.059028,0.054473,0.048189,0.0258,0.028421,0.038999,-0.09201,0.065643,0.035775,-0.049399,-0.010349,-0.042322,-0.045216,-0.043074,0.044714,-0.033876,0.0506,0.025916,0.04787,5.7e-05


# Filter subreddits to use in ANN index

In a previous version we only kept subs that had embeddings AND clustering data. 
<br>Now that we cover 700k subreddits for v0.6.0, we need to be more thoughtfula bout how we'll select which subs to keep for ANN.

For v0.6.0 we'll keep only subs that have 5+ posts in L90 days. From this mode dashboard we expect that number to be around 250k subreddits.

Mode Dashboard: https://app.mode.com/reddit/reports/e6cde33162c4 

## Load metdata to apply other filters [optional]

If we want to filter subreddits based on other data, we'll need to pull data from mlflow or BigQuery.


In [18]:
# run_id_final_model = ''

In [19]:
# l_artifacts_top_level = mlf.list_run_artifacts(
#     run_id=run_id_final_model,
#     only_top_level=True,
#     verbose=True,
# )
# len(l_artifacts_top_level)

In [20]:
# l_artifacts_all = mlf.list_run_artifacts(
#     run_id=run_id_final_model,
#     only_top_level=False,
#     verbose=False,
# )
# len(l_artifacts_all)

In [21]:
# l_artifacts_top_level

In [22]:
# l_sub_c = [i for i in l_artifacts_all if 'df_labels' in i]
# print(len(l_sub_c))
# l_sub_c[:6]

In [23]:
# df_labels = mlf.read_run_artifact(
#     run_id=run_id_final_model,
#     artifact_folder='df_labels',
#     read_function='pd_parquet',
#     verbose=False,
# )
# print(df_labels.shape)

In [24]:
# df_labels.iloc[:5, :15]

## Apply filters

In v0.6.0, we already have the number of posts for embedding in the embedding file, so we don't need to load additional files to apply filters

In [25]:
# use df_pc_counts because it has the counts for post+comment after filtering for length
value_counts_and_pcts(
  pd.cut(
      df_agg_sub_c['posts_for_embeddings_count'],
      bins=[-1, 0, 1, 2, 3, 4, 5, 49, np.inf],
      labels=[
        "00 posts", "01 post", '02 posts', '03 posts',
        '04 posts', '05 posts'
        , '06-49 posts', '50+ posts'
      ]
  ).rename('posts_with_len_3+'),
  sort_index=True,
  add_col_prefix=False,
  count_type='subreddits',
  sort_index_ascending=False,
  cumsum_count=True,
  reset_index=True,
).hide_index().set_caption(f"<h4 align='left'>Post distribution for subreddits with 1 view & 1 attempted post in L90-days</h4>")

posts_with_len_3+,subreddits_count,percent_of_subreddits,cumulative_sum_of_subreddits,cumulative_percent_of_subreddits
50+ posts,72510,9.4%,72510,9.4%
06-49 posts,154858,20.1%,227368,29.5%
05 posts,23205,3.0%,250573,32.5%
04 posts,33084,4.3%,283657,36.8%
03 posts,56898,7.4%,340555,44.1%
02 posts,125070,16.2%,465625,60.3%
01 post,240338,31.1%,705963,91.5%
00 posts,65797,8.5%,771760,100.0%


In [26]:
df_agg_sub_c_raw = df_agg_sub_c.copy()

df_agg_sub_c = df_agg_sub_c[df_agg_sub_c['posts_for_embeddings_count'] >= 5]
df_agg_sub_c.shape

(250573, 515)

# Build annoy index

I created a custom `AnnoyIndex` class with some extra methods to create outputs & (and calculate cosine distance) for BigQuery.

In [27]:
from subclu.models.nn_annoy import AnnoyIndex

In [28]:
%%time

index_cols = ['subreddit_id', 'subreddit_name']
l_embedding_cols = [c for c in df_agg_sub_c.columns if c.startswith('embeddings_')]
n_trees = 200

nn_index = AnnoyIndex(
    df_agg_sub_c[l_embedding_cols + index_cols],
    index_cols=index_cols,
    metric='angular',
    n_trees=n_trees,
)

nn_index.build()

CPU times: user 1h 1min 57s, sys: 40.4 s, total: 1h 2min 37s
Wall time: 1min 2s


## Test `search_k`
`search_k=-1` will search all trees and get the most accurate results but it will take longer to compute.

Even with small changes we can see in the examples below that there is a time difference and sometimes even in the top10 results we will miss a neighbor when we set k<=3 (only search k=3 -> only search 3 trees).

In [29]:
%%time

n_test_i = 212
nn_index.get_top_n_by_item(n_test_i, k=9, search_k=-1, include_distances=True)

CPU times: user 275 ms, sys: 72 ms, total: 347 ms
Wall time: 346 ms


Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance
0,t5_10dzqu,godawfulmovies,0,t5_10dzqu,godawfulmovies,0.0
1,t5_10dzqu,godawfulmovies,1,t5_me7ba,podcastsharing,0.69861
2,t5_10dzqu,godawfulmovies,2,t5_2u29p,filmjunk,0.705137
3,t5_10dzqu,godawfulmovies,3,t5_2c7q0h,podcastpromoting,0.716215
4,t5_10dzqu,godawfulmovies,4,t5_n99oj,findthepathpodcast,0.716643
5,t5_10dzqu,godawfulmovies,5,t5_t6jv7,sinisterhood,0.716963
6,t5_10dzqu,godawfulmovies,6,t5_2zzeu,highersidechats,0.720665
7,t5_10dzqu,godawfulmovies,7,t5_np3is,letsgo2courtpodcast,0.721888
8,t5_10dzqu,godawfulmovies,8,t5_2t8p3,wehatemovies,0.723386


In [30]:
%%time
nn_index.get_top_n_by_item(n_test_i, k=9, search_k=1, include_distances=True)

CPU times: user 245 ms, sys: 12 µs, total: 245 ms
Wall time: 244 ms


Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance
0,t5_10dzqu,godawfulmovies,0,t5_10dzqu,godawfulmovies,0.0
1,t5_10dzqu,godawfulmovies,1,t5_2u29p,filmjunk,0.705137
2,t5_10dzqu,godawfulmovies,2,t5_2c7q0h,podcastpromoting,0.716215
3,t5_10dzqu,godawfulmovies,3,t5_n99oj,findthepathpodcast,0.716643
4,t5_10dzqu,godawfulmovies,4,t5_2zzeu,highersidechats,0.720665
5,t5_10dzqu,godawfulmovies,5,t5_np3is,letsgo2courtpodcast,0.721888
6,t5_10dzqu,godawfulmovies,6,t5_35xxi9,headgumpodcast,0.738195
7,t5_10dzqu,godawfulmovies,7,t5_2vo38,harmontown,0.741415
8,t5_10dzqu,godawfulmovies,8,t5_26gz8w,theteamhouse,0.746122


In [35]:
top_k_test_ = 20
cols_drop_ = ['subreddit_id_a', 'subreddit_id_b', 'distance']
cols_append_ = ['subreddit_name_b',]
df_compare_sk = nn_index.get_top_n_by_item(
    n_test_i, k=top_k_test_, search_k=-1, include_distances=True
).drop(cols_drop_, axis=1)

for k_ in [int(0.998 * n_trees), int(0.85 * n_trees), 
           int(0.5 * n_trees), min([200, int(0.1 * n_trees)]),
           1]:
    df_compare_sk = pd.concat(
        [
            df_compare_sk,
            nn_index.get_top_n_by_item(
                n_test_i, k=top_k_test_, search_k=k_, include_distances=True
            )[cols_append_].rename(columns={c: f"{c}_{k_}" for c in df_compare_sk.columns})
        ],
        axis=1,
    )
df_compare_sk

Unnamed: 0,subreddit_name_a,distance_rank,subreddit_name_b,subreddit_name_b_199,subreddit_name_b_170,subreddit_name_b_100,subreddit_name_b_20,subreddit_name_b_1
0,godawfulmovies,0,godawfulmovies,godawfulmovies,godawfulmovies,godawfulmovies,godawfulmovies,godawfulmovies
1,godawfulmovies,1,podcastsharing,filmjunk,filmjunk,filmjunk,filmjunk,filmjunk
2,godawfulmovies,2,filmjunk,podcastpromoting,podcastpromoting,podcastpromoting,podcastpromoting,podcastpromoting
3,godawfulmovies,3,podcastpromoting,findthepathpodcast,findthepathpodcast,findthepathpodcast,findthepathpodcast,findthepathpodcast
4,godawfulmovies,4,findthepathpodcast,highersidechats,highersidechats,highersidechats,highersidechats,highersidechats
5,godawfulmovies,5,sinisterhood,letsgo2courtpodcast,letsgo2courtpodcast,letsgo2courtpodcast,letsgo2courtpodcast,letsgo2courtpodcast
6,godawfulmovies,6,highersidechats,headgumpodcast,headgumpodcast,headgumpodcast,headgumpodcast,headgumpodcast
7,godawfulmovies,7,letsgo2courtpodcast,harmontown,harmontown,harmontown,harmontown,harmontown
8,godawfulmovies,8,wehatemovies,theteamhouse,theteamhouse,theteamhouse,theteamhouse,theteamhouse
9,godawfulmovies,9,weeklyplanetpodcast,headgum,headgum,headgum,headgum,headgum


## Get df with all items

For 80k subreddits it took:

Test a few values of k_search.
```bash
100%|██████████| 81973/81973 [1:17:02<00:00, 17.73it/s]
17:07:23 | INFO | "(8115327, 7) <- df_top_items shape"
```

In [34]:
%%time

for i in tqdm(range(nn_index.n_rows)):
    _ = nn_index.index.get_nns_by_item(
        i,
        n=200,
        search_k=-1,
        include_distances=True
    )

100%|██████████| 250573/250573 [30:55<00:00, 135.07it/s]

CPU times: user 30min 51s, sys: 13.3 s, total: 31min 4s
Wall time: 30min 55s





In [None]:
from typing import Tuple


top_k = 200
l_topk_dfs = list()

for i in tqdm(range(nn_index.n_rows), mininterval=2.5):
    i_nn: Tuple[int, float] = nn_index.index.get_nns_by_item(
        1,
        top_k + 1,
        search_k=-1,
        include_distances=True
    )
    # we need to skip the first item because it's always the self (index number)
    l_topk_dfs.append(
        pd.DataFrame(
            {
                'seed_ix': [i] * top_k,
                'nn_ix': i_nn[0][1:],
                'distance': i_nn[1][1:],
                'distance_rank': [v for v in range(1, top_k + 1)]
            },
        )
    )
logging.info(f"Start combining all dfs together...")
df_nn_top = pd.concat(l_topk_dfs, ignore_index=True)
df_nn_top.shape

 54%|█████▎    | 134613/250573 [18:08<15:19, 126.09it/s]

In [47]:
# print(f"search all (k_search=-1)")
# df_nn_top_old = nn_index.get_top_n_by_item_all(
#     k=201,
#     search_k=-1, 
#     include_distances=True,
#     append_i=True,
# )

In [46]:
# print(f"search all (k_search=-1)")
# df_nn_top_old2 = nn_index.get_top_n_by_item_all(
#     k=101,
#     search_k=-1, 
#     include_distances=True,
#     append_i=True,
# )

# Convert plain distance to cosine similarity
from [github](https://github.com/spotify/annoy/issues/112#issuecomment-686513356)
```
cosine_similarity = 1 - cosine_distance^2/2
```

In [111]:
%%time

col_cosine_similarity = 'cosine_similarity'

if col_cosine_similarity not in df_nn_top:
    df_nn_top[col_cosine_similarity] = (
        1 -
        (df_nn_top['distance'] ** 2) / 2
    )

CPU times: user 252 ms, sys: 272 ms, total: 524 ms
Wall time: 523 ms


In [152]:
df_nn_top.head()

Unnamed: 0,seed_ix,nn_ix,distance,distance_rank,cosine_similarity
0,0,243977,0.815885,1,0.667166
1,0,24691,0.831054,2,0.654674
2,0,1905,0.840645,3,0.646658
3,0,91727,0.842226,4,0.645328
4,0,128576,0.84243,5,0.645156


In [153]:
df_nn_top.tail()

Unnamed: 0,seed_ix,nn_ix,distance,distance_rank,cosine_similarity
50114595,250572,69975,0.920653,196,0.576199
50114596,250572,242456,0.920664,197,0.576189
50114597,250572,231779,0.920789,198,0.576073
50114598,250572,87827,0.920812,199,0.576052
50114599,250572,41140,0.920877,200,0.575993


# Reshape to include subreddit IDs & subreddit names

Also reshape to final JSON output given that it'll be easier to load into GCP that way


In [None]:
TODO

In [86]:
# TODO convert this dictionary into an object in the class itself
d_index_to_sub_id = dict()
d_index_to_sub_name = dict()

# getting subreddit name using a df is EXPENSIVE
n_test_ix_d = 2000
# for i in tqdm(range(len(nn_index.index_labels)), mininterval=2.5):
for i in tqdm(range(n_test_ix_d), mininterval=2):
    d_index_to_sub_id[i] = nn_index.index_labels[i]
    
    d_index_to_sub_name[i] = (
        nn_index.index_labels_df
        .loc[nn_index.index_labels_df['subreddit_id'] == d_index_to_sub_id[i], 'subreddit_name']
        .values[0]
    )

100%|██████████| 2000/2000 [00:30<00:00, 64.73it/s]


In [85]:
# we expect that by location (after resetting index) we should get the same dict, but we need to check
d_index_to_sub_id_ix = dict()
d_index_to_sub_name_ix = dict()

df_labels_reset_index = nn_index.index_labels_df.copy().reset_index(drop=True)

for i in tqdm(range(n_test_ix_d), mininterval=2):
    d_index_to_sub_id_ix[i] = nn_index.index_labels[i]
    d_index_to_sub_name_ix[i] = df_labels_reset_index.iloc[i, 1]

100%|██████████| 2000/2000 [00:00<00:00, 47974.38it/s]


In [101]:
%%time

df_check_sub_ids = pd.DataFrame([d_index_to_sub_id, d_index_to_sub_id_ix], index=['regular', 'index']).T
print(df_check_sub_ids.shape)
np.all(df_check_sub_ids['regular'] == df_check_sub_ids['index'])

(2000, 2)
CPU times: user 114 ms, sys: 18 µs, total: 114 ms
Wall time: 113 ms


True

In [103]:
%%time

df_check_sub_names = pd.DataFrame([d_index_to_sub_name, d_index_to_sub_name_ix], index=['regular', 'index']).T
print(df_check_sub_names.shape)
np.all(df_check_sub_names['regular'] == df_check_sub_names['index'])
assert np.all(df_check_sub_names['regular'] == df_check_sub_names['index'])

(2000, 2)
CPU times: user 113 ms, sys: 15 µs, total: 113 ms
Wall time: 113 ms


In [117]:
del d_index_to_sub_id, d_index_to_sub_id_ix, d_index_to_sub_name, d_index_to_sub_name_ix

### ~Move forward with index-based creation for dictionary~
~It's orders of magnitude faster! and in testing we get the same values~

UPDATE: don't even bother with dictionary & replace, just straight up merge on index. It's 100x faster.

In [148]:
# %%time

# # we expect that by location (after resetting index) we should get the same dict, but we need to check
# d_index_to_sub_id_all = dict()
# d_index_to_sub_name_all = dict()

# df_labels_reset_index = nn_index.index_labels_df.copy().reset_index(drop=True)

# for i in tqdm(range(len(nn_index.index_labels)), mininterval=1):
#     d_index_to_sub_id_all[i] = nn_index.index_labels[i]
#     d_index_to_sub_name_all[i] = df_labels_reset_index.iloc[i, 1]

In [106]:
# %%timeit
# # replacing the index with the subreddit ID seems to be the most expensive part
# #. Even for ~1000 subreddits it can take 10+ seconds
# _ = df_nn_top_final.head(1000)['seed_ix'].replace(d_index_to_sub_id)

In [114]:
df_nn_top.shape

(50114600, 7)

In [151]:
df_nn_top.head()

Unnamed: 0,seed_ix,nn_ix,distance,distance_rank,cosine_similarity
0,0,243977,0.815885,1,0.667166
1,0,24691,0.831054,2,0.654674
2,0,1905,0.840645,3,0.646658
3,0,91727,0.842226,4,0.645328
4,0,128576,0.84243,5,0.645156


### We're going to get all the names & IDs for the ANN because that can be super expensive

Better to do it once for all subs and save it, than one at a time

-- 
- 0 seconds (!!) | merge on index for ID & Name
- 9 seconds | lookup ID + merge for name
- 20 seconds | lookup ID + lookup Name

OMG I wasted so much time, but merging on index is so freaking fast. 
It's faster to lookup the ID and then merge a df with the name+index/

In [146]:
%%time
# fudge, if the "replace" step is the worst thing, then just do a merge on index! 
#. after all, the index of the df (after we reset_index()) is the ID we get from ANNOY!

df_labels_reset_index = nn_index.index_labels_df.copy().reset_index(drop=True)
df_nn_top_final = df_nn_top.copy()
prefix_similar_sub = 'similar'

# append IDs & names for seed & nn (nearest neighbors)
df_nn_top_final = (
    df_labels_reset_index
    .merge(
        df_nn_top_final,
        how='right',
        left_index=True,
        right_on='seed_ix'
    )
)
df_nn_top_final.shape

CPU times: user 3.58 s, sys: 1.82 s, total: 5.4 s
Wall time: 5.4 s


(50114600, 9)

In [147]:
df_nn_top_final.head()

Unnamed: 0,subreddit_id,subreddit_name,seed_ix,nn_ix,distance,distance_rank,ann_dt,model_version,cosine_similarity
0,t5_1009a3,memesenespanol,0,243977,0.815885,1,2022-09-09,0.6.0,0.667166
1,t5_1009a3,memesenespanol,0,24691,0.831054,2,2022-09-09,0.6.0,0.654674
2,t5_1009a3,memesenespanol,0,1905,0.840645,3,2022-09-09,0.6.0,0.646658
3,t5_1009a3,memesenespanol,0,91727,0.842226,4,2022-09-09,0.6.0,0.645328
4,t5_1009a3,memesenespanol,0,128576,0.84243,5,2022-09-09,0.6.0,0.645156


In [140]:
# d_topk_final = dict()
l_topk_final = list()
df_labels_reset_index = nn_index.index_labels_df.copy().reset_index(drop=True)

d_topk_meta = {
    'pt': datetime.utcnow().strftime("%Y-%m-%d"),
    'mlflow_run_id': run_uuid, 
    'model_name': 'cau-text-mUSE',
    'model_version': 'v0.6.0',
}


for seed_ix_, df_seed_ in tqdm(df_nn_top.head(400).groupby(['seed_ix'])):
    # df_seed_['subreddit_id'] = df_seed_['nn_ix'].replace(d_index_to_sub_id_all)
    df_seed_ = df_seed_.merge(
        df_labels_reset_index,
        how='left',
        left_on='nn_ix',
        right_index=True,
    )
    l_topk_final.append(
        {
            **d_topk_meta,
            **{
                'subreddit_id': d_index_to_sub_id_all[seed_ix_],
                'subreddit_name': d_index_to_sub_name_all[seed_ix_],
                'similar_subreddit': {
                    'subreddit_id': list(df_seed_['subreddit_id']),
                    'subreddit_name': list(df_seed_['subreddit_name']),
                    'cosine_similarity': list(df_seed_['cosine_similarity']),
                    'distance_rank': list(df_seed_['distance_rank']),
                }
            }
        }
    )

100%|██████████| 2/2 [00:00<00:00, 66.02it/s]


In [142]:
# l_topk_final[0]

In [121]:
%%time

mask_seed_ = df_nn_top['seed_ix'] == 2
df_seed_ = df_nn_top[mask_seed_]
df_seed_['subreddit_id'] = df_seed_['nn_ix'].replace(d_index_to_sub_id_all)
df_seed_['subreddit_name'] = df_seed_['nn_ix'].replace(d_index_to_sub_name_all)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


CPU times: user 22.5 s, sys: 1.07 s, total: 23.6 s
Wall time: 23.1 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [122]:
df_seed_.head()

Unnamed: 0,seed_ix,nn_ix,distance,distance_rank,ann_dt,model_version,cosine_similarity,subreddit_id,subreddit_name
400,2,243977,0.815885,1,2022-09-09,0.6.0,0.667166,t5_o07w2,quackity
401,2,24691,0.831054,2,2022-09-09,0.6.0,0.654674,t5_2rmzr,cyr
402,2,1905,0.840645,3,2022-09-09,0.6.0,0.646658,t5_13s5fy,moistcr1tikal
403,2,91727,0.842226,4,2022-09-09,0.6.0,0.645328,t5_3w6bst,thehoa
404,2,128576,0.84243,5,2022-09-09,0.6.0,0.645156,t5_5d0ktc,fellaspodcast


In [125]:
%%time
for seed_ix_, df_seed_ in tqdm(df_nn_top.head(1000).groupby(['seed_ix'])):
    print(seed_ix_)
    print(df_seed_.shape)

100%|██████████| 5/5 [00:00<00:00, 3313.56it/s]

0
(200, 7)
1
(200, 7)
2
(200, 7)
3
(200, 7)
4
(200, 7)
CPU times: user 2.96 ms, sys: 3.99 ms, total: 6.95 ms
Wall time: 5.7 ms





## Check some example outputs

In [45]:
df_nn_top.head()

Unnamed: 0,seed_ix,nn_ix,distance,distance_rank,ann_dt,model_version
0,0,243977,0.815885,1,2022-09-09,0.6.0
1,0,24691,0.831054,2,2022-09-09,0.6.0
2,0,1905,0.840645,3,2022-09-09,0.6.0
3,0,91727,0.842226,4,2022-09-09,0.6.0
4,0,128576,0.84243,5,2022-09-09,0.6.0


In [65]:
df_nn_top_final.head()

Unnamed: 0_level_0,subreddit_name_a,seed_ix,nn_ix,distance,distance_rank,ann_dt,model_version
subreddit_id_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
t5_1009a3,memesenespanol,0,243977,0.815885,1,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,24691,0.831054,2,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,1905,0.840645,3,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,91727,0.842226,4,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,128576,0.84243,5,2022-09-09,0.6.0


In [66]:
(
    df_nn_top_final[df_nn_top_final['subreddit_name_a'] == 'ich_iel']
    .head(15)
)

Unnamed: 0_level_0,subreddit_name_a,seed_ix,nn_ix,distance,distance_rank,ann_dt,model_version
subreddit_id_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [67]:
(
    df_nn_top_final[df_nn_top_final['subreddit_name_a'] == 'vegetarischde']
    .head(15)
)

Unnamed: 0_level_0,subreddit_name_a,seed_ix,nn_ix,distance,distance_rank,ann_dt,model_version
subreddit_id_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [None]:
(
    df_nn_top_final[df_nn_top_final['subreddit_name_a'] == 'antivegan']
    .head(15)
)

In [68]:
(
    df_nn_top_final[df_nn_top_final['subreddit_name_a'] == 'mexico']
    .head(15)
)

Unnamed: 0_level_0,subreddit_name_a,seed_ix,nn_ix,distance,distance_rank,ann_dt,model_version
subreddit_id_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [72]:
(
    df_nn_top_final[df_nn_top_final['subreddit_name_a'] == 'memesenespanol']
    .head(15)
)

Unnamed: 0_level_0,subreddit_name_a,seed_ix,nn_ix,distance,distance_rank,ann_dt,model_version
subreddit_id_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
t5_1009a3,memesenespanol,0,243977,0.815885,1,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,24691,0.831054,2,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,1905,0.840645,3,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,91727,0.842226,4,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,128576,0.84243,5,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,55750,0.847682,6,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,3038,0.849107,7,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,591,0.849913,8,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,17592,0.851358,9,2022-09-09,0.6.0
t5_1009a3,memesenespanol,0,1331,0.852055,10,2022-09-09,0.6.0


In [69]:
(
    df_nn_top_final[df_nn_top_final['subreddit_name_a'] == 'de']
    .head(15)
)

Unnamed: 0_level_0,subreddit_name_a,seed_ix,nn_ix,distance,distance_rank,ann_dt,model_version
subreddit_id_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [None]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'ich_iel']
    .head(15)
)

In [None]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'cfb']
    .head(15)
)

In [None]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'finanzen']
    .head(15)
)

# Add dt/pt column & metadata columns
- ann dt

In [None]:
df_nn_top['ann_dt'] = pd.to_datetime(pd.to_datetime(datetime.utcnow()).date())

In [None]:
df_nn_top['model_version'] = '0.6.0'

In [None]:
df_nn_top.tail()

In [None]:
df_nn_top.info()

# Save to local & log to Mlflow

Instead of saving it to random location in GCS, save artifact locally & then log it to mlflow job as a new artifact.

Make sure to append a timestamp in case we try different ANN approaches


In [None]:
manual_model_timestamp = datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')
path_this_model = get_project_subfolder(
    f"data/models/ann/manual_v060_{manual_model_timestamp}"
)
Path.mkdir(path_this_model, parents=True, exist_ok=True)
path_this_model

In [None]:
%%time

save_pd_df_to_parquet_in_chunks(
    df_nn_top,
    path_this_model / 'df_index_only',
    write_index=False
)

# Save df to GCS

From here we should be able to share & create a BigQuery table

In [None]:
BREAK

In [66]:
import pyarrow as pa

In [86]:
gs_nearest_neighbors_root = 'gs://i18n-subreddit-clustering/data/models/nearest_neighbors'
gs_this_model = f"{gs_nearest_neighbors_root}/manual_model_{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}"
gs_this_model

'gs://i18n-subreddit-clustering/data/models/nearest_neighbors/manual_model_2022-08-01_181632'

In [87]:
%%time
shape_ = df_nn_top.shape

df_nn_top.to_parquet(
    f"{gs_this_model}/df_nearest_neighbors_top_full-{shape_[0]}_by_{shape_[1]}.parquet"
)

CPU times: user 5.09 s, sys: 457 ms, total: 5.54 s
Wall time: 10.1 s


## Create schema for BigQuery parquet file

We'll save 1 file with all the raw data and a different file with the data needed to create a BQ table.

We'll also create a schema for the BQ file so that BQ can display some metadata about the tables

In [137]:
from google.cloud import bigquery


l_cols_for_bq = [
    'subreddit_id_a',
    'subreddit_name_a',
    'distance_rank',
    'subreddit_id_b',
    'subreddit_name_b',
    'cosine_similarity',
    'ann_dt',
    'model_version',
]

l_bq_json_schema = [
    bigquery.SchemaField(
        mode="REQUIRED",
        name="subreddit_id_a",
        field_type="STRING",
        description="Seed subreddit ID. Seed = subreddit for which you want to find N most similar subs"
    ),
    bigquery.SchemaField(
        mode="REQUIRED",
        name="subreddit_name_a",
        field_type="STRING",
        description="Seed subreddit name"
    ),
    bigquery.SchemaField(
        mode="NULLABLE",
        name="distance_rank",
        field_type="INTEGER",
        description="Rank for how closse subreddit_A is subreddit_B. 1=closest, 2=2nd closest"
    ),
    bigquery.SchemaField(
        mode="REQUIRED",
        name="subreddit_id_b",
        field_type="STRING",
        description="Subreddit ID for most similar subs to subreddit_a"
    ),
    bigquery.SchemaField(
        mode="REQUIRED",
        name="subreddit_name_b",
        field_type="STRING",
        description="Subreddit B name"
    ),
    bigquery.SchemaField(
        mode="REQUIRED",
        name="cosine_similarity",
        field_type="FLOAT",
        description="Similarity between sub-a & sub-b. 1=identical, 0=not related, -1=opposite"
    ),
    bigquery.SchemaField(
        mode="NULLABLE",
        name="ann_dt",
        field_type="TIMESTAMP",
        description="Date that approximate nearest neighbors file was created"
    ),
    bigquery.SchemaField(
        mode="NULLABLE",
        name="model_version",
        field_type="STRING",
        description="Version for CAU model that created the embeddings to compare subreddit distance"
    )
]
d_metadata_description = dict()
for sf_ in l_bq_json_schema:
    d_metadata_description[sf_.name] = sf_.description

pa_bq_schema = pa.schema(
    [
        pa.field(
            'subreddit_id_a', pa.string(),
            nullable=False,
            # This metadata doesn't get ready by BigQuery :/
            # metadata={'description': "Seed subreddit ID. Seed = subreddit for which you want to find N most similar subs"},
        ),
        pa.field(
            'subreddit_name_a', pa.string(),
        ),
        ('distance_rank', pa.int32()),
        ('subreddit_id_b', pa.string()),
        ('subreddit_name_b', pa.string()),
        ('cosine_similarity', pa.float64()),
        ('ann_dt', pa.timestamp('ns')),
        ('model_version', pa.string()),
    ],
    metadata=d_metadata_description,
)
pa_bq_schema

subreddit_id_a: string not null
subreddit_name_a: string
distance_rank: int32
subreddit_id_b: string
subreddit_name_b: string
cosine_similarity: double
ann_dt: timestamp[ns]
model_version: string
-- schema metadata --
subreddit_id_a: 'Seed subreddit ID. Seed = subreddit for which you want t' + 26
subreddit_name_a: 'Seed subreddit name'
distance_rank: 'Rank for how closse subreddit_A is subreddit_B. 1=closest' + 15
subreddit_id_b: 'Subreddit ID for most similar subs to subreddit_a'
subreddit_name_b: 'Subreddit B name'
cosine_similarity: 'Similarity between sub-a & sub-b. 1=identical, 0=not ' + 20
ann_dt: 'Date that approximate nearest neighbors file was created'
model_version: 'Version for CAU model that created the embeddings to comp' + 22

In [133]:
# pa_bq_schema.metadata

In [138]:
%%time
shape_ = df_nn_top[l_cols_for_bq].shape
path_bq = f"{gs_this_model}/df_nearest_neighbors_top_bigquery-{shape_[0]}_by_{shape_[1]}.parquet"
print(path_bq)

df_nn_top[l_cols_for_bq].to_parquet(
    path_bq,
    engine='pyarrow',
    schema=pa_bq_schema,
)

gs://i18n-subreddit-clustering/data/models/nearest_neighbors/manual_model_2022-08-01_181632/df_nearest_neighbors_top_bigquery-8115327_by_8.parquet
CPU times: user 5.22 s, sys: 413 ms, total: 5.63 s
Wall time: 8.29 s


# Upload to BQ

using `bq load` won't work with a JSON schema in BQ.

Instead, let's try using the python client. NOTE: we'll need to get the right authentication in the VM that has the correct read & write access, e.g.,:
```bash
# login
gcloud auth application-default login

# logout
gcloud auth application-default revoke
```

In [142]:
## This works, but won't read the "description" or other nice metadata from parquet files

# !bq load \
#     --source_format=PARQUET \
#     --project_id=reddit-employee-datasets \
#     david_bermejo.subclu_v0050_subreddit_distances_c_top_100 \
#     "gs://i18n-subreddit-clustering/data/models/nearest_neighbors/manual_model_2022-08-01_181632/df_nearest_neighbors_top_bigquery-8115327_by_8.parquet"

In [141]:
%%time

client = bigquery.Client()

# Set table_id to the ID of the table to create.
table_id = "reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_distances_c_top_100"

# WRITE_TRUNCATE -> replace
# WRITE_APPEND -> append data
job_config = bigquery.LoadJobConfig(
    schema=l_bq_json_schema,
    source_format=bigquery.SourceFormat.PARQUET,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
uri = "gs://i18n-subreddit-clustering/data/models/nearest_neighbors/manual_model_2022-08-01_181632/df_nearest_neighbors_top_bigquery-8115327_by_8.parquet"

load_job = client.load_table_from_uri(
    uri,
    table_id,
    location="US",  # Must match the destination dataset location.
    job_config=job_config,
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

destination_table = client.get_table(table_id)
print("Loaded {} rows.".format(destination_table.num_rows))


Loaded 8115327 rows.
CPU times: user 46.7 ms, sys: 124 ms, total: 171 ms
Wall time: 1min 8s


# Appendix & Scratch

In [None]:
BREAK

In [18]:
# %%time
# # use gsutil to download post-level embeddings b/c it'll be much faster to run it in parallel

# remote_key =  'mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_subs_agg_c1'

# # Need to remove the last part of the local path otherwise we'll get duplicate subfolders:
# #. top/2021-12-14/2021-12-14 instead of top/2021-12-14
# local_f = f"/home/jupyter/subreddit_clustering_i18n/data/local_cache/{'/'.join(remote_key.split('/')[:-1])}"
# Path(local_f).mkdir(parents=True, exist_ok=True)
# remote_gs_path = f"gs://i18n-subreddit-clustering/{remote_key}"
# print(f"Remote path:\n  {remote_gs_path}")
# print(f"Local path:\n  {local_f}")

# # `-n` flag means "no clober", so it should skip existing files (only copy new files)
# !gsutil -m cp -r -n $remote_gs_path $local_f

In [21]:
## We'll do posts in a separate notebook
# %%time

# df_agg_posts_c = mlf.read_run_artifact(
#     run_id=run_uuid,
#     artifact_folder='df_post_level_agg_c_post_comments_sub_desc',
#     read_function='pd_parquet',
#     verbose=False,
# )
# print(df_agg_posts_c.shape)

In [62]:
%%time

df_nn_top_final = df_nn_top.copy()

# append IDs & names for seed & nn (nearest neighbors)
df_nn_top_final = (
    nn_index.index_labels_df
    .rename(columns={c: f"{c}_a" for c in nn_index.index_labels_df.columns})
    .merge(
        df_nn_top_final.head(1000)
        .assign(subreddit_id_a=df_nn_top_final.head(1000)['seed_ix'].replace(d_index_to_sub_id)),
        how='right',
        on='subreddit_id_a'
    )
)
df_nn_top_final.shape

CPU times: user 10.6 s, sys: 468 ms, total: 11.1 s
Wall time: 11 s


(1000, 8)

In [63]:
%%time

df_nn_top_final = df_nn_top.copy()

# append IDs & names for seed & nn (nearest neighbors)
df_nn_top_final = (
    nn_index.index_labels_df
    .rename(columns={c: f"{c}_a" for c in nn_index.index_labels_df.columns})
    .set_index('subreddit_id_a')
    .merge(
        df_nn_top_final.head(1000)
        .assign(subreddit_id_a=df_nn_top_final.head(1000)['seed_ix'].replace(d_index_to_sub_id))
        .set_index('subreddit_id_a')
        ,
        how='right',
        left_index=True,
        right_index=True,
    )
)
df_nn_top_final.shape

CPU times: user 10.6 s, sys: 420 ms, total: 11.1 s
Wall time: 11 s


(1000, 7)

In [133]:
# This is 2x slower because it looks up sub_id & sub_name in series. 
#. Instead: lookup sub_id and do a df.merge() to get sub_name
d_topk_final = dict()
l_topk_final = list()

d_topk_meta = {
    'pt': datetime.utcnow().strftime("%Y-%m-%d"),
    'mlflow_run_id': run_uuid, 
    'model_name': 'cau-text-mUSE',
    'model_version': 'v0.6.0',
}


for seed_ix_, df_seed_ in tqdm(df_nn_top.head(400).groupby(['seed_ix'])):
    df_seed_['subreddit_id'] = df_seed_['nn_ix'].replace(d_index_to_sub_id_all)
#     df_seed_ = df_seed_.merge()
    l_topk_final.append(
        {
            **d_topk_meta,
            **{
                'subreddit_id': d_index_to_sub_id_all[seed_ix_],
                'subreddit_name': d_index_to_sub_name_all[seed_ix_],
                'similar_subreddit': {
                    'subreddit_id': list(df_seed_['subreddit_id']),
                    # 'subreddit_name': list(df_seed_['nn_ix'].replace(d_index_to_sub_name_all)),
                    'cosine_similarity': list(df_seed_['cosine_similarity']),
                    'distance_rank': list(df_seed_['distance_rank']),
                }
            }
        }
    )

100%|██████████| 2/2 [00:19<00:00,  9.82s/it]
