# Purpose


### 2022-08-01
Calculating precise nearest neighbors has become too expensive as we go over 40k subreddits. So instead let's calculate approx nearest neighbors (ANN). 

In this notebook we use [ANNOY](https://github.com/spotify/annoy).  Main reason for using annoy over FAISS is that annoy has official wheels in pypi, but FAISS only officially supports installation from conda. For now we don't want to depend on third-party wheels for FAISS.


# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [115]:
from datetime import datetime
import gc
import os
import logging
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import dask
from dask import dataframe as dd
from tqdm import tqdm

import mlflow
import hydra
import annoy

import subclu
from subclu.models.aggregate_embeddings import (
    AggregateEmbeddings, AggregateEmbeddingsConfig,
    load_config_agg_jupyter, get_dask_df_shape,
)
from subclu.models import aggregate_embeddings_pd

from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.utils.mlflow_logger import MlflowLogger, save_pd_df_to_parquet_in_chunks
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)
from subclu.models.reshape_clusters_v050 import save_fpr_json


print_lib_versions([annoy, dask, hydra, mlflow, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
annoy		v: 1.17.0
dask		v: 2021.06.0
hydra		v: 1.1.0
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.5.0


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Set sqlite database as MLflow URI

In [4]:
# use new class to initialize mlflow
mlf = MlflowLogger(tracking_uri='sqlite')
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db'

## Get list of experiments with new function

In [5]:
mlf.list_experiment_meta(output_format='pandas').tail(9)

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
25,25,v0.4.1_mUSE_clustering_new_metrics,gs://i18n-subreddit-clustering/mlflow/mlruns/25,active
26,26,v0.4.1_nearest_neighbors_test,gs://i18n-subreddit-clustering/mlflow/mlruns/26,active
27,27,v0.4.1_nearest_neighbors,gs://i18n-subreddit-clustering/mlflow/mlruns/27,active
28,28,v0.5.0_mUSE_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/28,active
29,29,v0.5.0_mUSE_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/29,active
30,30,v0.5.0_mUSE_clustering_test,gs://i18n-subreddit-clustering/mlflow/mlruns/30,active
31,31,v0.5.0_mUSE_clustering,gs://i18n-subreddit-clustering/mlflow/mlruns/31,active
32,32,v0.5.0_nearest_neighbors_test,gs://i18n-subreddit-clustering/mlflow/mlruns/32,active
33,33,v0.5.0_nearest_neighbors,gs://i18n-subreddit-clustering/mlflow/mlruns/33,active


## Get runs from embeddings aggregation jobs

Want to make sure we can load these artifacts for other jobs

In [7]:
%%time

df_mlf_runs =  mlf.search_all_runs(experiment_ids=[29])
df_mlf_runs.shape

CPU times: user 52.3 ms, sys: 3.87 ms, total: 56.2 ms
Wall time: 55.5 ms


(2, 36)

In [8]:
df_mlf_runs.head()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.time_fxn-full_aggregation_fxn_minutes,metrics.df_v_subs-cols,metrics.df_subs_agg_c1-cols,metrics.time_fxn-df_posts_agg_c1,metrics.time_fxn-df_subs_agg_c1,metrics.memory_used,metrics.memory_free,metrics.memory_used_percent,metrics.df_posts_agg_c1-cols,metrics.df_v_post_comments-rows,metrics.time_fxn-data_loading_time,metrics.memory_total,metrics.cpu_count,metrics.df_v_subs-rows,metrics.df_posts_agg_c1-rows,metrics.df_subs_agg_c1-rows,metrics.df_v_post_comments-cols,params.weight_post_and_comments,params.host_name,params.embeddings_bucket,params.memory_total,params.cpu_count,params.embeddings_subreddit_path,params.embeddings_post_and_comments_path,params.weight_subreddit_meta,tags.host_name,tags.mlflow.source.name,tags.mlflow.source.git.commit,tags.mlflow.user,tags.mlflow.source.type
0,bfe6cbd59a21480c8c2b9923a3a9cbd6,29,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts,2022-07-01 11:42:15.973000+00:00,2022-07-01 14:36:59.617000+00:00,174.728815,514.0,514.0,161.522672,1.536616,166480.0,1381009.0,0.115214,515.0,16360314.0,0.926008,1444961.0,96.0,196371.0,16360314.0,81973.0,515.0,0.85,djb-100-2021-04-28-djb-eda-german-subs,i18n-subreddit-clustering,1444961,96,i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555,i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925,0.15,djb-100-2021-04-28-djb-eda-german-subs,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,4fbcc02af1efc611f6dc25f1ac5369375346aa8c,jupyter,LOCAL
1,03b1c15c736340cc8d33ce75debdc7f7,29,FINISHED,gs://i18n-subreddit-clustering/mlflow/mlruns/29/03b1c15c736340cc8d33ce75debdc7f7/artifacts,2022-07-01 11:38:17.732000+00:00,2022-07-01 11:41:31.509000+00:00,3.209811,514.0,514.0,1.128882,0.008693,110789.0,1381402.0,0.076673,515.0,16360314.0,0.953466,1444961.0,96.0,196371.0,123000.0,573.0,515.0,0.85,djb-100-2021-04-28-djb-eda-german-subs,i18n-subreddit-clustering,1444961,96,i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555,i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925,0.15,djb-100-2021-04-28-djb-eda-german-subs,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,4fbcc02af1efc611f6dc25f1ac5369375346aa8c,jupyter,LOCAL


In [9]:
run_uuid = 'bfe6cbd59a21480c8c2b9923a3a9cbd6'

# Check run artifacts

In [10]:
l_artifacts_top_level = mlf.list_run_artifacts(
    run_id=run_uuid,
    only_top_level=True,
    verbose=True,
)
len(l_artifacts_top_level)

15:22:33 | INFO | "    69 <- Artifacts to check count"
15:22:33 | INFO | "    69 <- Artifacts clean count"
15:22:33 | INFO | "     2 <- Artifacts & folders at TOP LEVEL clean count"


2

In [11]:
l_artifacts_all = mlf.list_run_artifacts(
    run_id=run_uuid,
    only_top_level=False,
    verbose=False,
)
len(l_artifacts_all)

15:22:39 | INFO | "    69 <- Artifacts clean count"
15:22:39 | INFO | "     2 <- Artifacts & folders at TOP LEVEL clean count"


69

In [12]:
l_artifacts_top_level

['df_posts_agg_c1', 'df_subs_agg_c1']

In [14]:
l_sub_c = [i for i in l_artifacts_all if 'df_subs_agg_c1' in i]
print(len(l_sub_c))
l_sub_c[:6]

3


['mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_subs_agg_c1/_common_metadata',
 'mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_subs_agg_c1/_metadata',
 'mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_subs_agg_c1/part.0.parquet']

In [15]:
l_post_c = [i for i in l_artifacts_all if 'df_posts_agg_c1' in i]
print(len(l_post_c))
l_post_c[:6]

66


['mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_posts_agg_c1/_common_metadata',
 'mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_posts_agg_c1/_metadata',
 'mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_posts_agg_c1/part.0.parquet',
 'mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_posts_agg_c1/part.1.parquet',
 'mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_posts_agg_c1/part.10.parquet',
 'mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_posts_agg_c1/part.11.parquet']

# Load aggregated embeddings

use `gsutil` to download embeddings for posts b/c that can take a LONG time to download sequentially. `gsutil` makes parallel downloaidng much faster and reports download speeds above 500MB / s:
```bash
ents_sub_desc/part.67.parquet...
/ [2/197 files][ 61.7 GiB/ 75.4 GiB]  81% Done 632.0 MiB/s ETA 00:00:22
```

In [20]:
%%time
# use gsutil to download post-level embeddings b/c it'll be much faster to run it in parallel

remote_key =  'mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_subs_agg_c1'

# Need to remove the last part of the local path otherwise we'll get duplicate subfolders:
#. top/2021-12-14/2021-12-14 instead of top/2021-12-14
local_f = f"/home/jupyter/subreddit_clustering_i18n/data/local_cache/{'/'.join(remote_key.split('/')[:-1])}"
Path(local_f).mkdir(parents=True, exist_ok=True)
remote_gs_path = f"gs://i18n-subreddit-clustering/{remote_key}"
print(f"Remote path:\n  {remote_gs_path}")
print(f"Local path:\n  {local_f}")

# `-n` flag means "no clober", so it should skip existing files (only copy new files)
!gsutil -m cp -r -n $remote_gs_path $local_f

Remote path:
  gs://i18n-subreddit-clustering/mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_subs_agg_c1
Local path:
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts
Skipping existing item: file:///home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_subs_agg_c1/_common_metadata
Skipping existing item: file:///home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_subs_agg_c1/_metadata
Skipping existing item: file:///home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_subs_agg_c1/part.0.parquet
CPU times: user 9.26 ms, sys: 26.4 ms, total: 35.6 ms
Wall time: 1.52 s


In [21]:
## We'll do posts in a separate notebook
# %%time

# df_agg_posts_c = mlf.read_run_artifact(
#     run_id=run_uuid,
#     artifact_folder='df_post_level_agg_c_post_comments_sub_desc',
#     read_function='pd_parquet',
#     verbose=False,
# )
# print(df_agg_posts_c.shape)

In [22]:
%%time

df_agg_sub_c = mlf.read_run_artifact(
    run_id=run_uuid,
    artifact_folder='df_subs_agg_c1',
    read_function='pd_parquet',
    verbose=False,
)
print(df_agg_sub_c.shape)

15:37:17 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/29/bfe6cbd59a21480c8c2b9923a3a9cbd6/artifacts/df_subs_agg_c1"
100%|##########################################| 3/3 [00:00<00:00, 12800.52it/s]
15:37:17 | INFO | "  Parquet files found:     1"
15:37:17 | INFO | "  Parquet files to use:     1"


(81973, 514)
CPU times: user 4.36 s, sys: 1.36 s, total: 5.72 s
Wall time: 8.98 s


In [23]:
df_agg_sub_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81973 entries, 0 to 81972
Columns: 514 entries, subreddit_id to embeddings_511
dtypes: float32(512), object(2)
memory usage: 161.4+ MB


In [24]:
df_agg_sub_c.iloc[:5, :25]

Unnamed: 0,subreddit_id,subreddit_name,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21,embeddings_22
0,t5_1009a3,memesenespanol,0.008419,-0.019185,-0.002546,0.004805,-0.015185,0.039421,0.010277,0.017569,-0.055053,-0.0046,0.010028,-0.009343,0.005398,-0.003557,-0.002658,-0.004677,-0.008021,-0.007665,-0.004388,-0.002808,-0.006471,-0.00733,0.012932
1,t5_100a1y,karstcast,-0.041405,-5e-05,-0.017356,0.031231,-0.008441,0.057712,0.033627,-0.009535,-0.072768,0.045227,0.058249,0.016315,-0.000252,0.029993,-0.025715,0.047034,-0.01999,0.002565,-0.006243,0.059901,0.026689,-0.039812,-0.008799
2,t5_100eoi,measuredpenis,-0.031335,0.027641,0.03358,0.027154,-0.004368,0.040248,0.007771,0.001586,-0.050101,0.004506,0.077162,0.082472,-0.005974,-0.021785,0.012499,-0.003118,0.028605,-0.025298,0.045636,-0.006962,0.038677,-0.010537,-0.005422
3,t5_100i9c,dragonballart,-0.00631,-0.014417,-0.020473,-0.047072,-0.028049,0.040935,-0.015178,0.011991,-0.046303,0.03955,0.017996,-0.025655,-0.00345,-0.021044,-0.022055,-0.011928,-0.004111,0.001859,0.031169,0.000369,0.002588,-0.002757,0.021869
4,t5_100mht,thelongestgameever2,-0.009634,-0.022158,0.019577,-0.000169,-0.045278,0.050387,-0.005863,-0.010414,-0.028075,-0.006754,-0.012094,-0.010896,0.049545,0.002619,-0.010922,-0.035918,0.013825,-0.05342,-0.012907,0.020754,-0.002489,-0.03381,-0.010063


In [25]:
df_agg_sub_c.iloc[-10:, :25]

Unnamed: 0,subreddit_id,subreddit_name,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21,embeddings_22
81963,t5_zyxlz,catsenjoyingpets,0.005982,0.00152,0.012971,0.009731,-0.040641,0.029886,0.017776,-0.001675,-0.061297,0.041725,0.00557,-0.002277,0.01563,0.053033,-0.004575,-0.000381,0.007203,-0.009603,0.05053,-0.012457,0.015613,-0.022345,0.024328
81964,t5_zyxsf,gladiatorsandals,0.035082,0.014228,0.025307,-0.017528,-0.015862,0.036342,0.006328,-0.02243,-0.04081,-0.023315,0.002716,0.048175,-0.033246,-0.025232,0.010923,-0.011943,0.009128,-0.024358,0.050871,0.005424,-0.008086,0.028314,0.014248
81965,t5_zyz1w,k_on_shuffle,-0.011377,0.023965,0.041733,-0.010157,0.035751,0.03872,-0.015874,-0.01362,0.00775,0.037055,-0.027191,0.020789,-0.023487,0.020691,-0.0244,-0.013452,0.004272,-0.02677,0.027883,0.0152,-0.026795,-0.006335,0.000449
81966,t5_zz27k,capcomhomearcade,-0.002962,-0.021336,0.015305,0.004804,-0.015484,-0.013002,0.009767,-0.017814,-0.047847,0.038056,0.021027,-0.042177,0.014485,0.008857,0.039708,0.017389,-7e-06,-0.005876,0.03836,-0.01516,0.009002,-0.000229,0.034124
81967,t5_zz4jm,aithesomniumfiles,-0.017261,0.007027,0.010723,0.001267,0.000868,0.021436,0.004496,0.002664,-0.023934,0.007253,0.024536,-0.002693,0.044589,0.012118,0.01648,0.003802,0.016258,-0.012357,0.013519,0.005375,-0.029291,-0.009064,0.015744
81968,t5_zz9nd,freyanightingale,0.041456,0.020634,0.010827,0.029859,-0.0337,0.048856,0.024756,-0.024959,-0.058297,0.011454,0.036036,0.063945,0.009929,-0.004287,-0.002938,-0.005173,0.003698,-0.017464,0.026774,0.019328,-0.038333,0.028035,0.029953
81969,t5_zzebd,mk3supra,-0.029058,-0.003269,0.001232,-0.010903,-0.053896,0.063239,0.009007,0.009073,-0.059915,-0.017614,0.013702,0.007835,0.04551,-0.002194,0.013315,0.017507,0.022693,0.001937,-0.015099,0.004942,-0.019533,0.008993,0.046277
81970,t5_zzgss,epididymitis,-0.037259,-0.001366,0.001502,-0.014607,-0.0636,0.052982,0.030545,-0.028854,-0.053021,-0.015713,0.054555,0.063024,0.022153,0.005796,0.044372,-0.035738,-0.024904,-0.039965,-0.027446,0.026354,-0.00957,-0.02609,0.042448
81971,t5_zzjup,morgonaut,-0.014793,-0.02627,0.038431,0.008339,0.00557,-0.024845,0.001907,0.029336,9.4e-05,0.018976,0.027022,-0.016802,-0.027257,-0.036809,0.04472,-0.026263,-0.01212,0.01375,-0.003191,-0.001602,-0.027321,-0.01354,-0.00479
81972,t5_zzszh,circumcisiongrief,-0.011436,0.027289,0.005682,0.006329,-0.01495,0.03629,0.007651,0.032805,-0.016379,0.007657,0.055554,0.057527,0.043022,0.002019,0.012934,0.009255,-0.028879,-0.016636,-0.040156,-0.00662,0.013331,-0.005055,0.022023


## Set index so it's easier to work with annoy

nvm, set it when passing the df into annoy index

In [37]:
# try:
#     df_agg_sub_c = df_agg_sub_c.drop('index', axis=1)
# except KeyError:
#     pass

# l_embedding_cols = [c for c in df_agg_sub_c.columns if c.startswith('embedding')]
# l_ix_cols = [c for c in df_agg_sub_c.columns if c not in l_embedding_cols]
# print(l_ix_cols)

# # df_agg_sub_c = df_agg_sub_c.set_index(l_ix_cols)
# # print(df_agg_sub_c.shape)

In [27]:
# df_agg_sub_c.iloc[-10:, :25]

In [28]:
# df_agg_sub_c.info()

# Load subreddits from final table [deprecated]

~There may be some that didn't make the final cut so we don't want to have stray subs in final table~

In a previous version we only kept subs that had embeddings AND clustering data. From now on, keep embeddings & distance for all subs b/c in the long run we want to keep all subs anyway.

In [193]:
BREAK

NameError: name 'TODO' is not defined

In [29]:
# run_id_final_model = ''

In [30]:
# l_artifacts_top_level = mlf.list_run_artifacts(
#     run_id=run_id_final_model,
#     only_top_level=True,
#     verbose=True,
# )
# len(l_artifacts_top_level)

In [31]:
# l_artifacts_all = mlf.list_run_artifacts(
#     run_id=run_id_final_model,
#     only_top_level=False,
#     verbose=False,
# )
# len(l_artifacts_all)

In [32]:
# l_artifacts_top_level

In [33]:
# l_sub_c = [i for i in l_artifacts_all if 'df_labels' in i]
# print(len(l_sub_c))
# l_sub_c[:6]

In [34]:
# df_labels = mlf.read_run_artifact(
#     run_id=run_id_final_model,
#     artifact_folder='df_labels',
#     read_function='pd_parquet',
#     verbose=False,
# )
# print(df_labels.shape)

In [35]:
# df_labels.iloc[:5, :15]

## Keep only subs that are in final model

In [36]:
# print(df_agg_sub_c.shape)
# df_agg_sub_c = df_agg_sub_c[df_agg_sub_c['subreddit_id'].isin(df_labels['subreddit_id'])]

# print(df_agg_sub_c.shape)

# Build annoy index

I created a custom `AnnoyIndex` class with some extra methods to create outputs & (and calculate cosine distance) for BigQuery.

In [38]:
from subclu.models.nn_annoy import AnnoyIndex

In [40]:
%%time

index_cols = ['subreddit_id', 'subreddit_name']
l_embedding_cols = [c for c in df_agg_sub_c.columns if c.startswith('embedding')]

nn_index = AnnoyIndex(
    # df_agg_sub_c[l_embedding_cols + index_cols].sample(n=11000, random_state=42),
    df_agg_sub_c[l_embedding_cols + index_cols],
    index_cols=index_cols,
    metric='angular',
    n_trees=2000,
)

CPU times: user 58.1 ms, sys: 104 ms, total: 162 ms
Wall time: 161 ms


In [41]:
%%time
nn_index.build()

CPU times: user 1h 13min 42s, sys: 51.5 s, total: 1h 14min 34s
Wall time: 1min 4s


In [42]:
%%time

n_test_i = 29500
nn_index.get_top_n_by_item(n_test_i, k=15, search_k=-1, include_distances=True)

CPU times: user 52.8 ms, sys: 7.96 ms, total: 60.7 ms
Wall time: 65.9 ms


Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance
0,t5_2yonj,butterfly,0,t5_2yonj,butterfly,0.0
1,t5_2yonj,butterfly,1,t5_2sp0f,butterflies,0.362473
2,t5_2yonj,butterfly,2,t5_2smr2,moths,0.529516
3,t5_2yonj,butterfly,3,t5_2vwmm,caterpillars,0.634845
4,t5_2yonj,butterfly,4,t5_2qygr,entomology,0.635856
5,t5_2yonj,butterfly,5,t5_3026h,monarchbutterfly,0.646075
6,t5_2yonj,butterfly,6,t5_2r87c,insects,0.654993
7,t5_2yonj,butterfly,7,t5_2s5s1,species,0.657224
8,t5_2yonj,butterfly,8,t5_1kqfex,butterflygardening,0.690289
9,t5_2yonj,butterfly,9,t5_2s3bw,whatsthisbug,0.699216


In [43]:
%%time
nn_index.get_top_n_by_item(n_test_i, k=15, search_k=2, include_distances=True)

CPU times: user 45.6 ms, sys: 21 µs, total: 45.6 ms
Wall time: 44.9 ms


Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance
0,t5_2yonj,butterfly,0,t5_2yonj,butterfly,0.0
1,t5_2yonj,butterfly,1,t5_2sp0f,butterflies,0.362473
2,t5_2yonj,butterfly,2,t5_2smr2,moths,0.529516
3,t5_2yonj,butterfly,3,t5_2qygr,entomology,0.635856
4,t5_2yonj,butterfly,4,t5_3026h,monarchbutterfly,0.646075
5,t5_2yonj,butterfly,5,t5_2r87c,insects,0.654993
6,t5_2yonj,butterfly,6,t5_2s5s1,species,0.657224
7,t5_2yonj,butterfly,7,t5_2vtf7,awwnverts,0.702606
8,t5_2yonj,butterfly,8,t5_3i3bw,insect,0.703254
9,t5_2yonj,butterfly,9,t5_31trw,dragonflies,0.707511


In [44]:
%%time
nn_index.get_top_n_by_item(n_test_i, k=20, include_distances=False)

CPU times: user 45.5 ms, sys: 3.91 ms, total: 49.4 ms
Wall time: 48.3 ms


Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b
0,t5_2yonj,butterfly,0,t5_2yonj,butterfly
1,t5_2yonj,butterfly,1,t5_2sp0f,butterflies
2,t5_2yonj,butterfly,2,t5_2smr2,moths
3,t5_2yonj,butterfly,3,t5_2vwmm,caterpillars
4,t5_2yonj,butterfly,4,t5_2qygr,entomology
5,t5_2yonj,butterfly,5,t5_3026h,monarchbutterfly
6,t5_2yonj,butterfly,6,t5_2r87c,insects
7,t5_2yonj,butterfly,7,t5_2s5s1,species
8,t5_2yonj,butterfly,8,t5_1kqfex,butterflygardening
9,t5_2yonj,butterfly,9,t5_2s3bw,whatsthisbug


## Get df with all items

In [45]:
df_nn_top = nn_index.get_top_n_by_item_all(
    k=100,
    search_k=-1, 
    include_distances=True,
    append_i=True,
)

100%|██████████| 81973/81973 [1:17:02<00:00, 17.73it/s]
17:07:23 | INFO | "(8115327, 7) <- df_top_items shape"


In [46]:
df_nn_top.head()

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
1,t5_1009a3,memesenespanol,1,t5_3qq2qy,beelcitosmemes,0.617543,0.80932
2,t5_1009a3,memesenespanol,2,t5_10wycq,memesesp,0.62925,0.802022
3,t5_1009a3,memesenespanol,3,t5_3wam26,latesitoo,0.668825,0.776337
4,t5_1009a3,memesenespanol,4,t5_6hud5b,illoganga,0.679677,0.76902
5,t5_1009a3,memesenespanol,5,t5_69coi0,anzutops777oficial,0.692039,0.760541


In [47]:
df_nn_top.tail()

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
8197295,t5_zzszh,circumcisiongrief,95,t5_3bj8n,restoringdick,0.910865,0.585162
8197296,t5_zzszh,circumcisiongrief,96,t5_2s5k4,sexed,0.911389,0.584685
8197297,t5_zzszh,circumcisiongrief,97,t5_3d635,sissyology,0.912031,0.5841
8197298,t5_zzszh,circumcisiongrief,98,t5_2sekm,trollxchromosomes,0.912716,0.583474
8197299,t5_zzszh,circumcisiongrief,99,t5_adf18,nothowgirlswork,0.912891,0.583315


In [48]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'ich_iel']
    .head(15)
)

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
3662701,t5_37k29,ich_iel,1,t5_2qmr6,aeiou,0.547037,0.850375
3662702,t5_37k29,ich_iel,2,t5_39bxv,ik_ihe,0.588198,0.827011
3662703,t5_37k29,ich_iel,3,t5_39uv3,kopiernudeln,0.590737,0.825515
3662704,t5_37k29,ich_iel,4,t5_ofkj1,okbrudimongo,0.606613,0.81601
3662705,t5_37k29,ich_iel,5,t5_w2zxy,okoidawappler,0.612174,0.812622
3662706,t5_37k29,ich_iel,6,t5_17d5ey,ichbin40undlustig,0.617501,0.809346
3662707,t5_37k29,ich_iel,7,t5_5tuqhy,scheissepfostieren,0.620746,0.807337
3662708,t5_37k29,ich_iel,8,t5_318w4,cirkeltrek,0.633709,0.799207
3662709,t5_37k29,ich_iel,9,t5_3b2y1,einfach_posten,0.640746,0.794722
3662710,t5_37k29,ich_iel,10,t5_3hn0l,deutschememes,0.653955,0.786171


In [60]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'vegetarischde']
    .head(15)
)

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
5210101,t5_4c06em,vegetarischde,1,t5_37ruc,vegande,0.392512,0.922967
5210102,t5_4c06em,vegetarischde,2,t5_25v3wn,kreisvegs,0.472536,0.888355
5210103,t5_4c06em,vegetarischde,3,t5_2uquu,askvegans,0.553144,0.847016
5210104,t5_4c06em,vegetarischde,4,t5_2qhpm,vegan,0.559594,0.843427
5210105,t5_4c06em,vegetarischde,5,t5_2ven0,antivegan,0.568863,0.838197
5210106,t5_4c06em,vegetarischde,6,t5_2qhzr,vegetarianism,0.581378,0.831
5210107,t5_4c06em,vegetarischde,7,t5_3jwb3,veganita,0.588474,0.826849
5210108,t5_4c06em,vegetarischde,8,t5_109235,exvegans,0.588729,0.826699
5210109,t5_4c06em,vegetarischde,9,t5_675dds,vegancirclejerkchat,0.589959,0.825974
5210110,t5_4c06em,vegetarischde,10,t5_2sgfh,vegancirclejerk,0.600225,0.819865


In [61]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'antivegan']
    .head(15)
)

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
2470701,t5_2ven0,antivegan,1,t5_2sgfh,vegancirclejerk,0.306356,0.953073
2470702,t5_2ven0,antivegan,2,t5_2qhpm,vegan,0.328896,0.945914
2470703,t5_2ven0,antivegan,3,t5_675dds,vegancirclejerkchat,0.350246,0.938664
2470704,t5_2ven0,antivegan,4,t5_kycqf,veganforcirclejerkers,0.356404,0.936488
2470705,t5_2ven0,antivegan,5,t5_30wk6,veganmemes,0.358652,0.935684
2470706,t5_2ven0,antivegan,6,t5_109235,exvegans,0.363321,0.933999
2470707,t5_2ven0,antivegan,7,t5_2uquu,askvegans,0.386959,0.925132
2470708,t5_2ven0,antivegan,8,t5_2qhzr,vegetarianism,0.401766,0.919292
2470709,t5_2ven0,antivegan,9,t5_2sa7z,debateavegan,0.435358,0.905232
2470710,t5_2ven0,antivegan,10,t5_37ruc,vegande,0.469215,0.889918


In [56]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'mexico']
    .head(15)
)

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
870501,t5_2qhv7,mexico,1,t5_2qm06,monterrey,0.511194,0.86934
870502,t5_2qhv7,mexico,2,t5_2tocwj,tfwyouliveinmexico,0.54976,0.848882
870503,t5_2qhv7,mexico,3,t5_3la4d,mujico,0.558648,0.843956
870504,t5_2qhv7,mexico,4,t5_2sbh1,mexicali,0.577908,0.833011
870505,t5_2qhv7,mexico,5,t5_2up3k,ticos,0.586034,0.828282
870506,t5_2qhv7,mexico,6,t5_4ywzju,askmexico,0.588372,0.826909
870507,t5_2qhv7,mexico,7,t5_2ujoy,memexico,0.599531,0.820281
870508,t5_2qhv7,mexico,8,t5_2lxxle,mexicow,0.601053,0.819368
870509,t5_2qhv7,mexico,9,t5_2r8eh,mejico,0.620872,0.807259
870510,t5_2qhv7,mexico,10,t5_2s5noh,mexico4t,0.627347,0.803218


In [57]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'de']
    .head(15)
)

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
159901,t5_22i0,de,1,t5_4egnbw,dezwo,0.450197,0.898661
159902,t5_22i0,de,2,t5_3caax,600euro,0.502833,0.87358
159903,t5_22i0,de,3,t5_irnzx,dachschaden,0.532695,0.858118
159904,t5_22i0,de,4,t5_392ha,asozialesnetzwerk,0.540717,0.853813
159905,t5_22i0,de,5,t5_3jxvk,tja,0.560871,0.842712
159906,t5_22i0,de,6,t5_2qo9i,austria,0.564859,0.840467
159907,t5_22i0,de,7,t5_2qhjz,france,0.649404,0.789137
159908,t5_22i0,de,8,t5_3b2y1,einfach_posten,0.65678,0.78432
159909,t5_22i0,de,9,t5_nls07,belgium2,0.657435,0.78389
159910,t5_22i0,de,10,t5_jsyzh,poldersocialisme,0.658357,0.783283


# Convert plain distance to cosine similarity
from [github](https://github.com/spotify/annoy/issues/112#issuecomment-686513356)
```
cosine_similarity = 1 - cosine_distance^2/2
```

In [50]:
%%time

col_cosine_similarity = 'cosine_similarity'

if col_cosine_similarity not in df_nn_top:
    df_nn_top[col_cosine_similarity] = (
        1 -
        (df_nn_top['distance'] ** 2) / 2
    )

CPU times: user 12 µs, sys: 0 ns, total: 12 µs
Wall time: 16.7 µs


In [51]:
df_nn_top.head()

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
1,t5_1009a3,memesenespanol,1,t5_3qq2qy,beelcitosmemes,0.617543,0.80932
2,t5_1009a3,memesenespanol,2,t5_10wycq,memesesp,0.62925,0.802022
3,t5_1009a3,memesenespanol,3,t5_3wam26,latesitoo,0.668825,0.776337
4,t5_1009a3,memesenespanol,4,t5_6hud5b,illoganga,0.679677,0.76902
5,t5_1009a3,memesenespanol,5,t5_69coi0,anzutops777oficial,0.692039,0.760541


In [53]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'ich_iel']
    .head(15)
)

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
3662701,t5_37k29,ich_iel,1,t5_2qmr6,aeiou,0.547037,0.850375
3662702,t5_37k29,ich_iel,2,t5_39bxv,ik_ihe,0.588198,0.827011
3662703,t5_37k29,ich_iel,3,t5_39uv3,kopiernudeln,0.590737,0.825515
3662704,t5_37k29,ich_iel,4,t5_ofkj1,okbrudimongo,0.606613,0.81601
3662705,t5_37k29,ich_iel,5,t5_w2zxy,okoidawappler,0.612174,0.812622
3662706,t5_37k29,ich_iel,6,t5_17d5ey,ichbin40undlustig,0.617501,0.809346
3662707,t5_37k29,ich_iel,7,t5_5tuqhy,scheissepfostieren,0.620746,0.807337
3662708,t5_37k29,ich_iel,8,t5_318w4,cirkeltrek,0.633709,0.799207
3662709,t5_37k29,ich_iel,9,t5_3b2y1,einfach_posten,0.640746,0.794722
3662710,t5_37k29,ich_iel,10,t5_3hn0l,deutschememes,0.653955,0.786171


In [59]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'finanzen']
    .head(15)
)

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
3535801,t5_35m5e,finanzen,1,t5_5txdoj,finanzenat,0.46221,0.893181
3535802,t5_35m5e,finanzen,2,t5_37aoh,vosfinances,0.489612,0.88014
3535803,t5_35m5e,finanzen,3,t5_11cinh,befire,0.49199,0.878973
3535804,t5_35m5e,finanzen,4,t5_2tasy,personalfinancecanada,0.507932,0.871003
3535805,t5_35m5e,finanzen,5,t5_3isqn,italiapersonalfinance,0.509007,0.870456
3535806,t5_35m5e,finanzen,6,t5_38zrx,personalfinancenz,0.520467,0.864557
3535807,t5_35m5e,finanzen,7,t5_2w5jv,eupersonalfinance,0.524937,0.862221
3535808,t5_35m5e,finanzen,8,t5_3ljid,europefire,0.525525,0.861912
3535809,t5_35m5e,finanzen,9,t5_2clhc5,literaciafinanceira,0.533042,0.857933
3535810,t5_35m5e,finanzen,10,t5_2uo3q,ausfinance,0.535259,0.856749


# Add dt/pt column & metadata columns
- ann dt

In [55]:
TODO

NameError: name 'TODO' is not defined

In [80]:
df_nn_top['ann_dt'] = pd.to_datetime(pd.to_datetime(datetime.utcnow()).date())

In [81]:
df_nn_top['model_version'] = '0.5.0'

In [82]:
df_nn_top.head()

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity,ann_dt,model_version
1,t5_1009a3,memesenespanol,1,t5_3qq2qy,beelcitosmemes,0.617543,0.80932,2022-08-01,0.5.0
2,t5_1009a3,memesenespanol,2,t5_10wycq,memesesp,0.62925,0.802022,2022-08-01,0.5.0
3,t5_1009a3,memesenespanol,3,t5_3wam26,latesitoo,0.668825,0.776337,2022-08-01,0.5.0
4,t5_1009a3,memesenespanol,4,t5_6hud5b,illoganga,0.679677,0.76902,2022-08-01,0.5.0
5,t5_1009a3,memesenespanol,5,t5_69coi0,anzutops777oficial,0.692039,0.760541,2022-08-01,0.5.0


In [83]:
df_nn_top.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8115327 entries, 1 to 8197299
Data columns (total 9 columns):
 #   Column             Dtype         
---  ------             -----         
 0   subreddit_id_a     object        
 1   subreddit_name_a   object        
 2   distance_rank      int64         
 3   subreddit_id_b     object        
 4   subreddit_name_b   object        
 5   distance           float64       
 6   cosine_similarity  float64       
 7   ann_dt             datetime64[ns]
 8   model_version      object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 619.2+ MB


# Save df to GCS

From here we should be able to share & create a BigQuery table

In [66]:
import pyarrow as pa

In [86]:
gs_nearest_neighbors_root = 'gs://i18n-subreddit-clustering/data/models/nearest_neighbors'
gs_this_model = f"{gs_nearest_neighbors_root}/manual_model_{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}"
gs_this_model

'gs://i18n-subreddit-clustering/data/models/nearest_neighbors/manual_model_2022-08-01_181632'

In [87]:
%%time
shape_ = df_nn_top.shape

df_nn_top.to_parquet(
    f"{gs_this_model}/df_nearest_neighbors_top_full-{shape_[0]}_by_{shape_[1]}.parquet"
)

CPU times: user 5.09 s, sys: 457 ms, total: 5.54 s
Wall time: 10.1 s


## Create schema for BigQuery parquet file

We'll save 1 file with all the raw data and a different file with the data needed to create a BQ table.

We'll also create a schema for the BQ file so that BQ can display some metadata about the tables

In [137]:
from google.cloud import bigquery


l_cols_for_bq = [
    'subreddit_id_a',
    'subreddit_name_a',
    'distance_rank',
    'subreddit_id_b',
    'subreddit_name_b',
    'cosine_similarity',
    'ann_dt',
    'model_version',
]

l_bq_json_schema = [
    bigquery.SchemaField(
        mode="REQUIRED",
        name="subreddit_id_a",
        field_type="STRING",
        description="Seed subreddit ID. Seed = subreddit for which you want to find N most similar subs"
    ),
    bigquery.SchemaField(
        mode="REQUIRED",
        name="subreddit_name_a",
        field_type="STRING",
        description="Seed subreddit name"
    ),
    bigquery.SchemaField(
        mode="NULLABLE",
        name="distance_rank",
        field_type="INTEGER",
        description="Rank for how closse subreddit_A is subreddit_B. 1=closest, 2=2nd closest"
    ),
    bigquery.SchemaField(
        mode="REQUIRED",
        name="subreddit_id_b",
        field_type="STRING",
        description="Subreddit ID for most similar subs to subreddit_a"
    ),
    bigquery.SchemaField(
        mode="REQUIRED",
        name="subreddit_name_b",
        field_type="STRING",
        description="Subreddit B name"
    ),
    bigquery.SchemaField(
        mode="REQUIRED",
        name="cosine_similarity",
        field_type="FLOAT",
        description="Similarity between sub-a & sub-b. 1=identical, 0=not related, -1=opposite"
    ),
    bigquery.SchemaField(
        mode="NULLABLE",
        name="ann_dt",
        field_type="TIMESTAMP",
        description="Date that approximate nearest neighbors file was created"
    ),
    bigquery.SchemaField(
        mode="NULLABLE",
        name="model_version",
        field_type="STRING",
        description="Version for CAU model that created the embeddings to compare subreddit distance"
    )
]
d_metadata_description = dict()
for sf_ in l_bq_json_schema:
    d_metadata_description[sf_.name] = sf_.description

pa_bq_schema = pa.schema(
    [
        pa.field(
            'subreddit_id_a', pa.string(),
            nullable=False,
            # This metadata doesn't get ready by BigQuery :/
            # metadata={'description': "Seed subreddit ID. Seed = subreddit for which you want to find N most similar subs"},
        ),
        pa.field(
            'subreddit_name_a', pa.string(),
        ),
        ('distance_rank', pa.int32()),
        ('subreddit_id_b', pa.string()),
        ('subreddit_name_b', pa.string()),
        ('cosine_similarity', pa.float64()),
        ('ann_dt', pa.timestamp('ns')),
        ('model_version', pa.string()),
    ],
    metadata=d_metadata_description,
)
pa_bq_schema

subreddit_id_a: string not null
subreddit_name_a: string
distance_rank: int32
subreddit_id_b: string
subreddit_name_b: string
cosine_similarity: double
ann_dt: timestamp[ns]
model_version: string
-- schema metadata --
subreddit_id_a: 'Seed subreddit ID. Seed = subreddit for which you want t' + 26
subreddit_name_a: 'Seed subreddit name'
distance_rank: 'Rank for how closse subreddit_A is subreddit_B. 1=closest' + 15
subreddit_id_b: 'Subreddit ID for most similar subs to subreddit_a'
subreddit_name_b: 'Subreddit B name'
cosine_similarity: 'Similarity between sub-a & sub-b. 1=identical, 0=not ' + 20
ann_dt: 'Date that approximate nearest neighbors file was created'
model_version: 'Version for CAU model that created the embeddings to comp' + 22

In [133]:
# pa_bq_schema.metadata

In [138]:
%%time
shape_ = df_nn_top[l_cols_for_bq].shape
path_bq = f"{gs_this_model}/df_nearest_neighbors_top_bigquery-{shape_[0]}_by_{shape_[1]}.parquet"
print(path_bq)

df_nn_top[l_cols_for_bq].to_parquet(
    path_bq,
    engine='pyarrow',
    schema=pa_bq_schema,
)

gs://i18n-subreddit-clustering/data/models/nearest_neighbors/manual_model_2022-08-01_181632/df_nearest_neighbors_top_bigquery-8115327_by_8.parquet
CPU times: user 5.22 s, sys: 413 ms, total: 5.63 s
Wall time: 8.29 s


# Upload to BQ

using `bq load` won't work with a JSON schema in BQ.

Instead, let's try using the python client. NOTE: we'll need to get the right authentication in the VM that has the correct read & write access, e.g.,:
```bash
# login
gcloud auth application-default login

# logout
gcloud auth application-default revoke
```

In [142]:
## This works, but won't read the "description" or other nice metadata from parquet files

# !bq load \
#     --source_format=PARQUET \
#     --project_id=reddit-employee-datasets \
#     david_bermejo.subclu_v0050_subreddit_distances_c_top_100 \
#     "gs://i18n-subreddit-clustering/data/models/nearest_neighbors/manual_model_2022-08-01_181632/df_nearest_neighbors_top_bigquery-8115327_by_8.parquet"

In [141]:
%%time

client = bigquery.Client()

# Set table_id to the ID of the table to create.
table_id = "reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_distances_c_top_100"

# WRITE_TRUNCATE -> replace
# WRITE_APPEND -> append data
job_config = bigquery.LoadJobConfig(
    schema=l_bq_json_schema,
    source_format=bigquery.SourceFormat.PARQUET,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
uri = "gs://i18n-subreddit-clustering/data/models/nearest_neighbors/manual_model_2022-08-01_181632/df_nearest_neighbors_top_bigquery-8115327_by_8.parquet"

load_job = client.load_table_from_uri(
    uri,
    table_id,
    location="US",  # Must match the destination dataset location.
    job_config=job_config,
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

destination_table = client.get_table(table_id)
print("Loaded {} rows.".format(destination_table.num_rows))


Loaded 8115327 rows.
CPU times: user 46.7 ms, sys: 124 ms, total: 171 ms
Wall time: 1min 8s
