# Purpose


### 2021-03-28
Calculating precise nearest neighbors has become too expensive as we go over 40k subreddits. So instead let's calculate approx nearest neighbors. In this notebook we'll test [ANNOY](https://github.com/spotify/annoy).  Main reason for using annoy over FAISS is that annoy has official wheels in pypi, but FAISS only officially supports installation from conda. We don't want to depend on third-party wheels for FAISS.



# Notebook setup

In [21]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
from datetime import datetime
import gc
import os
import logging
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import dask
from dask import dataframe as dd
from tqdm import tqdm

import mlflow
import hydra
import annoy

import subclu
from subclu.models.aggregate_embeddings import (
    AggregateEmbeddings, AggregateEmbeddingsConfig,
    load_config_agg_jupyter, get_dask_df_shape,
)
from subclu.models import aggregate_embeddings_pd

from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.utils.mlflow_logger import MlflowLogger, save_pd_df_to_parquet_in_chunks
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)


print_lib_versions([annoy, dask, hydra, mlflow, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
annoy		v: 1.17.0
dask		v: 2021.06.0
hydra		v: 1.1.0
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.4.1


In [23]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Set sqlite database as MLflow URI

In [24]:
# use new class to initialize mlflow
mlf = MlflowLogger(tracking_uri='sqlite')
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db'

## Get list of experiments with new function

In [25]:
mlf.list_experiment_meta(output_format='pandas').tail(9)

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
19,19,v0.4.1_mUSE_inference_test,gs://i18n-subreddit-clustering/mlflow/mlruns/19,active
20,20,v0.4.1_mUSE_inference,gs://i18n-subreddit-clustering/mlflow/mlruns/20,active
21,21,v0.4.1_mUSE_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/21,active
22,22,v0.4.1_mUSE_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/22,active
23,23,v0.4.1_mUSE_clustering_test,gs://i18n-subreddit-clustering/mlflow/mlruns/23,active
24,24,v0.4.1_mUSE_clustering,gs://i18n-subreddit-clustering/mlflow/mlruns/24,active
25,25,v0.4.1_mUSE_clustering_new_metrics,gs://i18n-subreddit-clustering/mlflow/mlruns/25,active
26,26,v0.4.1_nearest_neighbors_test,gs://i18n-subreddit-clustering/mlflow/mlruns/26,active
27,27,v0.4.1_nearest_neighbors,gs://i18n-subreddit-clustering/mlflow/mlruns/27,active


## Get runs from embeddings aggregation jobs

Want to make sure we can load these artifacts for other jobs

In [26]:
%%time

df_mlf_runs =  mlf.search_all_runs(experiment_ids=[ 22])
df_mlf_runs.shape

CPU times: user 48.9 ms, sys: 0 ns, total: 48.9 ms
Wall time: 48.1 ms


(1, 64)

In [27]:
df_mlf_runs.head()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.posts_raw_rows,metrics.df_sub_level_agg_c_post_comments_and_sub_desc-rows,metrics.df_sub_level_agg_c_post_comments_and_sub_desc-cols,metrics.df_post_level_agg_c_post_comments_sub_desc-cols,metrics.posts_raw_cols,metrics.memory_used_percent,metrics.memory_total,metrics.sub_description_raw_rows,metrics.sub_description_raw_cols,metrics.cpu_count,metrics.df_post_level_agg_c_post_comments_sub_desc-rows,metrics.memory_free,metrics.memory_used,metrics.df_sub_level_agg_a_post_only-cols,metrics.comments_raw_cols,metrics.df_sub_level_agg_a_post_only-rows,metrics.comments_raw_rows,params.mlflow_tracking_uri,params.f_log_file,params.host_name,params.posts_uuid,params.run_name,params.unique_checks,params.comments_uuid,...,params.cpu_count,params.n_sample_posts_files,params.folder_posts_text_and_meta,params.mlflow_experiment,params.col_comment_text_len,params.calculate_b_agg_posts_and_comments,params.logs_path,params.posts_folder,params.min_comment_text_len,params.col_post_id,params.n_sample_comments_files,params.memory_total,params.agg_post_comment_weight,params.bucket_name,params.calculate_similarites,params.col_subreddit_id,params.agg_post_post_weight,params.col_comment_id,params.agg_post_subreddit_desc_weight,params.agg_post_to_subreddit_weight_col,params.folder_subreddits_text_and_meta,params.agg_comments_to_post_weight_col,params.subreddit_desc_folder,params.subreddit_desc_uuid,tags.mlflow.runName,tags.mlflow.source.git.commit,tags.mlflow.user,tags.host_name,tags.mlflow.source.name,tags.mlflow.source.type
0,4144c443722e47ef9cd2f42a5e813f3b,22,RUNNING,gs://i18n-subreddit-clustering/mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts,2021-12-23 11:20:55.413000+00:00,,15629958.0,49625.0,514.0,515.0,515.0,0.216662,1937274.0,49705.0,514.0,80.0,15629958.0,1690826.0,419733.0,514.0,516.0,49625.0,54407324.0,sqlite,logs/AggregateEmbeddings/2021-12-23_11-20-55_agg_full_lc_false_pd-2021-12-23_112054.log,djb-100-2021-04-28-djb-eda-german-subs,559a8f13264245b3923ab5699ef55bfe,agg_full_lc_false_pd-2021-12-23_112054,False,"['26c8fcf422a9403ba4a844c8e380bf7f', '54ba724869bf4ec9a2cad2a4f7eca048', 'e7ed11ccdc0b45abbdf3bf19605d4498', 'a69d1b259875458283124ffdaa6efbb6']",...,80,,posts/top/2021-12-14_fix,v0.4.1_mUSE_aggregates,comment_text_len,False,logs/AggregateEmbeddings,df_vect_posts,4,post_id,,1937274,20,i18n-subreddit-clustering,True,subreddit_id,70,comment_id,10,,subreddits/top/2021-12-14,,df_vect_subreddits_description,559a8f13264245b3923ab5699ef55bfe,agg_full_lc_false_pd-2021-12-23_112054,2e3ffe692433be950ce3fdfc0f164a9e3df6b0e4,jupyter,djb-100-2021-04-28-djb-eda-german-subs,/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py,LOCAL


In [28]:
run_uuid = '4144c443722e47ef9cd2f42a5e813f3b'

In [29]:
# mask_finished = df_mlf_runs['status'] == 'FINISHED'
# mask_output_over_1M_rows = (
#     (df_mlf_runs['metrics.df_vect_posts_rows'] >= 1e5) |
#     (df_mlf_runs['metrics.df_vect_comments'] >= 1e5)
# )
# # df_mlf_runs[mask_finished].shape

# df_mlf_use_for_agg = df_mlf_runs[mask_output_over_1M_rows]
# df_mlf_use_for_agg.shape

In [30]:
# cols_with_multiple_vals = df_mlf_use_for_agg.columns[df_mlf_use_for_agg.nunique(dropna=False) > 1]
# # len(cols_with_multiple_vals)

# style_df_numeric(
#     df_mlf_use_for_agg
#     [cols_with_multiple_vals]
#     .drop(['artifact_uri', 'end_time',
#            # 'start_time',
#            ], 
#           axis=1)
#     .dropna(axis='columns', how='all')
#     .iloc[:, :30]
#     ,
#     rename_cols_for_display=True,
# )

# Check run artifacts

In [31]:
l_artifacts_top_level = mlf.list_run_artifacts(
    run_id=run_uuid,
    only_top_level=True,
    verbose=True,
)
len(l_artifacts_top_level)

15:43:07 | INFO | "   212 <- Artifacts to check count"
15:43:07 | INFO | "   212 <- Artifacts clean count"
15:43:07 | INFO | "     5 <- Artifacts & folders at TOP LEVEL clean count"


5

In [32]:
l_artifacts_all = mlf.list_run_artifacts(
    run_id=run_uuid,
    only_top_level=False,
    verbose=False,
)
len(l_artifacts_all)

15:43:11 | INFO | "   212 <- Artifacts clean count"
15:43:11 | INFO | "     5 <- Artifacts & folders at TOP LEVEL clean count"


212

In [33]:
l_artifacts_top_level

['config',
 'd_logged_dfs_subfolders',
 'df_post_level_agg_c_post_comments_sub_desc',
 'df_sub_level_agg_a_post_only',
 'df_sub_level_agg_c_post_comments_and_sub_desc']

In [34]:
l_sub_c = [i for i in l_artifacts_all if 'df_sub_level_agg_c_post_comments_and_sub_desc' in i]
print(len(l_sub_c))
l_sub_c[:6]

5


['mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_sub_level_agg_c_post_comments_and_sub_desc/_common_metadata',
 'mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_sub_level_agg_c_post_comments_and_sub_desc/_metadata',
 'mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_sub_level_agg_c_post_comments_and_sub_desc/part.0.parquet',
 'mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_sub_level_agg_c_post_comments_and_sub_desc/part.1.parquet',
 'mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_sub_level_agg_c_post_comments_and_sub_desc/part.2.parquet']

In [35]:
l_post_c = [i for i in l_artifacts_all if 'df_post_level_agg_c_post_comments_sub_desc' in i]
print(len(l_post_c))
l_post_c[:6]

197


['mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_post_level_agg_c_post_comments_sub_desc/_common_metadata',
 'mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_post_level_agg_c_post_comments_sub_desc/_metadata',
 'mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_post_level_agg_c_post_comments_sub_desc/part.0.parquet',
 'mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_post_level_agg_c_post_comments_sub_desc/part.1.parquet',
 'mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_post_level_agg_c_post_comments_sub_desc/part.10.parquet',
 'mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_post_level_agg_c_post_comments_sub_desc/part.100.parquet']

# Load aggregated embeddings

use `gsutil` to download embeddings for posts b/c that can take a LONG time to download sequentially. `gsutil` makes parallel downloaidng much faster and reports download speeds above 500MB / s:
```bash
ents_sub_desc/part.67.parquet...
/ [2/197 files][ 61.7 GiB/ 75.4 GiB]  81% Done 632.0 MiB/s ETA 00:00:22
```

In [49]:
%%time
# use gsutil to download post-level embeddings b/c it'll be much faster to run it in parallel

remote_key =  'mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_post_level_agg_c_post_comments_sub_desc'
print(remote_key)
# Need to remove the last part of the local path otherwise we'll get duplicate subfolders:
#. top/2021-12-14/2021-12-14 instead of top/2021-12-14
local_f = f"/home/jupyter/subreddit_clustering_i18n/data/local_cache/{'/'.join(remote_key.split('/')[:-1])}"
Path(local_f).mkdir(parents=True, exist_ok=True)
remote_gs_path = f"gs://i18n-subreddit-clustering/{remote_key}"

# `-n` flag means "no clober", so it should skip existing files (only copy new files)
# !gsutil -m cp -r -n $remote_gs_path $local_f

mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_post_level_agg_c_post_comments_sub_desc
CPU times: user 74 µs, sys: 135 µs, total: 209 µs
Wall time: 195 µs


In [44]:
# %%time

# df_agg_posts_c = mlf.read_run_artifact(
#     run_id=run_uuid,
#     artifact_folder='df_post_level_agg_c_post_comments_sub_desc',
#     read_function='pd_parquet',
#     verbose=False,
# )
# print(df_agg_posts_c.shape)

15:48:29 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_post_level_agg_c_post_comments_sub_desc"
100%|######################################| 197/197 [00:00<00:00, 55514.50it/s]
15:48:30 | INFO | "  Parquet files found:   195"
15:48:30 | INFO | "  Parquet files to use:   195"


(15629958, 516)


In [42]:
# %%time

# df_agg_sub_a = mlf.read_run_artifact(
#     run_id=run_uuid,
#     artifact_folder='df_sub_level_agg_a_post_only',
#     read_function='pd_parquet',
#     verbose=False,
# )
# print(df_agg_sub_a.shape)

In [64]:
%%time

df_agg_sub_c = mlf.read_run_artifact(
    run_id=run_uuid,
    artifact_folder='df_sub_level_agg_c_post_comments_and_sub_desc',
    read_function='pd_parquet',
    verbose=False,
)
print(df_agg_sub_c.shape)

16:43:40 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/22/4144c443722e47ef9cd2f42a5e813f3b/artifacts/df_sub_level_agg_c_post_comments_and_sub_desc"
100%|##########################################| 5/5 [00:00<00:00, 15697.25it/s]
16:43:40 | INFO | "  Parquet files found:     3"
16:43:40 | INFO | "  Parquet files to use:     3"


(49625, 515)
CPU times: user 5.56 s, sys: 1.11 s, total: 6.67 s
Wall time: 5.51 s


In [65]:
df_agg_sub_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49625 entries, 0 to 49624
Columns: 515 entries, index to embeddings_511
dtypes: float64(512), int64(1), object(2)
memory usage: 195.0+ MB


In [66]:
df_agg_sub_c.iloc[:5, :25]

Unnamed: 0,index,subreddit_name,subreddit_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21
0,0,0hthaatsjaay,t5_46wt4h,-0.012697,0.002315,0.011109,-0.005984,-0.036264,0.008153,-0.001582,-0.017434,-0.034032,0.01881,0.021127,-0.022091,-0.01196,0.016237,0.010066,0.030702,0.011328,0.027796,0.011354,0.00651,-0.018133,-0.030889
1,1,0nlyfantastic0,t5_4byrct,-0.018998,-0.01721,0.00747,-0.003859,0.011523,0.042205,-0.018871,0.036121,-0.034871,0.012692,-0.00571,0.012133,-0.031327,0.025415,0.019227,0.007873,0.002596,-0.02878,0.054517,-0.012182,-0.013013,-0.045152
2,2,0nlyleaks,t5_36f9u6,-0.014206,-0.064709,-0.000853,-0.018357,0.039892,0.046341,0.024391,0.00453,-0.079281,0.01393,0.033977,-0.031651,-0.010252,-0.072903,0.031626,0.011762,0.027802,0.021372,0.02201,-0.066856,0.036344,-0.007425
3,3,0sanitymemes,t5_2qlzfy,-0.016652,-0.002637,0.002151,-0.00038,-0.004485,0.038169,-0.007977,-0.005778,-0.039986,0.004282,0.018017,0.006978,0.01049,-0.003704,0.00171,0.003726,0.022393,0.006149,0.002622,0.007001,-0.005759,-0.018756
4,4,0xpolygon,t5_2qgijx,-0.033193,0.038902,-0.033466,-0.051036,-0.000341,0.012167,0.009529,0.008731,-0.007742,0.013853,0.015495,0.009962,0.027135,0.006659,-0.013294,0.023316,0.021664,0.001649,0.001098,-0.00234,0.003297,-0.016152


In [67]:
df_agg_sub_c.iloc[-10:, :25]

Unnamed: 0,index,subreddit_name,subreddit_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21
49615,49615,zweitmeinung,t5_4kv9wc,-0.040518,-0.011396,0.016183,-0.028422,-0.066712,0.028768,0.004953,0.008945,-0.053609,-3.2e-05,0.001619,-0.01245,0.040547,0.004345,-0.002199,-0.034676,-0.012779,0.004648,0.000397,0.036897,-0.015819,-0.017496
49616,49616,zwift,t5_33t8j,-0.027395,-0.01527,-0.006319,0.015369,-0.038437,0.020171,0.009737,-0.007263,-0.038737,-0.005224,-0.004557,-0.015082,0.044456,-0.020974,0.019693,0.022184,0.010257,-0.010002,0.008148,-0.007386,0.002608,-0.006747
49617,49617,zwiftracing,t5_3y6zfn,0.042906,-0.018738,-0.014533,0.003808,-0.000362,0.047995,0.014497,0.017367,-0.034752,0.044055,-0.00636,-0.02363,-0.05347,-0.005947,0.043232,-0.004975,-0.013311,0.028611,0.027053,-0.042135,0.020192,-0.035474
49618,49618,zwormz,t5_558h6l,0.001943,5e-05,-0.008841,0.013108,0.009782,0.013488,-0.004737,0.001171,-0.028631,0.011872,0.006817,0.001142,-0.025191,-0.030035,0.003945,0.01221,-0.002794,-0.027809,0.004584,0.033497,-0.01179,0.002807
49619,49619,zxspectrum,t5_2ttxf,-0.012092,-0.025742,-0.00875,-0.024969,-0.018628,-0.006174,0.004686,2.1e-05,-0.027414,0.008352,0.017394,-0.017908,0.014743,0.003561,-0.000998,-0.01082,0.023164,-0.011636,0.012579,0.005268,-0.001191,-0.020699
49620,49620,zyn,t5_38nx2,-0.02206,0.010046,-0.03431,-0.022542,-0.017596,0.041781,0.011894,-0.038841,-0.051343,0.007992,0.021627,0.003101,0.044853,0.018825,0.002892,0.011217,0.001206,-0.023233,0.013473,0.018096,0.010907,-0.000968
49621,49621,zyramains,t5_35la7,-0.002744,-0.016453,0.005965,-0.016609,-0.007815,0.065392,0.024101,-0.003512,-0.027948,0.010691,0.016709,-0.00466,0.029199,-0.004788,0.012306,0.001346,-0.011995,-0.020919,-0.000191,-0.002442,-0.018061,-0.014261
49622,49622,zyzz,t5_2sosg,-0.011185,0.014175,0.004697,-0.001609,-0.025836,0.031206,0.017154,-0.009301,-0.047893,0.001483,0.010634,0.014197,0.024161,-0.011832,0.016491,-0.001671,0.006479,-0.015113,0.019097,0.009557,-0.008552,-0.011907
49623,49623,zzzzz,t5_2wxmu,0.109239,0.028351,-0.039005,-0.012098,-0.044105,-0.030205,-0.00099,-0.02961,-0.040511,0.022615,-0.070672,-0.024646,-0.010884,0.013012,0.025176,-0.066089,-0.014818,0.033189,0.01538,0.000433,0.007034,0.098833
49624,49624,zzzzzzzzzzzzzzzzzzzz,t5_3an04,0.098792,0.029517,-0.038618,-0.002381,-0.054415,-0.0217,0.005971,-0.014944,-0.045893,0.020496,-0.060823,-0.017631,-0.0102,0.0029,0.024926,-0.050978,-0.009353,0.038723,0.017978,-7e-05,0.005893,0.103079


## Set index so it's easier to work with annoy

nvm, set it when passing the df into annoy index

In [71]:
try:
    df_agg_sub_c = df_agg_sub_c.drop('index', axis=1)
except KeyError:
    pass

l_embedding_cols = [c for c in df_agg_sub_c.columns if c.startswith('embedding')]
l_ix_cols = [c for c in df_agg_sub_c.columns if c not in l_embedding_cols]
print(l_ix_cols)

# df_agg_sub_c = df_agg_sub_c.set_index(l_ix_cols)
# print(df_agg_sub_c.shape)

['subreddit_name', 'subreddit_id']


In [72]:
df_agg_sub_c.iloc[-10:, :25]

Unnamed: 0,subreddit_name,subreddit_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21,embeddings_22
49615,zweitmeinung,t5_4kv9wc,-0.040518,-0.011396,0.016183,-0.028422,-0.066712,0.028768,0.004953,0.008945,-0.053609,-3.2e-05,0.001619,-0.01245,0.040547,0.004345,-0.002199,-0.034676,-0.012779,0.004648,0.000397,0.036897,-0.015819,-0.017496,0.018465
49616,zwift,t5_33t8j,-0.027395,-0.01527,-0.006319,0.015369,-0.038437,0.020171,0.009737,-0.007263,-0.038737,-0.005224,-0.004557,-0.015082,0.044456,-0.020974,0.019693,0.022184,0.010257,-0.010002,0.008148,-0.007386,0.002608,-0.006747,0.025665
49617,zwiftracing,t5_3y6zfn,0.042906,-0.018738,-0.014533,0.003808,-0.000362,0.047995,0.014497,0.017367,-0.034752,0.044055,-0.00636,-0.02363,-0.05347,-0.005947,0.043232,-0.004975,-0.013311,0.028611,0.027053,-0.042135,0.020192,-0.035474,-0.026483
49618,zwormz,t5_558h6l,0.001943,5e-05,-0.008841,0.013108,0.009782,0.013488,-0.004737,0.001171,-0.028631,0.011872,0.006817,0.001142,-0.025191,-0.030035,0.003945,0.01221,-0.002794,-0.027809,0.004584,0.033497,-0.01179,0.002807,-0.005111
49619,zxspectrum,t5_2ttxf,-0.012092,-0.025742,-0.00875,-0.024969,-0.018628,-0.006174,0.004686,2.1e-05,-0.027414,0.008352,0.017394,-0.017908,0.014743,0.003561,-0.000998,-0.01082,0.023164,-0.011636,0.012579,0.005268,-0.001191,-0.020699,0.030823
49620,zyn,t5_38nx2,-0.02206,0.010046,-0.03431,-0.022542,-0.017596,0.041781,0.011894,-0.038841,-0.051343,0.007992,0.021627,0.003101,0.044853,0.018825,0.002892,0.011217,0.001206,-0.023233,0.013473,0.018096,0.010907,-0.000968,0.021697
49621,zyramains,t5_35la7,-0.002744,-0.016453,0.005965,-0.016609,-0.007815,0.065392,0.024101,-0.003512,-0.027948,0.010691,0.016709,-0.00466,0.029199,-0.004788,0.012306,0.001346,-0.011995,-0.020919,-0.000191,-0.002442,-0.018061,-0.014261,0.015803
49622,zyzz,t5_2sosg,-0.011185,0.014175,0.004697,-0.001609,-0.025836,0.031206,0.017154,-0.009301,-0.047893,0.001483,0.010634,0.014197,0.024161,-0.011832,0.016491,-0.001671,0.006479,-0.015113,0.019097,0.009557,-0.008552,-0.011907,0.008517
49623,zzzzz,t5_2wxmu,0.109239,0.028351,-0.039005,-0.012098,-0.044105,-0.030205,-0.00099,-0.02961,-0.040511,0.022615,-0.070672,-0.024646,-0.010884,0.013012,0.025176,-0.066089,-0.014818,0.033189,0.01538,0.000433,0.007034,0.098833,-0.018256
49624,zzzzzzzzzzzzzzzzzzzz,t5_3an04,0.098792,0.029517,-0.038618,-0.002381,-0.054415,-0.0217,0.005971,-0.014944,-0.045893,0.020496,-0.060823,-0.017631,-0.0102,0.0029,0.024926,-0.050978,-0.009353,0.038723,0.017978,-7e-05,0.005893,0.103079,-0.026173


In [70]:
df_agg_sub_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49625 entries, 0 to 49624
Columns: 514 entries, subreddit_name to embeddings_511
dtypes: float64(512), object(2)
memory usage: 194.6+ MB


# Load subreddits from final table

There may be some that didn't make the final cut so we don't want to have stray subs in final table

In [193]:
TODO

NameError: name 'TODO' is not defined

In [194]:
run_id_final_model = 'e37b0a2c3af54c588818e7efdde15df5'

In [195]:
l_artifacts_top_level = mlf.list_run_artifacts(
    run_id=run_id_final_model,
    only_top_level=True,
    verbose=True,
)
len(l_artifacts_top_level)

18:06:23 | INFO | "    93 <- Artifacts to check count"
18:06:23 | INFO | "    93 <- Artifacts clean count"
18:06:23 | INFO | "    12 <- Artifacts & folders at TOP LEVEL clean count"


12

In [196]:
l_artifacts_all = mlf.list_run_artifacts(
    run_id=run_id_final_model,
    only_top_level=False,
    verbose=False,
)
len(l_artifacts_all)

18:06:39 | INFO | "    93 <- Artifacts clean count"
18:06:39 | INFO | "    12 <- Artifacts & folders at TOP LEVEL clean count"


93

In [197]:
l_artifacts_top_level

['X_linkage',
 'clustering.log',
 'clustering_model',
 'config',
 'df_accel',
 'df_classification_reports',
 'df_labels',
 'df_supervised_metrics',
 'figures',
 'hydra',
 'optimal_ks',
 'pipeline_params']

In [198]:
l_sub_c = [i for i in l_artifacts_all if 'df_labels' in i]
print(len(l_sub_c))
l_sub_c[:6]

2


['mlflow/mlruns/25/e37b0a2c3af54c588818e7efdde15df5/artifacts/df_labels/df_labels.csv',
 'mlflow/mlruns/25/e37b0a2c3af54c588818e7efdde15df5/artifacts/df_labels/df_labels.parquet']

In [199]:
df_labels = mlf.read_run_artifact(
    run_id=run_id_final_model,
    artifact_folder='df_labels',
    read_function='pd_parquet',
    verbose=False,
)
print(df_labels.shape)

18:08:49 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/mlflow/mlruns/25/e37b0a2c3af54c588818e7efdde15df5/artifacts/df_labels"
100%|###########################################| 2/2 [00:00<00:00, 9393.74it/s]
18:08:49 | INFO | "  Parquet files found:     1"
18:08:49 | INFO | "  Parquet files to use:     1"


(49558, 119)


In [200]:
df_labels.iloc[:5, :15]

Unnamed: 0,model_sort_order,subreddit_name,subreddit_id,primary_topic,posts_for_modeling_count,k_0010_label,k_0013_label,k_0020_label,k_0023_label,k_0030_label,k_0040_label,k_0041_label,k_0050_label,k_0059_label,k_0060_label
0,4694,0hthaatsjaay,t5_46wt4h,,64.0,2,2,2,2,3,4,5,5,5,5
1,5329,0nlyfantastic0,t5_4byrct,,13.0,2,2,2,2,3,4,5,5,5,5
2,3518,0nlyleaks,t5_36f9u6,,2.0,2,2,2,2,3,3,4,4,4,4
3,46739,0sanitymemes,t5_2qlzfy,Internet Culture and Memes,1000.0,10,13,20,22,27,37,38,46,55,56
4,31155,0xpolygon,t5_2qgijx,Crypto,1000.0,7,9,13,15,20,27,28,34,38,39


# Keep only subs that are in final model

In [201]:
print(df_agg_sub_c.shape)
df_agg_sub_c = df_agg_sub_c[df_agg_sub_c['subreddit_id'].isin(df_labels['subreddit_id'])]

print(df_agg_sub_c.shape)

(49625, 514)
(49558, 514)


# Build annoy index on sample

In [63]:
from subclu.models.nn_annoy import AnnoyIndex

In [216]:
%%time

index_cols = ['subreddit_id', 'subreddit_name']

nn_index = AnnoyIndex(
    # df_agg_sub_c[l_embedding_cols + index_cols].sample(n=11000, random_state=42),
    df_agg_sub_c[l_embedding_cols + index_cols],
    index_cols=index_cols,
    metric='angular',
    n_trees=100,
)

CPU times: user 88.7 ms, sys: 101 ms, total: 189 ms
Wall time: 187 ms


In [217]:
%%time
nn_index.build()

CPU times: user 42.2 s, sys: 0 ns, total: 42.2 s
Wall time: 4.15 s


In [218]:
%%time

n_test_i = 29500
nn_index.get_top_n_by_item(n_test_i, k=15, search_k=-1, include_distances=True)

CPU times: user 30.7 ms, sys: 0 ns, total: 30.7 ms
Wall time: 30 ms


Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance
0,t5_2t73u,motivationalpics,0,t5_2t73u,motivationalpics,0.0
1,t5_2t73u,motivationalpics,1,t5_2qvm1,motivation,0.458069
2,t5_2t73u,motivationalpics,2,t5_2qmtp,inspiration,0.498368
3,t5_2t73u,motivationalpics,3,t5_2d4ebq,motivateinspire,0.513564
4,t5_2t73u,motivationalpics,4,t5_2rplx,motivationalquotes,0.575202
5,t5_2t73u,motivationalpics,5,t5_2qhb0,positive,0.585118
6,t5_2t73u,motivationalpics,6,t5_2rmfx,getmotivated,0.595429
7,t5_2t73u,motivationalpics,7,t5_2s5f5,positivity,0.651482
8,t5_2t73u,motivationalpics,8,t5_srbv5,bloomer,0.79508
9,t5_2t73u,motivationalpics,9,t5_2tna8,howtonotgiveafuck,0.810315


In [219]:
%%time
nn_index.get_top_n_by_item(n_test_i, k=15, search_k=2, include_distances=True)

CPU times: user 30.6 ms, sys: 0 ns, total: 30.6 ms
Wall time: 29.5 ms


Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance
0,t5_2t73u,motivationalpics,0,t5_2t73u,motivationalpics,0.0
1,t5_2t73u,motivationalpics,1,t5_2qvm1,motivation,0.458069
2,t5_2t73u,motivationalpics,2,t5_2qmtp,inspiration,0.498368
3,t5_2t73u,motivationalpics,3,t5_2d4ebq,motivateinspire,0.513564
4,t5_2t73u,motivationalpics,4,t5_2rplx,motivationalquotes,0.575202
5,t5_2t73u,motivationalpics,5,t5_2qhb0,positive,0.585118
6,t5_2t73u,motivationalpics,6,t5_2rmfx,getmotivated,0.595429
7,t5_2t73u,motivationalpics,7,t5_2s5f5,positivity,0.651482
8,t5_2t73u,motivationalpics,8,t5_srbv5,bloomer,0.79508
9,t5_2t73u,motivationalpics,9,t5_2tna8,howtonotgiveafuck,0.810315


In [220]:
%%time
nn_index.get_top_n_by_item(n_test_i, k=20, include_distances=False)

CPU times: user 30.8 ms, sys: 0 ns, total: 30.8 ms
Wall time: 30.2 ms


Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b
0,t5_2t73u,motivationalpics,0,t5_2t73u,motivationalpics
1,t5_2t73u,motivationalpics,1,t5_2qvm1,motivation
2,t5_2t73u,motivationalpics,2,t5_2qmtp,inspiration
3,t5_2t73u,motivationalpics,3,t5_2d4ebq,motivateinspire
4,t5_2t73u,motivationalpics,4,t5_2rplx,motivationalquotes
5,t5_2t73u,motivationalpics,5,t5_2qhb0,positive
6,t5_2t73u,motivationalpics,6,t5_2rmfx,getmotivated
7,t5_2t73u,motivationalpics,7,t5_2s5f5,positivity
8,t5_2t73u,motivationalpics,8,t5_srbv5,bloomer
9,t5_2t73u,motivationalpics,9,t5_2tna8,howtonotgiveafuck


## Get df with all items

In [221]:
df_nn_top = nn_index.get_top_n_by_item_all(
    k=100, 
    search_k=-1, 
    include_distances=True,
    append_i=True,
)

100%|██████████| 49558/49558 [21:34<00:00, 38.28it/s]
18:44:54 | INFO | "(4906242, 6) <- df_top_items shape"


In [222]:
df_nn_top.head()

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance
1,t5_46wt4h,0hthaatsjaay,1,t5_5fweuy,bestpyt,0.505327
2,t5_46wt4h,0hthaatsjaay,2,t5_4ph6vm,babyfacejassbest,0.559713
3,t5_46wt4h,0hthaatsjaay,3,t5_4p3c20,officialtootie,0.561146
4,t5_46wt4h,0hthaatsjaay,4,t5_2kxm87,honeybthatsme,0.569104
5,t5_46wt4h,0hthaatsjaay,5,t5_3ng2du,yungblasian,0.578581


In [223]:
df_nn_top.tail()

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance
4955795,t5_3an04,zzzzzzzzzzzzzzzzzzzz,95,t5_4wqs4j,realtingz,1.087297
4955796,t5_3an04,zzzzzzzzzzzzzzzzzzzz,96,t5_59sn6v,nathalie_bw_kim_,1.087468
4955797,t5_3an04,zzzzzzzzzzzzzzzzzzzz,97,t5_49d7l3,cxynxe_,1.087931
4955798,t5_3an04,zzzzzzzzzzzzzzzzzzzz,98,t5_4qd76k,zarasecret,1.088321
4955799,t5_3an04,zzzzzzzzzzzzzzzzzzzz,99,t5_2ccaem,bellelongwell,1.088337


In [234]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'ich_iel']
    .head(15)
)

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
2153601,t5_37k29,ich_iel,1,t5_ofkj1,okbrudimongo,0.638241,0.796324
2153602,t5_37k29,ich_iel,2,t5_3hn0l,deutschememes,0.666231,0.778068
2153603,t5_37k29,ich_iel,3,t5_2xbtv,buenzli,0.677372,0.770584
2153604,t5_37k29,ich_iel,4,t5_2qmr6,aeiou,0.692639,0.760125
2153605,t5_37k29,ich_iel,5,t5_w2zxy,okoidawappler,0.697673,0.756626
2153606,t5_37k29,ich_iel,6,t5_pa6tc,okemakkermaloot,0.704992,0.751493
2153607,t5_37k29,ich_iel,7,t5_3yg9h2,ovalwichs,0.716985,0.742966
2153608,t5_37k29,ich_iel,8,t5_s3lnd,okcopainattard,0.720374,0.740531
2153609,t5_37k29,ich_iel,9,t5_2vlhq,kreiswichs,0.732838,0.731474
2153610,t5_37k29,ich_iel,10,t5_kj9sb,germanmemes,0.736073,0.729098


In [235]:
(
    df_nn_top[df_nn_top['subreddit_name_a'] == 'vegande']
    .head(15)
)

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
4677501,t5_37ruc,vegande,1,t5_4c06em,vegetarischde,0.402211,0.919113
4677502,t5_37ruc,vegande,2,t5_33xgk,veganuk,0.449804,0.898838
4677503,t5_37ruc,vegande,3,t5_megyr,portugalvegan,0.505472,0.872249
4677504,t5_37ruc,vegande,4,t5_2qhpm,vegan,0.505862,0.872052
4677505,t5_37ruc,vegande,5,t5_2uquu,askvegans,0.51628,0.866727
4677506,t5_37ruc,vegande,6,t5_5d72de,happyvegans,0.518205,0.865732
4677507,t5_37ruc,vegande,7,t5_2ecvn2,brasilvegan,0.527389,0.860931
4677508,t5_37ruc,vegande,8,t5_3lmvm,veganchill,0.542344,0.852931
4677509,t5_37ruc,vegande,9,t5_109235,exvegans,0.542686,0.852746
4677510,t5_37ruc,vegande,10,t5_3neeu,australianvegans,0.546296,0.85078


# Convert plain distance to cosine similarity
from [github](https://github.com/spotify/annoy/issues/112#issuecomment-686513356)
```
cosine_similarity = 1 - cosine_distance^2/2
```

In [236]:
%%time

col_cosine_similarity = 'cosine_similarity'

if col_cosine_similarity not in df_nn_top:
    df_nn_top[col_cosine_similarity] = (
        1 -
        (df_nn_top['distance'] ** 2) / 2
    )

CPU times: user 44 µs, sys: 0 ns, total: 44 µs
Wall time: 55.1 µs


In [237]:
df_nn_top.head()

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
1,t5_46wt4h,0hthaatsjaay,1,t5_5fweuy,bestpyt,0.505327,0.872322
2,t5_46wt4h,0hthaatsjaay,2,t5_4ph6vm,babyfacejassbest,0.559713,0.843361
3,t5_46wt4h,0hthaatsjaay,3,t5_4p3c20,officialtootie,0.561146,0.842558
4,t5_46wt4h,0hthaatsjaay,4,t5_2kxm87,honeybthatsme,0.569104,0.83806
5,t5_46wt4h,0hthaatsjaay,5,t5_3ng2du,yungblasian,0.578581,0.832622


In [238]:

# df_test_ = (
#     df_nn_top[df_nn_top['subreddit_name_a'] == 'vegande']
#     .head(15)
# )

# df_test_[col_cosine_similarity] = (
#     1 -
#     (df_test_['distance'] ** 2) / 2
# )

# df_test_

## Save df to GCS

From here we should be able to share & create a BigQuery table

In [None]:
TODO

In [241]:
gs_nearest_neighbors_root = 'gs://i18n-subreddit-clustering/data/models/nearest_neighbors'
gs_this_model = f"{gs_nearest_neighbors_root}/manual_model_{datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')}"
gs_this_model

'gs://i18n-subreddit-clustering/data/models/nearest_neighbors/manual_model_2022-03-28_191331'

In [242]:
%%time
shape_ = df_nn_top.shape

df_nn_top.to_parquet(
    f"{gs_this_model}/df_nearest_neighbors_top-{shape_[0]}_by_{shape_[1]}.parquet"
)

CPU times: user 2.42 s, sys: 0 ns, total: 2.42 s
Wall time: 4.07 s
