# Purpose

Test loading latest embeddings to make sure they have the expected columns.

# Imports & Setup

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution, so edits to local packages apply without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm import tqdm

import subclu
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)


# Record the Python and library versions used for this run (see output below).
print_lib_versions([np, pd, subclu])

python		v 3.7.10
===
numpy		v: 1.18.5
pandas		v: 1.2.5
subclu		v: 0.5.0


In [3]:
# Plotting: matplotlib plus its tick/date formatting helpers.
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's 'default' style sheet.
plt.style.use('default')

# Project helpers from subclu.utils.eda.
# NOTE(review): presumably these configure the logging format and the notebook
# display options -- confirm in subclu.utils.eda.
setup_logging()
notebook_display_config()
# Emit one INFO record to confirm that logging output is visible.
logging.info('loggging ready')

09:53:46 | INFO | "loggging ready"


# Auth note
This notebook assumes you have authenticated using the gcloud CLI. Example:
```bash
gcloud auth application-default login
```

# Load vectorized data (embeddings)



## Metadata

In [7]:
%%time

gcs_meta_embeddings = (
    """gs://i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555/000000000000-196371_by_514.parquet"""
)
df_emb_meta = pd.read_parquet(
    gcs_meta_embeddings
)

CPU times: user 2.92 s, sys: 1.46 s, total: 4.38 s
Wall time: 12.8 s


In [8]:
# Summarize the index range, column count, dtypes, and memory usage.
df_emb_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196371 entries, 0 to 196370
Columns: 514 entries, subreddit_id to embeddings_511
dtypes: float32(512), object(2)
memory usage: 386.5+ MB


In [9]:
# (rows, columns) of the loaded metadata embeddings.
df_emb_meta.shape

(196371, 514)

In [10]:
# Preview the first 6 rows and first 25 columns (ID columns plus the leading
# embedding dimensions) rather than rendering all 514 columns.
df_emb_meta.iloc[:6,:25]

Unnamed: 0,subreddit_id,subreddit_name,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21,embeddings_22
0,t5_2qh1i,askreddit,0.02279,-0.059892,-0.001451,0.046439,0.066976,0.064597,0.057965,-0.013464,-0.046748,-0.044211,-0.014546,-0.037996,0.028808,-0.060112,0.013253,-0.026466,0.061427,-0.066526,0.031853,0.000255,-0.06149,-0.054536,0.03371
1,t5_2qh3l,news,-0.048809,0.061595,0.045624,0.007804,0.072698,0.035929,0.063459,0.054612,-0.011245,0.070547,-0.031765,-0.057275,0.055175,-0.003909,-0.047737,0.011671,0.017775,-0.047829,-0.044221,-0.007286,0.01621,0.017299,0.011912
2,t5_2qh33,funny,0.063963,-0.064321,0.013925,-0.022936,0.065873,0.040977,0.03581,0.059591,-0.054132,-0.033169,-0.059364,-0.046414,0.024705,-0.029237,-0.027685,0.042819,-0.002543,-0.063652,0.06166,-0.056877,-0.051425,-0.063723,0.053308
3,t5_2y77d,antiwork,-0.02888,0.055796,0.04435,-0.00876,0.036146,0.033803,0.00908,-0.042765,0.053252,-0.022756,0.043832,-0.014859,-0.039325,0.025604,-0.066655,0.02849,0.048976,-0.052332,-0.066003,0.052697,0.061577,-0.063533,0.064345
4,t5_2qhsa,interestingasfuck,-0.045592,0.056386,-0.042588,0.049715,0.066143,0.046497,0.065407,-0.010752,-0.042452,0.04172,0.059271,-0.016806,0.054681,-0.058673,-0.05901,0.039047,0.013914,-0.065247,0.073246,-0.045689,-0.033733,-0.071861,0.062511
5,t5_2qh13,worldnews,0.068684,0.04441,0.051444,0.0215,-0.042308,-0.038877,0.034985,-0.067741,0.051444,0.067007,0.012721,-0.053409,-0.02648,0.039451,0.037109,0.023084,-0.000581,-0.034466,-0.020675,-0.054483,0.055364,-0.066214,0.030262


## Post + Comment

In [11]:
%%time

gcs_pc_sample = (
    "gs://i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925/000000000000-539559_by_515.parquet"
)

df_emb_pc = pd.read_parquet(
    gcs_pc_sample
)

print(df_emb_pc.shape)

(539559, 515)
CPU times: user 7.6 s, sys: 2.78 s, total: 10.4 s
Wall time: 43 s


In [12]:
# Preview the first 6 rows and first 25 columns (ID columns plus the leading
# embedding dimensions) rather than rendering all 515 columns.
df_emb_pc.iloc[:6,:25]

Unnamed: 0,subreddit_id,subreddit_name,post_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,embeddings_8,embeddings_9,embeddings_10,embeddings_11,embeddings_12,embeddings_13,embeddings_14,embeddings_15,embeddings_16,embeddings_17,embeddings_18,embeddings_19,embeddings_20,embeddings_21
0,t5_1009a3,memesenespanol,t3_v00j0e,-0.047121,0.009066,-0.053279,0.014465,-0.102776,0.005231,0.021099,-0.037455,-0.003633,-0.0381,0.013419,-0.005185,0.057369,-0.025638,0.011728,0.02334,0.014565,-0.004253,0.071688,-0.10455,0.019248,-0.000191
1,t5_1009a3,memesenespanol,t3_v0eg7b,-0.038859,-0.017192,-0.046399,0.053433,0.058012,0.029949,-0.001087,-0.055713,0.020075,-0.054805,0.014635,-0.042406,0.040106,-0.051338,-0.000938,-0.063216,-0.0551,0.056228,-0.068269,0.004469,0.013483,-0.012539
2,t5_1009a3,memesenespanol,t3_v0l7ym,0.039786,-0.017911,0.046004,-0.018101,-0.007475,0.035782,-0.037238,0.014344,-0.07229,0.007622,-0.038456,-0.021415,-0.04271,0.02948,0.023588,-0.012152,0.0036,-0.041428,-0.057036,-0.02801,0.002602,0.030365
3,t5_1009a3,memesenespanol,t3_v0l8vu,-0.044888,-0.012266,-0.022548,-0.039504,-0.075331,0.032429,0.01565,0.004406,-0.039762,-0.065734,0.05387,0.002627,0.04453,0.041666,-0.02459,0.004686,0.056469,-0.066854,0.009471,0.065157,-0.017683,0.035702
4,t5_1009a3,memesenespanol,t3_v0qrxj,-0.01972,-0.019271,-0.050748,0.032625,-0.048926,0.071098,0.008194,0.008235,-0.075312,0.048891,0.022565,0.057345,-0.033335,0.057854,0.03058,0.009637,-0.070813,0.011254,-0.045268,0.033784,0.042234,-0.055938
5,t5_1009a3,memesenespanol,t3_v0qse1,-0.067149,0.006003,-0.004868,0.006799,-0.118678,0.043354,-0.017358,-0.021344,-0.001903,0.013163,0.090672,0.019381,0.030529,0.003476,-0.00509,-0.019716,-0.011357,-0.057692,0.073803,0.076566,-0.025409,0.020549
