# Purpose


2022-06-29:
Use updated pandas function to get embeddings on VM machine with a ton of RAM.

Because we embedded post & text as a single embedding and we didn't use MLflow to create those embeddings, it's easier to just run the embeddings in the notebook than to re-use or re-write the old `AggregateEmbeddings` class.

Provenance:
* `v0.4.1 / djb_03.01-2021-12-aggregate_v041_posts_and_comments_pandas.ipynb`

# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

In [92]:
from datetime import datetime
import gc
import os
import logging
from logging import info
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import dask
from dask import dataframe as dd
from tqdm.auto import tqdm

import mlflow
import hydra

import subclu
from subclu.models.aggregate_embeddings import (
    AggregateEmbeddings, AggregateEmbeddingsConfig,
    load_config_agg_jupyter, get_dask_df_shape,
)
from subclu.models import aggregate_embeddings_pd

from subclu.utils import set_working_directory, get_project_subfolder
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric,
    elapsed_time,
)
from subclu.utils.mlflow_logger import MlflowLogger, save_pd_df_to_parquet_in_chunks
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)

from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_gcs import LoadSubredditsGCS


print_lib_versions([dask, hydra, mlflow, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
dask		v: 2021.06.0
hydra		v: 1.1.0
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.5.0


In [93]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Set Local model paths

In [94]:
manual_model_timestamp = datetime.utcnow().strftime('%Y-%m-%d_%H%M%S')
path_this_model = get_project_subfolder(
    f"data/models/aggregate_embeddings/manual_v050_{manual_model_timestamp}"
)
Path.mkdir(path_this_model, parents=True, exist_ok=True)
path_this_model

PosixPath('/home/jupyter/subreddit_clustering_i18n/data/models/aggregate_embeddings/manual_v050_2022-07-01_102413')

## Paths for embeddings

For v0.5.0 embeddings I didn't use mlflow to track the embeddings inference. We'll need to get them from these folders in GCS:

- [Subreddit metadata](https://console.cloud.google.com/storage/browser/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555)
    - `i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555`
- [Post + Comment Text (already combined)](https://console.cloud.google.com/storage/browser/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925)
    - `i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925`



In [65]:
RUN_DATE = '20220629'

BUCKET_NAME = 'i18n-subreddit-clustering'
EMBEDDINGS_SUB_ID = '2022-06-29_084555'
EMBEDDINGS_POST_COMMENT_ID = '2022-06-29_091925'

# Start MLflow & Log base params

In [95]:
mlf = MlflowLogger(tracking_uri='sqlite')

In [None]:
t_start_agg_embed = datetime.utcnow()
info(f"== Start run_aggregation() method ==")

info(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
mlf.set_experiment(mlflow_experiment)
mlflow.start_run(run_name=run_name)
mlf.add_git_hash_to_active_run()
mlf.set_tag_hostname(key='host_name')
mlf.log_param_hostname(key='host_name')
mlf.log_cpu_count()
mlf.log_ram_stats(param=True, only_memory_used=False)

In [None]:
elapsed_time()

# Load data

In [12]:
%%time


subs_v = LoadSubredditsGCS(
    bucket_name=BUCKET_NAME,
    gcs_path=f'i18n_topic_model_batch/runs/{RUN_DATE}/subreddits/text/embedding/{EMBEDDINGS_SUB_ID}',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='subreddit_id',
    df_format='pandas',
    unique_check=True,
    verbose= True,
    
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
)
subs_v.local_cache()

df_v_subs = subs_v.read_as_one_df()
print(df_v_subs.shape)

08:29:27 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555"
08:29:27 | INFO | "  5 <- Files matching prefix"
08:29:27 | INFO | "  5 <- Files to check"
08:29:27 | INFO | "    000000000000-196371_by_514.parquet <- File already exists, not downloading"
08:29:27 | INFO | "    2022-06-29_08-45-55_vectorize_text.log <- File already exists, not downloading"
08:29:27 | INFO | "  Files already cached: 2"
08:29:27 | INFO | "0:00:00.191944  <- Downloading files elapsed time"
08:29:27 | INFO | "  Files already downloaded."
08:29:27 | INFO | "  df format: pandas"
08:29:28 | INFO | "  Checking ID uniqueness..."


(196371, 514)
CPU times: user 1.66 s, sys: 1.81 s, total: 3.47 s
Wall time: 1.01 s


In [14]:
pc_v = LoadSubredditsGCS(
    bucket_name=BUCKET_NAME,
    gcs_path=f'i18n_topic_model_batch/runs/{RUN_DATE}/post_and_comment_text_combined/text_subreddit_seeds/embedding/{EMBEDDINGS_POST_COMMENT_ID}',
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='post_id',
    df_format='pandas',
    unique_check=False,
    verbose= True,
    
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
)
pc_v.local_cache()

df_v_pc = pc_v.read_as_one_df()
print(df_v_pc.shape)

08:31:30 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925"
08:31:30 | INFO | "  39 <- Files matching prefix"
08:31:30 | INFO | "  39 <- Files to check"
08:37:29 | INFO | "  Files already cached: 0"
08:37:29 | INFO | "0:05:59.499427  <- Downloading files elapsed time"
08:37:29 | INFO | "  Files already downloaded."
08:37:29 | INFO | "  df format: pandas"


(16360314, 515)


# Set weights & create copy dfs for new weights

In [15]:
# set weights
WEIGHT_POST_COMMENT = 0.85
WEIGHT_SUB_META = 0.15
assert(1.0 == WEIGHT_POST_COMMENT + WEIGHT_SUB_META)

In [73]:
l_ix_sub_level = ['subreddit_id', 'subreddit_name']
l_ix_post_level = l_ix_sub_level + ['post_id']

l_embedding_cols = [c for c in df_v_pc if c.startswith('embeddings_')]
print(len(l_embedding_cols))

512


In [16]:
%%time
df_v_pc_weighted = df_v_pc.copy()

df_v_subs_weighted = df_v_subs.copy()

CPU times: user 6.61 s, sys: 6.52 s, total: 13.1 s
Wall time: 13.1 s


In [20]:
# should be True
np.allclose(df_v_pc_weighted.iloc[:1000,3:515], df_v_pc.iloc[:1000,3:515])

False

In [18]:
%%time
# apply weight to all posts & subreddit meta at once (vectorized)
df_v_subs_weighted[l_embedding_cols] = df_v_subs_weighted[l_embedding_cols] * WEIGHT_SUB_META

CPU times: user 290 ms, sys: 200 ms, total: 490 ms
Wall time: 489 ms


In [19]:
%%time
# apply weight to all posts & subreddit meta at once (vectorized)
df_v_pc_weighted[l_embedding_cols] = df_v_pc_weighted[l_embedding_cols] * WEIGHT_POST_COMMENT

CPU times: user 25.7 s, sys: 17.8 s, total: 43.4 s
Wall time: 43.4 s


In [20]:
# Should be False
np.allclose(df_v_pc_weighted.iloc[:1000,3:515], df_v_pc.iloc[:1000,3:515])

False

# Aggregate to Post-Level: Post&Comments + Subreddit Meta

It's better to let pandas handle the interations with `.groupby('subreddit_id')`. Otherwise we have to create masks for each subreddit that can take much longer (10+ hours).

- ETA with masks: +17.6 hours
- ETA with groupby ~2.5 hours

```
# mask:
0%  329/81973 [04:18<17:42:36, 1.28it/s]

# .groupby()
6% 4751/81973 [09:56<2:35:06, 8.30it/s]
```

In [86]:
%%time
info(f"Start C1 - posts + comments + sub descriptions")
l_df_c1_weights = list()

for s_id, df_ in tqdm(
    df_v_pc_weighted.head(123000).groupby('subreddit_id'),
    ascii=True, mininterval=5,
):
    df_.loc[:, l_embedding_cols] = np.add(
        df_v_subs_weighted[df_v_subs_weighted['subreddit_id'] == s_id][l_embedding_cols].to_numpy(),
        df_[l_embedding_cols]
    )
    l_df_c1_weights.append(df_)
    del df_


info(f"Create new C1 df")
df_posts_agg_c1 = pd.concat(l_df_c1_weights, ignore_index=True)
print(df_posts_agg_c1.shape)

info(f"C1 - post level complete")

10:19:50 | INFO | "Start C1 - posts + comments + sub descriptions"


  0%|          | 0/573 [00:00<?, ?it/s]

10:21:00 | INFO | "Create new C1 df"
10:21:00 | INFO | "C1 - post level complete"


(123000, 515)
CPU times: user 1min 9s, sys: 249 ms, total: 1min 10s
Wall time: 1min 9s


In [87]:
df_posts_agg_c1.iloc[:5, :10]

Unnamed: 0,subreddit_id,subreddit_name,post_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6
0,t5_1009a3,memesenespanol,t3_v00j0e,-0.038424,-0.001508,-0.044316,0.006019,-0.078988,0.012636,0.014201
1,t5_1009a3,memesenespanol,t3_v0eg7b,-0.031401,-0.023828,-0.038469,0.039142,0.057682,0.033646,-0.004657
2,t5_1009a3,memesenespanol,t3_v0l7ym,0.035448,-0.024439,0.040074,-0.021662,0.002018,0.038604,-0.035386
3,t5_1009a3,memesenespanol,t3_v0l8vu,-0.036525,-0.019641,-0.018195,-0.039854,-0.05566,0.035754,0.00957
4,t5_1009a3,memesenespanol,t3_v0qrxj,-0.015133,-0.025595,-0.042165,0.021455,-0.033216,0.068623,0.003232


### Save post-level

In [100]:
%%time
# for post-level, we want to save using dask because it'll save to multiple files
from collections import defaultdict
d_dfs_to_save = defaultdict(dict)

d_dfs_to_save['df_posts_agg_c1']['local'] = (
    path_this_model / f"df_posts_agg_c1_{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}"
)
d_dfs_to_save['df_posts_agg_c1']['df'] = df_posts_agg_c1

save_pd_df_to_parquet_in_chunks(
    df_posts_agg_c1,
    d_dfs_to_save['df_posts_agg_c1']['local'],
    write_index=False
)

10:44:25 | INFO | "Converting pandas to dask..."
10:44:25 | INFO | "   263.7 MB <- Memory usage"
10:44:25 | INFO | "       1	<- target Dask partitions	  350.0 <- target MB partition size"


CPU times: user 3.84 s, sys: 322 ms, total: 4.16 s
Wall time: 4.16 s


In [None]:
info(f"  Logging df to mlflow...")
mlflow.log_artifacts(d_dfs_to_save['df_posts_agg_c1']['local'], artifact_path=d_dfs_to_save['df_posts_agg_c1'])

# Aggregate to Subreddit Level

In [89]:
%%time

info(f"SUBREDDIT-LEVEL C1 - posts + comments + sub descriptions")
df_subs_agg_c1 = (
    df_posts_agg_c1
    .groupby(l_ix_sub_level, as_index=False)
    .mean()
    .sort_values(by=l_ix_sub_level)
)
info(f"  {df_subs_agg_c1.shape} <- df_subs_agg_c1.shape (posts + comments + sub description)")

10:21:03 | INFO | "SUBREDDIT-LEVEL C1 - posts + comments + sub descriptions"
10:21:03 | INFO | "  (573, 514) <- df_subs_agg_c1.shape (posts + comments + sub description)"


CPU times: user 310 ms, sys: 155 ms, total: 465 ms
Wall time: 462 ms


In [90]:
df_subs_agg_c1.iloc[-8:, :10]

Unnamed: 0,subreddit_id,subreddit_name,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7
565,t5_121pbz,danidezzi,-0.003288,0.024298,0.041657,0.037304,-0.016999,0.017798,0.020901,-0.009704
566,t5_121sso,subsimulatorgpt2,0.010741,0.001471,-0.005019,-0.002949,0.000168,0.02552,0.002405,-0.006943
567,t5_121vdk,subsimulatorgpt2meta,-0.017591,0.024974,-0.020804,0.00936,0.018893,0.006528,-0.011085,-0.016565
568,t5_1229nt,footfunction,-0.046698,-0.002341,0.010889,0.005713,-0.04418,0.044654,0.066613,0.021109
569,t5_1229sf,adorablebdsm,-0.005888,0.007911,0.013208,0.02546,0.023216,0.035588,-0.012712,-0.045977
570,t5_122e6w,russian_forest,-0.009,0.013543,-0.001294,-0.018704,0.018142,0.020227,0.004843,-0.048714
571,t5_122ek4,auroracomic,0.006227,-0.011303,0.005415,-0.019146,0.001395,0.030257,-0.01464,-0.002675
572,t5_122hf1,eldenring,-0.005069,0.00785,0.004079,0.020388,-0.002498,0.059155,0.005834,0.014089


### Save Subreddit level

This one we can save as a pandas df, no need to split it into multiple files

In [96]:
%%time
df_subs_agg_c1.to_parquet(
    path_this_model / f"df_subs_agg_c1-{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}.parquet"
)

CPU times: user 56.8 ms, sys: 8.01 ms, total: 64.8 ms
Wall time: 63.8 ms


In [97]:
%%time
save_pd_df_to_parquet_in_chunks(
    df_subs_agg_c1,
    path_this_model / f"df_subs_agg_c1_{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}",
    write_index=False
)

10:35:59 | INFO | "Converting pandas to dask..."
10:35:59 | INFO | "     1.2 MB <- Memory usage"
10:35:59 | INFO | "       1	<- target Dask partitions	   80.0 <- target MB partition size"


CPU times: user 179 ms, sys: 0 ns, total: 179 ms
Wall time: 176 ms


In [None]:
# finish logging total time + end mlflow run
total_fxn_time = elapsed_time(start_time=t_start_agg_embed, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('full_aggregation_fxn_minutes',
                  total_fxn_time / timedelta(minutes=1)
                  )

# Test run on data sample

In [None]:
BREAK

## Aggregate to Post Level

In [38]:
# # DON'T DO THIS -- iterating per sub + mask takes waaaay longer. 
# for s_id in tqdm(
#     df_v_pc_weighted['subreddit_id'].unique(),
#     ascii=True, mininterval=2
# ):
#     mask_sub_posts = df_v_pc_weighted['subreddit_id'] == s_id
    
# #     df_v_pc_weighted.loc[mask_sub_posts, l_embedding_cols] = np.add(
# #         df_v_subs_weighted[df_v_subs_weighted['subreddit_id'] == s_id][l_embedding_cols].to_numpy(),
# #         df_v_pc_weighted[mask_sub_posts][l_embedding_cols]
# #     ) 

In [41]:
l_df_c1_weights_test = list()

for s_id, df_ in tqdm(
    df_v_pc_weighted.head(123000).groupby('subreddit_id'),
    ascii=True, mininterval=5,
):
    df_.loc[:, l_embedding_cols] = np.add(
        df_v_subs_weighted[df_v_subs_weighted['subreddit_id'] == s_id][l_embedding_cols].to_numpy(),
        df_[l_embedding_cols]
    )
    l_df_c1_weights_test.append(df_)

  0%|          | 0/573 [00:00<?, ?it/s]

In [43]:
len(l_df_c1_weights_test)

573

In [44]:
%%time
df_posts_agg_c1_test = pd.concat(l_df_c1_weights_test, ignore_index=True)
print(df_posts_agg_c1_test.shape)

(123000, 515)
CPU times: user 297 ms, sys: 121 ms, total: 418 ms
Wall time: 418 ms


### Save post-level

In [81]:
# for post-level, we want to save using dask because it'll save to multiple files

save_pd_df_to_parquet_in_chunks(
    df_posts_agg_c1_test,
    path_this_model / f"df_posts_agg_c1_test_{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}",
    write_index=False
)

10:08:25 | INFO | "Converting pandas to dask..."
10:08:25 | INFO | "   263.7 MB <- Memory usage"
10:08:25 | INFO | "       2	<- target Dask partitions	  250.0 <- target MB partition size"


## Aggregate to Subreddit Level

In [74]:
l_ix_sub_level

['subreddit_id', 'subreddit_name']

In [75]:

info(f"C1 - posts + comments + sub descriptions")
df_subs_agg_c1_test = (
    df_posts_agg_c1_test
    .groupby(l_ix_sub_level, as_index=False)
    .mean()
    .sort_values(by=l_ix_sub_level)
)
info(f"  {df_subs_agg_c1_test.shape} <- df_subs_agg_c1.shape (posts + comments + sub description)")

10:04:49 | INFO | "C1 - posts + comments + sub descriptions"
10:04:49 | INFO | "  (573, 514) <- df_subs_agg_c.shape (posts + comments + sub description)"


In [80]:
df_subs_agg_c1_test.iloc[-8:, :10]

Unnamed: 0,subreddit_id,subreddit_name,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7
565,t5_121pbz,danidezzi,-0.003288,0.024298,0.041657,0.037304,-0.016999,0.017798,0.020901,-0.009704
566,t5_121sso,subsimulatorgpt2,0.010741,0.001471,-0.005019,-0.002949,0.000168,0.02552,0.002405,-0.006943
567,t5_121vdk,subsimulatorgpt2meta,-0.017591,0.024974,-0.020804,0.00936,0.018893,0.006528,-0.011085,-0.016565
568,t5_1229nt,footfunction,-0.046698,-0.002341,0.010889,0.005713,-0.04418,0.044654,0.066613,0.021109
569,t5_1229sf,adorablebdsm,-0.005888,0.007911,0.013208,0.02546,0.023216,0.035588,-0.012712,-0.045977
570,t5_122e6w,russian_forest,-0.009,0.013543,-0.001294,-0.018704,0.018142,0.020227,0.004843,-0.048714
571,t5_122ek4,auroracomic,0.006227,-0.011303,0.005415,-0.019146,0.001395,0.030257,-0.01464,-0.002675
572,t5_122hf1,eldenring,-0.005069,0.00785,0.004079,0.020388,-0.002498,0.059155,0.005834,0.014089


### Save Subreddit level

This one we can save as a pandas df, no need to split it into multiple files

In [82]:
df_subs_agg_c1_test.to_parquet(
    path_this_model / f"df_subs_agg_c1_test-{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}.parquet"
)

## Other tests

In [None]:
BREAK

In [46]:
# expected it to be false
np.allclose(df_v_pc_weighted.iloc[:1000,3:515], df_posts_agg_c1_test.iloc[:1000,3:515])

False

In [59]:
df_posts_agg_c1_test.iloc[-3:, :10]

Unnamed: 0,subreddit_id,subreddit_name,post_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6
122997,t5_122hf1,eldenring,t3_vkqy3i,-0.014552,0.022346,0.002888,0.060437,-0.053684,0.042971,0.035834
122998,t5_122hf1,eldenring,t3_vkqzsw,-0.001896,0.067519,0.029062,-0.003681,0.011099,0.066369,0.044884
122999,t5_122hf1,eldenring,t3_vkr34h,-0.018741,0.027568,-0.010963,0.050802,-0.05583,0.069541,0.011007


In [56]:
df_v_pc.iloc[:3, :10]

Unnamed: 0,subreddit_id,subreddit_name,post_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6
0,t5_1009a3,memesenespanol,t3_v00j0e,-0.047121,0.009066,-0.053279,0.014465,-0.102776,0.005231,0.021099
1,t5_1009a3,memesenespanol,t3_v0eg7b,-0.038859,-0.017192,-0.046399,0.053433,0.058012,0.029949,-0.001087
2,t5_1009a3,memesenespanol,t3_v0l7ym,0.039786,-0.017911,0.046004,-0.018101,-0.007475,0.035782,-0.037238


In [57]:
df_v_pc_weighted.iloc[:3, :10]

Unnamed: 0,subreddit_id,subreddit_name,post_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6
0,t5_1009a3,memesenespanol,t3_v00j0e,-0.040053,0.007706,-0.045287,0.012295,-0.08736,0.004446,0.017934
1,t5_1009a3,memesenespanol,t3_v0eg7b,-0.03303,-0.014613,-0.039439,0.045418,0.04931,0.025457,-0.000924
2,t5_1009a3,memesenespanol,t3_v0l7ym,0.033818,-0.015224,0.039104,-0.015386,-0.006354,0.030415,-0.031653


In [58]:
df_posts_agg_c1_test.iloc[:3, :10]

Unnamed: 0,subreddit_id,subreddit_name,post_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6
0,t5_1009a3,memesenespanol,t3_v00j0e,-0.038424,-0.001508,-0.044316,0.006019,-0.078988,0.012636,0.014201
1,t5_1009a3,memesenespanol,t3_v0eg7b,-0.031401,-0.023828,-0.038469,0.039142,0.057682,0.033646,-0.004657
2,t5_1009a3,memesenespanol,t3_v0l7ym,0.035448,-0.024439,0.040074,-0.021662,0.002018,0.038604,-0.035386


In [54]:
df_v_subs_weighted[df_v_subs_weighted['subreddit_name'] == 'memesenespanol'].iloc[:, :10]

Unnamed: 0,subreddit_id,subreddit_name,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7
39811,t5_1009a3,memesenespanol,0.001629,-0.009215,0.000971,-0.006276,0.008371,0.008189,-0.003733,0.005192


In [55]:
df_v_subs[df_v_subs['subreddit_name'] == 'memesenespanol'].iloc[:, :10]

Unnamed: 0,subreddit_id,subreddit_name,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7
39811,t5_1009a3,memesenespanol,0.010863,-0.061431,0.006471,-0.041841,0.055809,0.054596,-0.024888,0.034613
