# Purpose


2022-06-29:
Use the updated pandas function to get embeddings on a VM with a ton of RAM.

Because we embedded post & comment text as a single embedding and we didn't use MLflow to create those embeddings, it's easier to just run the aggregation in the notebook than to re-use or re-write the old `AggregateEmbeddings` class.

Provenance:
* `v0.4.1 / djb_03.01-2021-12-aggregate_v041_posts_and_comments_pandas.ipynb`

# Notebook setup

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from datetime import datetime, timedelta
import gc
import os
import logging
from logging import info
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns

import dask
from dask import dataframe as dd
from tqdm.auto import tqdm

import mlflow
import hydra

import subclu
from subclu.models.aggregate_embeddings import (
    AggregateEmbeddings, AggregateEmbeddingsConfig,
    load_config_agg_jupyter, get_dask_df_shape,
)
from subclu.models import aggregate_embeddings_pd

from subclu.utils import set_working_directory, get_project_subfolder
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric,
    elapsed_time,
)
from subclu.utils.mlflow_logger import MlflowLogger, save_pd_df_to_parquet_in_chunks
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)

from subclu.i18n_topic_model_batch.subclu2.utils.data_loaders_gcs import LoadSubredditsGCS


# Record the versions of the main libraries used for this run in the cell output.
print_lib_versions([dask, hydra, mlflow, np, pd, plotly, sns, subclu])

python		v 3.7.10
===
dask		v: 2021.06.0
hydra		v: 1.1.0
mlflow		v: 1.16.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
seaborn		v: 0.11.1
subclu		v: 0.5.0


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's 'default' style sheet.
plt.style.use('default')

# Project helpers from subclu.utils.eda; presumably they configure log formatting
# and notebook/pandas display options -- TODO confirm against their definitions.
setup_logging()
notebook_display_config()

# Set Local model paths

In [4]:
# A UTC timestamp in the folder name gives every manual run its own output folder.
manual_model_timestamp = f"{datetime.utcnow():%Y-%m-%d_%H%M%S}"
path_this_model = get_project_subfolder(
    "data/models/aggregate_embeddings/manual_v050_" + manual_model_timestamp
)
# Create the folder (and any missing parents); no error if it already exists.
path_this_model.mkdir(parents=True, exist_ok=True)
path_this_model

PosixPath('/home/jupyter/subreddit_clustering_i18n/data/models/aggregate_embeddings/manual_v050_2022-07-01_114214')

## Paths for embeddings

For v0.5.0 embeddings I didn't use mlflow to track the embeddings inference. We'll need to get them from these folders in GCS:

- [Subreddit metadata](https://console.cloud.google.com/storage/browser/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555)
    - `i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555`
- [Post + Comment Text (already combined)](https://console.cloud.google.com/storage/browser/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925)
    - `i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925`



In [5]:
# Run-date folder of the batch job in GCS (used to build the embedding paths below).
RUN_DATE = '20220629'

BUCKET_NAME = 'i18n-subreddit-clustering'
# Timestamped folder names of the two embedding jobs (see the GCS paths listed above).
EMBEDDINGS_SUB_ID = '2022-06-29_084555'
EMBEDDINGS_POST_COMMENT_ID = '2022-06-29_091925'

# Start MLflow & Log base params

In [6]:
# Project wrapper around MLflow; the resolved sqlite tracking URI is logged in the next cell.
mlf = MlflowLogger(tracking_uri='sqlite')

In [7]:
# Number of posts to keep when testing; None means aggregate every post.
n_posts_sample = None

# Available experiments: 'v0.5.0_mUSE_aggregates', 'v0.5.0_mUSE_aggregates_test'
mlflow_experiment = 'v0.5.0_mUSE_aggregates'

# Start of the whole aggregation; read again at the end to log the total run time.
t_start_agg_embed = datetime.utcnow()
info("== Start run_aggregation() method ==")
info(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")

# Open the run, then tag it with code version and host details.
mlf.set_experiment(mlflow_experiment)
mlflow.start_run()
mlf.add_git_hash_to_active_run()
mlf.set_tag_hostname(key='host_name')
mlf.log_param_hostname(key='host_name')
mlf.log_cpu_count()
mlf.log_ram_stats(param=True, only_memory_used=False)

11:42:15 | INFO | "== Start run_aggregation() method =="
11:42:15 | INFO | "MLflow tracking URI: sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db"
11:42:16 | INFO | "host_name: djb-100-2021-04-28-djb-eda-german-subs"
11:42:16 | INFO | "cpu_count: 96"
11:42:16 | INFO | "RAM stats:
{'memory_used_percent': '1.01%', 'memory_total': '1,444,961', 'memory_used': '14,599', 'memory_free': '1,381,009'}"


{'memory_total': 1444961,
 'memory_used_percent': 0.010103386873417344,
 'memory_used': 14599,
 'memory_free': 1381009}

In [8]:
# Relative weights of the two embedding sources in the combined post-level vector.
WEIGHT_POST_COMMENT = 0.85
WEIGHT_SUB_META = 0.15
# The weights must sum to 1. Compare with a tolerance because exact float
# equality is fragile and can reject valid weight pairs.
assert np.isclose(WEIGHT_POST_COMMENT + WEIGHT_SUB_META, 1.0), (
    f"Weights must sum to 1.0, got {WEIGHT_POST_COMMENT + WEIGHT_SUB_META}"
)


# GCS prefixes (inside BUCKET_NAME) that hold the pre-computed embeddings.
gcs_sub_embeddings = f'i18n_topic_model_batch/runs/{RUN_DATE}/subreddits/text/embedding/{EMBEDDINGS_SUB_ID}'
gcs_post_comment_embeddings = (
    f'i18n_topic_model_batch/runs/{RUN_DATE}/post_and_comment_text_combined/text_subreddit_seeds/embedding/{EMBEDDINGS_POST_COMMENT_ID}'
)


# Log inputs and weights so the run can be reproduced from the MLflow record.
mlflow.log_params(
    {
        'embeddings_bucket': BUCKET_NAME,
        'embeddings_subreddit_path': gcs_sub_embeddings,
        'embeddings_post_and_comments_path': gcs_post_comment_embeddings,
        'weight_post_and_comments': WEIGHT_POST_COMMENT,
        'weight_subreddit_meta': WEIGHT_SUB_META,
    }
)

# Load data

In [9]:
%%time
t_start_data_load_ = datetime.utcnow()

subs_v = LoadSubredditsGCS(
    bucket_name=BUCKET_NAME,
    gcs_path=gcs_sub_embeddings,
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='subreddit_id',
    df_format='pandas',
    unique_check=True,
    verbose= True,
    
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
)
subs_v.local_cache()

df_v_subs = subs_v.read_as_one_df()
r_subs, c_subs = df_v_subs.shape
mlflow.log_metrics(
    {
        f"df_v_subs-rows": r_subs,
        f"df_v_subs-cols": c_subs,
    }
)
print(f"{r_subs:,.0f} rows, {c_subs:,.0f} cols")

11:42:16 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/subreddits/text/embedding/2022-06-29_084555"
11:42:16 | INFO | "  5 <- Files matching prefix"
11:42:16 | INFO | "  5 <- Files to check"
11:42:16 | INFO | "    000000000000-196371_by_514.parquet <- File already exists, not downloading"
11:42:16 | INFO | "    2022-06-29_08-45-55_vectorize_text.log <- File already exists, not downloading"
11:42:16 | INFO | "  Files already cached: 2"
11:42:16 | INFO | "  Files already downloaded."
11:42:16 | INFO | "  df format: pandas"
11:42:17 | INFO | "  Checking ID uniqueness..."


196,371 rows, 514 cols
CPU times: user 1.66 s, sys: 1.77 s, total: 3.42 s
Wall time: 1.05 s


In [10]:
%%time

pc_v = LoadSubredditsGCS(
    bucket_name=BUCKET_NAME,
    gcs_path=gcs_post_comment_embeddings,
    local_cache_path="/home/jupyter/subreddit_clustering_i18n/data/local_cache/",
    columns=None,
    col_unique_check='post_id',
    df_format='pandas',
    unique_check=False,
    verbose= True,
    
    n_sample_files=None,
    n_files_slice_start=None,
    n_files_slice_end=None,
)
pc_v.local_cache()

df_v_pc = pc_v.read_as_one_df()
r_pc, c_pc = df_v_pc.shape
mlflow.log_metrics(
    {
        f"df_v_post_comments-rows": r_pc,
        f"df_v_post_comments-cols": c_pc,
    }
)
print(f"{r_pc:,.0f} rows, {c_pc:,.0f} cols")

t_data_load = elapsed_time(start_time=t_start_data_load_, log_label='Data Loading Time', verbose=True)
mlflow.log_metric('time_fxn-data_loading_time',
                  t_data_load / timedelta(minutes=1)
                  )
mlf.log_ram_stats(only_memory_used=True)

11:42:17 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/i18n-subreddit-clustering/i18n_topic_model_batch/runs/20220629/post_and_comment_text_combined/text_subreddit_seeds/embedding/2022-06-29_091925"
11:42:17 | INFO | "  39 <- Files matching prefix"
11:42:17 | INFO | "  39 <- Files to check"
11:42:17 | INFO | "    000000000000-539559_by_515.parquet <- File already exists, not downloading"
11:42:17 | INFO | "    000000000001-598576_by_515.parquet <- File already exists, not downloading"
11:42:17 | INFO | "    000000000002-444545_by_515.parquet <- File already exists, not downloading"
11:42:17 | INFO | "    000000000003-274825_by_515.parquet <- File already exists, not downloading"
11:42:17 | INFO | "    000000000004-288676_by_515.parquet <- File already exists, not downloading"
11:42:17 | INFO | "    000000000005-324438_by_515.parquet <- File already exists, not downloading"
11:42:17 | INFO | "    000000000006-376924_by_515.p

16,360,314 rows, 515 cols


11:43:12 | INFO | "RAM stats:
{'memory_used_percent': '5.37%', 'memory_used': '77,626'}"


CPU times: user 1min 48s, sys: 2min 4s, total: 3min 53s
Wall time: 55.5 s


{'memory_used_percent': 0.05372186515760633, 'memory_used': 77626}

# Set weights & create copy dfs for new weights

In [11]:
# Columns that identify a subreddit, and a post within a subreddit.
l_ix_sub_level = ['subreddit_id', 'subreddit_name']
l_ix_post_level = [*l_ix_sub_level, 'post_id']

# Embedding columns are recognised by their name prefix.
l_embedding_cols = [col_ for col_ in df_v_pc.columns if col_.startswith('embeddings_')]
print(len(l_embedding_cols))

512


In [12]:
%%time
#### UPDATE TO RESET TEST
if n_posts_sample is not None:
    df_v_pc_weighted = df_v_pc.head(n_posts_sample).copy()
else:
    df_v_pc_weighted = df_v_pc.copy()

df_v_subs_weighted = df_v_subs.copy()

CPU times: user 6.96 s, sys: 6 s, total: 13 s
Wall time: 13 s


In [13]:
# should be True: no weights applied yet, so the copy must match the raw embeddings.
# Embedding columns are selected by name instead of hard-coded positions (3:515);
# rows are sliced first so only 1,000 rows are copied.
np.allclose(
    df_v_pc_weighted.iloc[:1000][l_embedding_cols],
    df_v_pc.iloc[:1000][l_embedding_cols],
)

True

In [14]:
%%time
# apply weight to all posts & subreddit meta at once (vectorized)
df_v_subs_weighted[l_embedding_cols] = df_v_subs_weighted[l_embedding_cols] * WEIGHT_SUB_META

CPU times: user 280 ms, sys: 157 ms, total: 436 ms
Wall time: 435 ms


In [15]:
%%time
# apply weight to all posts & subreddit meta at once (vectorized)
df_v_pc_weighted[l_embedding_cols] = df_v_pc_weighted[l_embedding_cols] * WEIGHT_POST_COMMENT

CPU times: user 25.9 s, sys: 13.7 s, total: 39.6 s
Wall time: 39.5 s


In [16]:
# Should be False: the weighted copy must now differ from the raw embeddings.
# Embedding columns are selected by name instead of hard-coded positions (3:515);
# rows are sliced first so only 1,000 rows are copied.
np.allclose(
    df_v_pc_weighted.iloc[:1000][l_embedding_cols],
    df_v_pc.iloc[:1000][l_embedding_cols],
)

False

# Aggregate to Post-Level: Post&Comments + Subreddit Meta

It's better to let pandas handle the iterations with `.groupby('subreddit_id')`. Otherwise we have to create a boolean mask for each subreddit, which can take much longer (10+ hours).

- ETA with masks: +17.6 hours
- ETA with groupby ~2.5 hours

```
# mask:
0%  329/81973 [04:18<17:42:36, 1.28it/s]

# .groupby()
6% 4751/81973 [09:56<2:35:06, 8.30it/s]
```

In [17]:
%%time
info(f"Start C1 - posts + comments + sub descriptions")
t_start_agg_post_c1 = datetime.utcnow()

l_df_c1_weights = list()

for s_id, df_ in tqdm(
    df_v_pc_weighted.groupby('subreddit_id'),
    ascii=True, mininterval=5,
):
    df_.loc[:, l_embedding_cols] = np.add(
        df_v_subs_weighted[df_v_subs_weighted['subreddit_id'] == s_id][l_embedding_cols].to_numpy(),
        df_[l_embedding_cols]
    )
    l_df_c1_weights.append(df_)
    del df_


info(f"Create new C1 df")
df_posts_agg_c1 = pd.concat(l_df_c1_weights, ignore_index=True)

r_, c_ = df_posts_agg_c1.shape
mlflow.log_metrics(
    {
        f"df_posts_agg_c1-rows": r_,
        f"df_posts_agg_c1-cols": c_,
    }
)
print(f"{r_:,.0f} rows, {c_:,.0f} cols")
del r_, c_

t_agg_pc_c1 = elapsed_time(start_time=t_start_agg_post_c1, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-df_posts_agg_c1',
                  t_agg_pc_c1 / timedelta(minutes=1)
                  )
info(f"C1 - post level complete")
mlf.log_ram_stats(only_memory_used=True)

11:44:06 | INFO | "Start C1 - posts + comments + sub descriptions"


  0%|          | 0/81973 [00:00<?, ?it/s]

14:24:56 | INFO | "Create new C1 df"
14:25:37 | INFO | "  2:41:31.360312 <- Total Agg fxn time time elapsed"
14:25:37 | INFO | "C1 - post level complete"


16,360,314 rows, 515 cols


14:25:38 | INFO | "RAM stats:
{'memory_used_percent': '11.45%', 'memory_used': '165,383'}"


CPU times: user 2h 40min 33s, sys: 1min 28s, total: 2h 42min 1s
Wall time: 2h 41min 32s


{'memory_used_percent': 0.11445499221086244, 'memory_used': 165383}

In [18]:
# Peek at the first 5 rows and first 10 columns of the post-level aggregates.
df_posts_agg_c1.iloc[:5, :10]

Unnamed: 0,subreddit_id,subreddit_name,post_id,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6
0,t5_1009a3,memesenespanol,t3_v00j0e,-0.038424,-0.001508,-0.044316,0.006019,-0.078988,0.012636,0.014201
1,t5_1009a3,memesenespanol,t3_v0eg7b,-0.031401,-0.023828,-0.038469,0.039142,0.057682,0.033646,-0.004657
2,t5_1009a3,memesenespanol,t3_v0l7ym,0.035448,-0.024439,0.040074,-0.021662,0.002018,0.038604,-0.035386
3,t5_1009a3,memesenespanol,t3_v0l8vu,-0.036525,-0.019641,-0.018195,-0.039854,-0.05566,0.035754,0.00957
4,t5_1009a3,memesenespanol,t3_v0qrxj,-0.015133,-0.025595,-0.042165,0.021455,-0.033216,0.068623,0.003232


### Save post-level

In [19]:
# Nested dict keyed by frame name; the cells below store each frame's local output path under 'local'.
d_dfs_to_save = defaultdict(dict)

In [20]:
%%time
d_dfs_to_save['df_posts_agg_c1']['local'] = (
    path_this_model / f"df_posts_agg_c1_{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}"
)

save_pd_df_to_parquet_in_chunks(
    df_posts_agg_c1,
    d_dfs_to_save['df_posts_agg_c1']['local'],
    write_index=False
)

info(f"  Logging df to mlflow...")
mlflow.log_artifacts(d_dfs_to_save['df_posts_agg_c1']['local'], artifact_path='df_posts_agg_c1')

14:25:38 | INFO | "Converting pandas to dask..."
14:25:42 | INFO | "  35,070.4 MB <- Memory usage"
14:25:42 | INFO | "      64	<- target Dask partitions	  550.0 <- target MB partition size"
14:27:15 | INFO | "  Logging df to mlflow..."


CPU times: user 31min 9s, sys: 4min 25s, total: 35min 34s
Wall time: 9min 39s


# Aggregate to Subreddit Level

In [21]:
%%time

info(f"SUBREDDIT-LEVEL C1 - posts + comments + sub descriptions")
t_start_agg_subs_c1 = datetime.utcnow()

df_subs_agg_c1 = (
    df_posts_agg_c1
    .groupby(l_ix_sub_level, as_index=False)
    .mean()
    .sort_values(by=l_ix_sub_level)
)
r_, c_ = df_subs_agg_c1.shape
mlflow.log_metrics(
    {
        f"df_subs_agg_c1-rows": r_,
        f"df_subs_agg_c1-cols": c_,
    }
)
print(f"{r_:,.0f} rows, {c_:,.0f} cols")
del r_, c_

t_agg_subs_c1 = elapsed_time(start_time=t_start_agg_subs_c1, log_label='Total Agg fxn time', verbose=True)
mlflow.log_metric('time_fxn-df_subs_agg_c1',
                  t_agg_subs_c1 / timedelta(minutes=1)
                  )
info(f"  <- df_subs_agg_c1.shape (posts + comments + sub description)")
mlf.log_ram_stats(only_memory_used=True)

14:35:18 | INFO | "SUBREDDIT-LEVEL C1 - posts + comments + sub descriptions"
14:36:50 | INFO | "  0:01:32.196986 <- Total Agg fxn time time elapsed"
14:36:50 | INFO | "  <- df_subs_agg_c1.shape (posts + comments + sub description)"


81,973 rows, 514 cols


14:36:51 | INFO | "RAM stats:
{'memory_used_percent': '11.52%', 'memory_used': '166,480'}"


CPU times: user 1min 6s, sys: 26.2 s, total: 1min 32s
Wall time: 1min 33s


{'memory_used_percent': 0.11521418225128567, 'memory_used': 166480}

In [22]:
# Peek at the last 8 rows and first 10 columns of the subreddit-level aggregates.
df_subs_agg_c1.iloc[-8:, :10]

Unnamed: 0,subreddit_id,subreddit_name,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7
81965,t5_zyz1w,k_on_shuffle,-0.011377,0.023965,0.041733,-0.010157,0.035751,0.03872,-0.015874,-0.01362
81966,t5_zz27k,capcomhomearcade,-0.002962,-0.021336,0.015305,0.004804,-0.015484,-0.013002,0.009767,-0.017814
81967,t5_zz4jm,aithesomniumfiles,-0.017261,0.007027,0.010723,0.001267,0.000868,0.021436,0.004496,0.002664
81968,t5_zz9nd,freyanightingale,0.041456,0.020634,0.010827,0.029859,-0.0337,0.048856,0.024756,-0.024959
81969,t5_zzebd,mk3supra,-0.029058,-0.003269,0.001232,-0.010903,-0.053896,0.063239,0.009007,0.009073
81970,t5_zzgss,epididymitis,-0.037259,-0.001366,0.001502,-0.014607,-0.0636,0.052982,0.030545,-0.028854
81971,t5_zzjup,morgonaut,-0.014793,-0.02627,0.038431,0.008339,0.00557,-0.024845,0.001907,0.029336
81972,t5_zzszh,circumcisiongrief,-0.011436,0.027289,0.005682,0.006329,-0.01495,0.03629,0.007651,0.032805


### Save Subreddit level

This one we can save as a pandas df, no need to split it into multiple files

In [23]:
%%time
# Single-file parquet copy of the subreddit-level aggregates, written to the run folder.
# NOTE(review): the next cell saves the same frame again with
# save_pd_df_to_parquet_in_chunks -- confirm both copies are needed.
df_subs_agg_c1.to_parquet(
    path_this_model / f"df_subs_agg_c1-{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}.parquet"
)

CPU times: user 2.46 s, sys: 363 ms, total: 2.82 s
Wall time: 2.53 s


In [24]:
%%time
d_dfs_to_save['df_subs_agg_c1']['local'] = (
    path_this_model / f"df_subs_agg_c1{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}"
)

save_pd_df_to_parquet_in_chunks(
    df_subs_agg_c1,
    d_dfs_to_save['df_subs_agg_c1']['local'],
    write_index=False
)

info(f"  Logging df to mlflow...")
mlflow.log_artifacts(d_dfs_to_save['df_subs_agg_c1']['local'], artifact_path='df_subs_agg_c1')

14:36:54 | INFO | "Converting pandas to dask..."
14:36:54 | INFO | "   171.2 MB <- Memory usage"
14:36:54 | INFO | "       1	<- target Dask partitions	  350.0 <- target MB partition size"
14:36:57 | INFO | "  Logging df to mlflow..."


CPU times: user 2.82 s, sys: 428 ms, total: 3.25 s
Wall time: 5.22 s


In [25]:
# finish logging total time + end mlflow run
# Elapsed time is measured from `t_start_agg_embed`, set in the cell that started the MLflow run.
total_fxn_time = elapsed_time(start_time=t_start_agg_embed, log_label='Total Agg fxn time', verbose=True)
# Dividing by a one-minute timedelta expresses the elapsed time in minutes.
mlflow.log_metric('time_fxn-full_aggregation_fxn_minutes',
                  total_fxn_time / timedelta(minutes=1)
                  )
mlflow.end_run()

14:36:59 | INFO | "  2:54:43.728899 <- Total Agg fxn time time elapsed"


# Test run on data sample

In [26]:
# Ends the active MLflow run (if any) with status FAILED.
# NOTE(review): the main run was already ended in the previous cell, so this looks
# like a manual cleanup cell for aborted runs -- confirm intent.
mlflow.end_run("FAILED")

In [27]:
# `BREAK` is not defined, so this cell raises NameError.
# NOTE(review): this appears to be a deliberate stop so "Run All" does not continue
# into the test cells below -- confirm.
BREAK

NameError: name 'BREAK' is not defined

## Aggregate to Post Level

In [None]:
# # DON'T DO THIS -- iterating per sub + mask takes waaaay longer. 
# for s_id in tqdm(
#     df_v_pc_weighted['subreddit_id'].unique(),
#     ascii=True, mininterval=2
# ):
#     mask_sub_posts = df_v_pc_weighted['subreddit_id'] == s_id
    
# #     df_v_pc_weighted.loc[mask_sub_posts, l_embedding_cols] = np.add(
# #         df_v_subs_weighted[df_v_subs_weighted['subreddit_id'] == s_id][l_embedding_cols].to_numpy(),
# #         df_v_pc_weighted[mask_sub_posts][l_embedding_cols]
# #     ) 

In [None]:
l_df_c1_weights_test = list()

# Index the weighted subreddit vectors by ID once, so each loop iteration is an
# index lookup instead of a boolean-mask scan over every subreddit row.
# A subreddit with no vector raises KeyError at the lookup.
df_sub_vectors_test_ = df_v_subs_weighted.set_index('subreddit_id')[l_embedding_cols]

# Test run on the first 123,000 post rows only.
for s_id, df_ in tqdm(
    df_v_pc_weighted.head(123000).groupby('subreddit_id'),
    ascii=True, mininterval=5,
):
    # The subreddit's single-row vector broadcasts across all of its posts.
    df_.loc[:, l_embedding_cols] = np.add(
        df_sub_vectors_test_.loc[[s_id]].to_numpy(),
        df_[l_embedding_cols]
    )
    l_df_c1_weights_test.append(df_)

In [None]:
# Number of per-subreddit frames collected by the test loop (one per group).
len(l_df_c1_weights_test)

In [None]:
%%time
# Stack the per-subreddit frames into one frame with a fresh 0..n-1 index.
df_posts_agg_c1_test = pd.concat(l_df_c1_weights_test, ignore_index=True)
print(df_posts_agg_c1_test.shape)

### Save post-level

In [None]:
# for post-level, we want to save using dask because it'll save to multiple files
# Output folder name is timestamped to the minute (UTC).

save_pd_df_to_parquet_in_chunks(
    df_posts_agg_c1_test,
    path_this_model / f"df_posts_agg_c1_test_{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}",
    write_index=False
)

## Aggregate to Subreddit Level

In [None]:
# Show the subreddit-level index columns used as group keys in the next cell.
l_ix_sub_level

In [None]:

info("C1 - posts + comments + sub descriptions")
# Mean of the post-level vectors per subreddit. The embedding columns are
# selected explicitly so the string `post_id` column is never passed to
# `.mean()`; relying on pandas to silently drop non-numeric columns is
# deprecated and raises in newer pandas versions.
df_subs_agg_c1_test = (
    df_posts_agg_c1_test
    .groupby(l_ix_sub_level, as_index=False)[l_embedding_cols]
    .mean()
    .sort_values(by=l_ix_sub_level)
)
info(f"  {df_subs_agg_c1_test.shape} <- df_subs_agg_c1.shape (posts + comments + sub description)")

In [None]:
# Peek at the last 8 rows and first 10 columns of the test subreddit-level aggregates.
df_subs_agg_c1_test.iloc[-8:, :10]

### Save Subreddit level

This one we can save as a pandas df, no need to split it into multiple files

In [None]:
# Single-file parquet output in the run folder; file name is timestamped to the minute (UTC).
df_subs_agg_c1_test.to_parquet(
    path_this_model / f"df_subs_agg_c1_test-{datetime.utcnow().strftime('%Y-%m-%d_%H%M')}.parquet"
)

## Other tests

In [None]:
# `BREAK` is not defined, so this cell raises NameError.
# NOTE(review): this appears to be a deliberate stop so "Run All" does not run the
# ad-hoc checks below -- confirm.
BREAK

In [None]:
# expected it to be false: adding the subreddit vector must change the post vectors.
# Embedding columns are selected by name instead of hard-coded positions (3:515);
# rows are sliced first so only 1,000 rows are copied.
np.allclose(
    df_v_pc_weighted.iloc[:1000][l_embedding_cols],
    df_posts_agg_c1_test.iloc[:1000][l_embedding_cols],
)

In [None]:
# Last 3 rows, first 10 columns of the test post-level aggregates.
df_posts_agg_c1_test.iloc[-3:, :10]

In [None]:
# First 3 rows, first 10 columns of the raw (unweighted) post+comment embeddings.
df_v_pc.iloc[:3, :10]

In [None]:
# Same slice of the weighted copy, for side-by-side comparison with the raw values.
df_v_pc_weighted.iloc[:3, :10]

In [None]:
# First 3 rows, first 10 columns of the test post-level aggregates.
df_posts_agg_c1_test.iloc[:3, :10]

In [None]:
# Weighted subreddit-description row(s) for one example subreddit (first 10 columns).
df_v_subs_weighted[df_v_subs_weighted['subreddit_name'] == 'memesenespanol'].iloc[:, :10]

In [None]:
# Unweighted subreddit-description row(s) for the same example subreddit (first 10 columns).
df_v_subs[df_v_subs['subreddit_name'] == 'memesenespanol'].iloc[:, :10]