# Purpose

2021-10-21: If/when we need to get post-level aggregates it'll be better if we compute them once and save them to a df instead of having to calculate them each time.

We should always have the option to calculate on demand, but some stats will stay constant, like:
- primary language
- post counts used for modeling

So let's calculate and cache them to speed up other jobs.



# Imports & Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly
import seaborn as sns

import mlflow
import hydra

import subclu
from subclu.eda.aggregates import compare_raw_v_weighted_language
from subclu.utils import set_working_directory, get_project_subfolder
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric, reorder_array,
)
from subclu.utils.mlflow_logger import MlflowLogger
from subclu.utils.hydra_config_loader import LoadHydraConfig
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl, 
    get_color_dict, base_colors_for_manual_labels,
    check_colors_used,
)
from subclu.data.data_loaders import LoadPosts, LoadSubreddits, create_sub_level_aggregates


# ===
# imports specific to this notebook
from collections import Counter
# import umap
# import openTSNE
# from openTSNE import TSNE

# import hdbscan

import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize  # if we normalize the data, euclidean distance is approx of cosine

from sklearn.cluster import KMeans, DBSCAN, OPTICS, AgglomerativeClustering

print_lib_versions([hydra, np, pd, plotly, sklearn, sns, subclu])

python		v 3.7.10
===
hydra		v: 1.1.0
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3
sklearn		v: 0.24.1
seaborn		v: 0.11.1
subclu		v: 0.4.0


In [3]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
plt.style.use('default')

setup_logging()
notebook_display_config()

# Set sqlite database as MLflow URI

In [4]:
# use new class to initialize mlflow
mlf = MlflowLogger(tracking_uri='sqlite')
mlflow.get_tracking_uri()

'sqlite:////home/jupyter/subreddit_clustering_i18n/mlflow_sync/djb-100-2021-04-28-djb-eda-german-subs/mlruns.db'

## Get list of experiments with new function

In [5]:
mlf.list_experiment_meta(output_format='pandas')

Unnamed: 0,experiment_id,name,artifact_location,lifecycle_stage
0,0,Default,./mlruns/0,active
1,1,fse_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/1,active
2,2,fse_vectorize_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/2,active
3,3,subreddit_description_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/3,active
4,4,fse_vectorize_v1.1,gs://i18n-subreddit-clustering/mlflow/mlruns/4,active
5,5,use_multilingual_v0.1_test,gs://i18n-subreddit-clustering/mlflow/mlruns/5,active
6,6,use_multilingual_v1,gs://i18n-subreddit-clustering/mlflow/mlruns/6,active
7,7,use_multilingual_v1_aggregates_test,gs://i18n-subreddit-clustering/mlflow/mlruns/7,active
8,8,use_multilingual_v1_aggregates,gs://i18n-subreddit-clustering/mlflow/mlruns/8,active
9,9,v0.3.2_use_multi_inference_test,gs://i18n-subreddit-clustering/mlflow/mlruns/9,active


## Get experiment IDs to use for clustering

There are two runs that completed and they both have the same parameters, so we should be able to use either one. For now, let's select:<br>
`0591fdae9b7d4da7ae3839767b8aab66`

In [6]:
%%time

df_mlf = mlf.search_all_runs(experiment_ids=[16])
df_mlf.shape

CPU times: user 57.4 ms, sys: 13.6 ms, total: 71 ms
Wall time: 1.04 s


(13, 86)

In [7]:
mask_finished = df_mlf['status'] == 'FINISHED'
mask_df_similarity_complete = ~df_mlf['metrics.df_sub_level_agg_a_post_only_similarity-rows'].isnull()

df_mlf_clustering_candidates = df_mlf[mask_finished & mask_df_similarity_complete]
df_mlf_clustering_candidates.shape

(2, 86)

In [8]:
cols_with_multiple_vals = df_mlf_clustering_candidates.columns[df_mlf_clustering_candidates.nunique(dropna=False) > 1]

df_mlf_clustering_candidates[cols_with_multiple_vals]

Unnamed: 0,run_id,artifact_uri,start_time,end_time,metrics.memory_free,metrics.memory_used,metrics.vectorizing_time_minutes,metrics.memory_used_percent,params.memory_free,params.f_log_file,params.memory_used,params.memory_used_percent,params.run_name,tags.mlflow.runName
0,cbb12818e82345dda96928bfdab8b16b,gs://i18n-subreddit-clustering/mlflow/mlruns/16/cbb12818e82345dda96928bfdab8b16b/artifacts,2021-10-12 10:46:05.235000+00:00,2021-10-12 16:41:33.492000+00:00,3465918.0,702999.0,355.468028,0.181436,3465918,logs/AggregateEmbeddings/2021-10-12_10-46-05_agg_full_lc_false_pd-2021-10-12_104604.log,278514,0.0718813699564913,agg_full_lc_false_pd-2021-10-12_104604,agg_full_lc_false_pd-2021-10-12_104604
1,0591fdae9b7d4da7ae3839767b8aab66,gs://i18n-subreddit-clustering/mlflow/mlruns/16/0591fdae9b7d4da7ae3839767b8aab66/artifacts,2021-10-12 10:27:33.324000+00:00,2021-10-12 16:40:41.501000+00:00,3681161.0,703208.0,373.134208,0.18149,3681161,logs/AggregateEmbeddings/2021-10-12_10-27-33_agg_full_lc_false-2021-10-12_102732.log,64759,0.0167135786244584,agg_full_lc_false-2021-10-12_102732,agg_full_lc_false-2021-10-12_102732


# Inspect config for clustering job

This config should include:
- metadata needed to filter  (e.g., exclude subreddits with fewer than 3 posts)
- data to load for clustering
- parameters for clustering algo
- hydra overrides to run jobs in parallel

In [9]:
test_experiment = 'v0.4.0_use_multi_clustering_test'

cfg_cluster_test_v040 = LoadHydraConfig(
    config_name='clustering_v0.4.0_base',
    config_path="../config",
    overrides=[
        f"mlflow_experiment_name={test_experiment}"
#         f"data_text_and_metadata=top_subreddits_2021_07_16",
#         f"data_embeddings_to_cluster=top_subs-2021_07_16-use_multi_lower_case_false_00",
    ],
)

print([k for k in cfg_cluster_test_v040.config_dict.keys()])

['data_text_and_metadata', 'data_embeddings_to_cluster', 'clustering_algo', 'embeddings_to_cluster', 'mlflow_tracking_uri', 'mlflow_experiment_name', 'pipeline']


In [10]:
# data with embeddings
# cfg_cluster_test_v040.config_dict['data_embeddings_to_cluster']

In [11]:
# clustering algo
cfg_cluster_test_v040.config_dict['clustering_algo']

{'model_name': 'AgglomerativeClustering',
 'model_kwargs': {'n_clusters': 100,
  'affinity': 'euclidean',
  'linkage': 'ward',
  'connectivity': False}}

In [12]:
# data with raw text & metadata
d_config_text_and_meta = cfg_cluster_test_v040.config_dict['data_text_and_metadata']
d_config_text_and_meta

{'dataset_name': 'v0.4.0 inputs - Top Subreddits (no Geo) + Geo-relevant subs, comments: TBD',
 'bucket_name': 'i18n-subreddit-clustering',
 'folder_subreddits_text_and_meta': 'subreddits/top/2021-09-24',
 'folder_posts_text_and_meta': 'posts/top/2021-09-27',
 'folder_comments_text_and_meta': 'comments/top/2021-10-04',
 'comments_vectorized_mlflow_uuids': ['5f10cd75334142168a6ebb787e477c1f',
  '2fcfefc3d5af43328168d3478b4fdeb6'],
 'comments_vectorized_gcs': ['gs://i18n-subreddit-clustering/mlflow/mlruns/14/5f10cd75334142168a6ebb787e477c1f/artifacts/df_vect_comments/*.parquet',
  'gs://i18n-subreddit-clustering/mlflow/mlruns/14/2fcfefc3d5af43328168d3478b4fdeb6/artifacts/df_vect_comments/*.parquet'],
 'comments_vectorized_mlflow_uuids_lowercase': None,
 'posts_vectorized_mlflow_uuids': ['8eef951842a34a6e81d176b15ae74afd'],
 'posts_vectorized_mlflow_uuids_lowercase': ['537514ab3c724b10903000501802de0e'],
 'subreddit_meta_vectorized_mlflow_uuids': ['8eef951842a34a6e81d176b15ae74afd'],
 'su

# Load post & sub data 

Ok, the takeaways seem to be that it's faster to use `dask` for both:
- load data
- compute aggregates at post-level 
- the gain from loading as daks was mostly because in pandas I was force-checking unique values
    - Once we remove the uniqueness check, the time between pandas & dask is almost the same

Otherwise, loading the data with `dask` only saves a few seconds (1 to 3).


So when we call `df_posts` inside the LoadSubreddits() class:
- change to computing everything in dask up until we need to merge the data
- turn-off check for unique post IDs

## Baselines

How long does the basline actually take to load & compute?

if it's less than ~2 minutes maybe it's not worth all the additional overhead

**Subreddits only**
```bash
# Subreddits only, pandas reader (old/deprecated)
CPU times: user 192 ms, sys: 181 ms, total: 372 ms
Wall time: 1.2 s
(19262, 37)


# Subreddits only, dask reader
CPU times: user 193 ms, sys: 94.9 ms, total: 288 ms
Wall time: 417 ms
(19262, 37)
```

**POSTS only**
```bash

```


**Subreddits with implicit POSTS**
```bash
# Count only, when loading as dask then converting to pandas
# ** THIS IS THE NEW FORMAT TO USE WHEN RUNNING CLUSTERING EMBEDDINGS AT SUBREDDIT LEVEL **
CPU times: user 11.3 s, sys: 1.98 s, total: 13.3 s
Wall time: 11.4 s

# Counts only, when loading as pandas (and with unique check)
CPU times: user 19.8 s, sys: 2.53 s, total: 22.3 s
Wall time: 22.5 s
(19262, 38)


# All aggs, loading as pandas (old/deprecated) (with unique check)
CPU times: user 57.4 s, sys: 7.82 s, total: 1min 5s
Wall time: 1min 1s
(19262, 94)


# All aggs, loading as dask w/o unique check


```


### Subs only (no post aggregates) - `1 second`
Only load subreddit data, no dependency on posts. 
But this data isn't good for filtering b/c it doesn't include how many posts were used in the model.

In [78]:
%%time
# pandas reader
col_manual_labels = 'manual_topic_and_rating'

df_subs = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    col_new_manual_topic=col_manual_labels,
    read_fxn='pandas',
    df_format='pandas',
    unique_check=True,
).read_and_apply_transformations()

df_subs.shape

21:34:51 | INFO | "Reading raw data..."
21:34:51 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/subreddits/top/2021-09-24"
100%|#################################| 1/1 [00:00<00:00, 7108.99it/s]
21:34:52 | INFO | "  Applying transformations..."


CPU times: user 194 ms, sys: 108 ms, total: 302 ms
Wall time: 453 ms


(19262, 37)

In [81]:
%%time
# pandas reader
col_manual_labels = 'manual_topic_and_rating'

df_subs = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    col_new_manual_topic=col_manual_labels,
    read_fxn='pandas',
    df_format='pandas',
    unique_check=False,
).read_and_apply_transformations()

df_subs.shape

21:36:04 | INFO | "Reading raw data..."
21:36:04 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/subreddits/top/2021-09-24"
100%|#################################| 1/1 [00:00<00:00, 7025.63it/s]
21:36:05 | INFO | "  Applying transformations..."


CPU times: user 169 ms, sys: 114 ms, total: 283 ms
Wall time: 415 ms


(19262, 37)

In [79]:
%%time
# pandas reader
col_manual_labels = 'manual_topic_and_rating'

df_subs = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    col_new_manual_topic=col_manual_labels,
    read_fxn='dask',
    df_format='pandas',
    unique_check=True,
    verbose=True,
).read_and_apply_transformations()

df_subs.shape

21:35:12 | INFO | "Reading raw data..."
21:35:12 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/subreddits/top/2021-09-24"
100%|#################################| 1/1 [00:00<00:00, 7449.92it/s]
21:35:12 | INFO | "  Read fxn: <function read_parquet at 0x7fd5c5c29b00>"
21:35:12 | INFO | "  df format: pandas"
21:35:12 | INFO | "  Reading with dask and converting to pandas..."
21:35:13 | INFO | "  Checking ID uniqueness..."
21:35:13 | INFO | "  Applying transformations..."


CPU times: user 219 ms, sys: 53.1 ms, total: 272 ms
Wall time: 402 ms


(19262, 37)

In [80]:
%%time
# pandas reader
col_manual_labels = 'manual_topic_and_rating'

df_subs = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    col_new_manual_topic=col_manual_labels,
    read_fxn='dask',
    df_format='pandas',
    unique_check=False,
    verbose=True,
).read_and_apply_transformations()

df_subs.shape

21:35:53 | INFO | "Reading raw data..."
21:35:53 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/subreddits/top/2021-09-24"
100%|#################################| 1/1 [00:00<00:00, 6393.76it/s]
21:35:54 | INFO | "  Read fxn: <function read_parquet at 0x7fd5c5c29b00>"
21:35:54 | INFO | "  df format: pandas"
21:35:54 | INFO | "  Reading with dask and converting to pandas..."
21:35:54 | INFO | "  Applying transformations..."


CPU times: user 214 ms, sys: 81.6 ms, total: 296 ms
Wall time: 460 ms


(19262, 37)

### Subs + post aggregates (only counts) - down to `11 seconds` from `22 seconds` (mostly bc/ of unique check & pandas loading times)
This one calculates:
- post counts (how many posts are in the training data)

It includes only the bare minimum number of columns to get post-counts.  It took a bunch of refactoring the aggregate function, but it should work and it's relatively fast.

#### ** THIS IS THE NEW FORMAT TO USE WHEN RUNNING CLUSTERING EMBEDDINGS AT SUBREDDIT LEVEL **

In [98]:
%%time
col_manual_labels = 'manual_topic_and_rating'

df_subs_count_dd = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    col_new_manual_topic=col_manual_labels,
).read_apply_transformations_and_merge_post_aggs(
    cols_post='post_count_only_',
    df_format='dask',
    read_fxn='dask',
    unique_check=False,
)

df_subs_count_dd.shape

21:56:47 | INFO | "Loading df_posts from: posts/top/2021-09-27"
21:56:47 | INFO | "Reading raw data..."
21:56:47 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 43273.29it/s]
21:56:47 | INFO | "  Applying transformations..."
21:56:47 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"
21:56:51 | INFO | "  reading sub-level data & merging with aggregates..."
21:56:51 | INFO | "Reading raw data..."
21:56:51 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/subreddits/top/2021-09-24"
100%|#################################| 1/1 [00:00<00:00, 6887.20it/s]
21:56:51 | INFO | "  Applying transformations..."


CPU times: user 11.3 s, sys: 2.12 s, total: 13.4 s
Wall time: 11.4 s


(19262, 38)

#### Other slow calls

In [97]:
%%time

df_subs_count = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    col_new_manual_topic=col_manual_labels,
).read_apply_transformations_and_merge_post_aggs(
    cols_post='post_count_only_',
    df_format='pandas',
    read_fxn='pandas',
    unique_check=False,
)

df_subs_count.shape

21:56:22 | INFO | "Loading df_posts from: posts/top/2021-09-27"
21:56:22 | INFO | "Reading raw data..."
21:56:22 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 48437.21it/s]
21:56:25 | INFO | "  Applying transformations..."
21:56:31 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"
21:56:31 | INFO | "Reading raw data..."
21:56:31 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 50220.05it/s]
21:56:34 | INFO | "  Applying transformations..."
21:56:39 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"
21:56:39 | INFO | "  reading sub-level data & merging with aggregates..."
21:56:39 | INFO | "Reading raw data..."
21:56:39 | INFO | "  Local folder to download artifact(s):
 

CPU times: user 24.5 s, sys: 4.13 s, total: 28.7 s
Wall time: 24.5 s


(19262, 38)

In [96]:
%%time

df_subs_count = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    col_new_manual_topic=col_manual_labels,
).read_apply_transformations_and_merge_post_aggs(
    cols_post='post_count_only_',
    df_format='pandas',
    read_fxn='pandas',
    unique_check=True,
)

df_subs_count.shape

21:55:27 | INFO | "Loading df_posts from: posts/top/2021-09-27"
21:55:27 | INFO | "Reading raw data..."
21:55:27 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 51900.19it/s]
21:55:35 | INFO | "  Applying transformations..."
21:55:41 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"
21:55:41 | INFO | "Reading raw data..."
21:55:41 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 48375.14it/s]
21:55:50 | INFO | "  Applying transformations..."
21:55:56 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"
21:55:56 | INFO | "  reading sub-level data & merging with aggregates..."
21:55:56 | INFO | "Reading raw data..."
21:55:56 | INFO | "  Local folder to download artifact(s):
 

CPU times: user 37.3 s, sys: 4.84 s, total: 42.1 s
Wall time: 38 s


(19262, 38)

### Subs + post aggregates (old/default) - `60 seconds`
This one calculates:
- language info (primary, secondary, top N language percent)
- post type (primary & percent per post type)
- post counts (how many posts are in the training data)

Each of those steps take a long longer when we have over 8 million posts.

fastest setting is:
```bash
# Old timing (pandas and with time check)
CPU times: user 57.4 s, sys: 7.82 s, total: 1min 5s
Wall time: 1min 1s
(19262, 94)


# Fastest new setting
.read_apply_transformations_and_merge_post_aggs(
    read_fxn='pandas',
    df_format='pandas',
    unique_check=False,
)
CPU times: user 48.9 s, sys: 6.74 s, total: 55.7 s
Wall time: 51.3 s
(19262, 94)

# pandas and dask are slower in this case
.read_apply_transformations_and_merge_post_aggs(
    read_fxn='dask',
    df_format='pandas',
    unique_check=False,
)
CPU times: user 50.9 s, sys: 6.9 s, total: 57.9 s
Wall time: 53.5 s
(19262, 94)
```

In [60]:
%%time
col_manual_labels = 'manual_topic_and_rating'

df_subs_agg = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    read_fxn='pandas',
    col_new_manual_topic=col_manual_labels,
).read_apply_transformations_and_merge_post_aggs()

df_subs_agg.shape

20:55:00 | INFO | "Loading df_posts from: posts/top/2021-09-27"
20:55:00 | INFO | "Reading raw data..."
20:55:00 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 48624.39it/s]
20:55:10 | INFO | "  Applying transformations..."
20:55:46 | INFO | "  reading sub-level data & merging with aggregates..."
20:55:46 | INFO | "Reading raw data..."
20:55:46 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/subreddits/top/2021-09-24"
100%|#################################| 1/1 [00:00<00:00, 6017.65it/s]
20:55:47 | INFO | "  Applying transformations..."


CPU times: user 57.4 s, sys: 7.82 s, total: 1min 5s
Wall time: 1min 1s


(19262, 94)

In [181]:
%%time

df_subs_agg = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    read_fxn='dask',
    col_new_manual_topic=col_manual_labels,
).read_apply_transformations_and_merge_post_aggs(
    read_fxn='pandas',
    df_format='pandas',
    unique_check=False,
)

df_subs_agg.shape

03:04:10 | INFO | "Loading df_posts from: posts/top/2021-09-27"
03:04:10 | INFO | "Reading raw data..."
03:04:10 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 57281.85it/s]
03:04:14 | INFO | "  Applying transformations..."
03:04:47 | INFO | "  reading sub-level data & merging with aggregates..."
03:04:47 | INFO | "Reading raw data..."
03:04:47 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/subreddits/top/2021-09-24"
100%|#################################| 1/1 [00:00<00:00, 8665.92it/s]
03:04:47 | INFO | "  Applying transformations..."


CPU times: user 48.9 s, sys: 6.74 s, total: 55.7 s
Wall time: 51.3 s


(19262, 94)

In [182]:
%%time

df_subs_agg = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    read_fxn='pandas',
    col_new_manual_topic=col_manual_labels,
).read_apply_transformations_and_merge_post_aggs(
    read_fxn='dask',
    df_format='pandas',
    unique_check=False,
)

df_subs_agg.shape

03:07:45 | INFO | "Loading df_posts from: posts/top/2021-09-27"
03:07:45 | INFO | "Reading raw data..."
03:07:45 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 49409.34it/s]
03:07:50 | INFO | "  Applying transformations..."
03:08:23 | INFO | "  reading sub-level data & merging with aggregates..."
03:08:23 | INFO | "Reading raw data..."
03:08:23 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/subreddits/top/2021-09-24"
100%|#################################| 1/1 [00:00<00:00, 7244.05it/s]
03:08:24 | INFO | "  Applying transformations..."


CPU times: user 50.9 s, sys: 6.9 s, total: 57.9 s
Wall time: 53.5 s


(19262, 94)

In [183]:
%%time

df_subs_agg = LoadSubreddits(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_subreddits_text_and_meta'],
    folder_posts=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=None,
    read_fxn='pandas',
    col_new_manual_topic=col_manual_labels,
).read_apply_transformations_and_merge_post_aggs(
    read_fxn='dask',
    df_format='dask',
    unique_check=False,
)

df_subs_agg.shape

03:08:39 | INFO | "Loading df_posts from: posts/top/2021-09-27"
03:08:39 | INFO | "Reading raw data..."
03:08:39 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 46873.43it/s]
03:08:39 | INFO | "  Applying transformations..."
03:09:17 | INFO | "  reading sub-level data & merging with aggregates..."
03:09:17 | INFO | "Reading raw data..."
03:09:17 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/subreddits/top/2021-09-24"
100%|#################################| 1/1 [00:00<00:00, 7358.43it/s]
03:09:17 | INFO | "  Applying transformations..."


CPU times: user 50.6 s, sys: 7.1 s, total: 57.7 s
Wall time: 53.3 s


(19262, 94)

In [16]:
# df_subs.head()

In [17]:
# df_subs['posts_for_modeling_count'].describe()

## Read posts - with pandas v. dask

#### Pandas

In [83]:
%%time

df_posts_pd = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    col_new_manual_topic=col_manual_labels,
    verbose=True,
    df_format='pandas',
    read_fxn='pandas',
    unique_check=True,
).read_and_apply_transformations()

df_posts_pd.shape

21:37:18 | INFO | "Reading raw data..."
21:37:18 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 53645.76it/s]
21:37:18 | INFO | "  Read fxn: <function read_parquet at 0x7fd5cec41d40>"
21:37:18 | INFO | "  df format: pandas"
21:37:18 | INFO | "  Reading without .compute(): <function read_parquet at 0x7fd5cec41d40>"
21:37:21 | INFO | "  Checking ID uniqueness..."
21:37:26 | INFO | "  Applying transformations..."
21:37:32 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"


CPU times: user 13.9 s, sys: 2.04 s, total: 15.9 s
Wall time: 13.9 s


(8439672, 3)

In [84]:
%%time

df_posts_pd = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    col_new_manual_topic=col_manual_labels,
    verbose=True,
    df_format='pandas',
    read_fxn='pandas',
    unique_check=False,
).read_and_apply_transformations()

df_posts_pd.shape

21:37:48 | INFO | "Reading raw data..."
21:37:48 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 56425.61it/s]
21:37:48 | INFO | "  Read fxn: <function read_parquet at 0x7fd5cec41d40>"
21:37:48 | INFO | "  df format: pandas"
21:37:48 | INFO | "  Reading without .compute(): <function read_parquet at 0x7fd5cec41d40>"
21:37:51 | INFO | "  Applying transformations..."
21:37:56 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"


CPU times: user 8.67 s, sys: 2.21 s, total: 10.9 s
Wall time: 8.66 s


(8439672, 3)

#### Read as dask & convert to pandas

In [85]:
%%time

df_posts_dd_to_pd_internal = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    col_new_manual_topic=col_manual_labels,
    verbose=True,
    read_fxn='dask',
    df_format='pandas',
    unique_check=True,
).read_and_apply_transformations()

df_posts_dd_to_pd_internal.shape

21:39:27 | INFO | "Reading raw data..."
21:39:27 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 56201.59it/s]
21:39:27 | INFO | "  Read fxn: <function read_parquet at 0x7fd5c5c29b00>"
21:39:27 | INFO | "  df format: pandas"
21:39:27 | INFO | "  Reading with dask and converting to pandas..."
21:39:31 | INFO | "  Checking ID uniqueness..."
21:39:36 | INFO | "  Applying transformations..."
21:39:42 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"


CPU times: user 14.4 s, sys: 2.38 s, total: 16.7 s
Wall time: 14.8 s


(8439672, 3)

In [86]:
%%time

df_posts_dd_to_pd_internal = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    col_new_manual_topic=col_manual_labels,
    verbose=True,
    read_fxn='dask',
    df_format='pandas',
    unique_check=False,
).read_and_apply_transformations()

df_posts_dd_to_pd_internal.shape

21:39:42 | INFO | "Reading raw data..."
21:39:42 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 50376.43it/s]
21:39:42 | INFO | "  Read fxn: <function read_parquet at 0x7fd5c5c29b00>"
21:39:42 | INFO | "  df format: pandas"
21:39:42 | INFO | "  Reading with dask and converting to pandas..."
21:39:45 | INFO | "  Applying transformations..."
21:39:51 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"


CPU times: user 9.52 s, sys: 2.2 s, total: 11.7 s
Wall time: 9.64 s


(8439672, 3)

In [72]:
%%time

df_posts_dd_to_pd = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    col_new_manual_topic=col_manual_labels,
    verbose=True,
    df_format='dask',
    read_fxn='dask',
    unique_check=False,
).read_and_apply_transformations().compute()

df_posts_dd_to_pd.shape

21:17:55 | INFO | "Reading raw data..."
21:17:55 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 56062.48it/s]
21:17:55 | INFO | "  Read fxn: <function read_parquet at 0x7fd5c5c29b00>"
21:17:55 | INFO | "  df format: dask"
21:17:55 | INFO | "  Applying transformations..."
21:17:55 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"


CPU times: user 4.28 s, sys: 1.58 s, total: 5.86 s
Wall time: 3.9 s


(8439672, 3)

# Can we speed up loading if we read with `dask` and then conver to `pandas`?

Given that there are `27` files for post-level data (8 million posts), is it 
- faster to load them with `dask` (which is supposed to load them in parallel)?
- or does dask add too much overhead and it's faster to load them straight into pandas?

### **Finding:** 
Loading as dask can shave off 10 seconds of loading time! (from 13 seconds down to 3)
```bash
# pandas
13.7 s ± 109 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

# read dask, then convert to pandas with .compute()
3.58 s ± 37.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

```

### What to do:
Change behavior: always use dask to load raw data and only convert to pandas (with `.compute()`) if `df_format='pandas'`

## Load straight to pandas

only with columns needed for post counts

In [19]:
col_manual_labels = 'manual_topic_and_rating'

cols_post_count = [
    'subreddit_name',
    'subreddit_id',
    'post_id',
]

In [25]:
%%timeit

df_posts = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    df_format='pandas',
    col_new_manual_topic=col_manual_labels,
).read_and_apply_transformations()

# df_posts.shape

20:07:34 | INFO | "Reading raw data..."
20:07:34 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 50108.94it/s]
20:07:42 | INFO | "  Applying transformations..."
20:07:48 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"
20:07:48 | INFO | "Reading raw data..."
20:07:48 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 33644.15it/s]
20:07:57 | INFO | "  Applying transformations..."
20:08:02 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"
20:08:03 | INFO | "Reading raw data..."
20:08:03 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 46091.2

13.7 s ± 109 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [87]:
%%time

df_posts_pd = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    col_new_manual_topic=col_manual_labels,
    read_fxn='pandas',
    df_format='pandas',
    unique_check=True,
).read_and_apply_transformations()

df_posts_pd.shape

21:40:57 | INFO | "Reading raw data..."
21:40:57 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 33239.27it/s]
21:41:05 | INFO | "  Applying transformations..."
21:41:11 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"


CPU times: user 13.8 s, sys: 2.03 s, total: 15.8 s
Wall time: 13.6 s


(8439672, 3)

In [88]:
%%time

df_posts_pd = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    col_new_manual_topic=col_manual_labels,
    read_fxn='pandas',
    df_format='pandas',
    unique_check=False,
).read_and_apply_transformations()

df_posts_pd.shape

21:41:11 | INFO | "Reading raw data..."
21:41:11 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 50443.75it/s]
21:41:14 | INFO | "  Applying transformations..."
21:41:19 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"


CPU times: user 8.49 s, sys: 2.28 s, total: 10.8 s
Wall time: 8.67 s


(8439672, 3)

## Load as parquet and covert to pandas INTERNALLY

In [92]:
%%time

df_posts_dd_to_pd_internal = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    col_new_manual_topic=col_manual_labels,
    read_fxn='dask',
    df_format='pandas',
    unique_check=False,
    verbose=True,
).read_and_apply_transformations()

df_posts_dd_to_pd_internal.shape

21:44:09 | INFO | "Reading raw data..."
21:44:09 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 56369.44it/s]
21:44:10 | INFO | "  Read fxn: <function read_parquet at 0x7fd5c5c29b00>"
21:44:10 | INFO | "  df format: pandas"
21:44:10 | INFO | "  Reading with dask and converting to pandas..."
21:44:13 | INFO | "  Applying transformations..."
21:44:19 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"


CPU times: user 9.76 s, sys: 2.29 s, total: 12.1 s
Wall time: 9.95 s


(8439672, 3)

## Load as parquet AND convert to pandas EXTERNALLY

only with columns needed for post counts

In [24]:
%%timeit

df_posts = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    df_format='dask',
    col_new_manual_topic=col_manual_labels,
).read_and_apply_transformations().compute()

# df_posts.shape

20:07:04 | INFO | "Reading raw data..."
20:07:04 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 44445.14it/s]
20:07:04 | INFO | "  Applying transformations..."
20:07:04 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"
20:07:07 | INFO | "Reading raw data..."
20:07:07 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 49647.61it/s]
20:07:08 | INFO | "  Applying transformations..."
20:07:08 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"
20:07:11 | INFO | "Reading raw data..."
20:07:11 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 36554.6

3.58 s ± 37.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [90]:
%%time

df_posts_dd_to_pd = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    col_new_manual_topic=col_manual_labels,
    read_fxn='dask',
    df_format='dask',
).read_and_apply_transformations().compute()

df_posts_dd_to_pd.shape

21:42:27 | INFO | "Reading raw data..."
21:42:27 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 46545.91it/s]
21:42:28 | INFO | "  Applying transformations..."
21:42:28 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"


CPU times: user 4.1 s, sys: 1.59 s, total: 5.68 s
Wall time: 3.67 s


(8439672, 3)

## CAVEAT - DATAFRAMES not equal if we include index in check

Because dask loads in parallel, there's no guarantee that output df will have the same `index` (if index is not part of dataframe!!).

If we ignore the index, we might expect the dfs to be the same.

But to be sure, we would need to sort (which can be expensive).

In [35]:
type(df_posts_pd)

pandas.core.frame.DataFrame

In [36]:
type(df_posts_dd_to_pd)

pandas.core.frame.DataFrame

In [46]:
%%time

df_posts_pd.equals(df_posts_dd_to_pd)

CPU times: user 14.5 ms, sys: 0 ns, total: 14.5 ms
Wall time: 13.5 ms


False

In [47]:
%%time

df_posts_dd_to_pd.equals(df_posts_pd)

CPU times: user 14.2 ms, sys: 283 µs, total: 14.5 ms
Wall time: 13.5 ms


False

#### `.equals()` also checks the index! 
so we need to drop it before checking equality

In [45]:
%%time

(
    df_posts_pd.reset_index(drop=True)
    .equals(
        df_posts_dd_to_pd.reset_index(drop=True)
    )
)

CPU times: user 1.94 s, sys: 91.9 ms, total: 2.03 s
Wall time: 2.01 s


True

In [48]:
%%time
cols_to_sort = ['subreddit_id', 'post_id']

df_posts_pd_sorted = df_posts_pd.sort_values(by=cols_to_sort)

df_posts_dd_to_pd_sorted = df_posts_dd_to_pd.sort_values(by=cols_to_sort)

CPU times: user 1min 17s, sys: 1.47 s, total: 1min 18s
Wall time: 1min 18s


In [49]:
df_posts_pd_sorted.equals(df_posts_dd_to_pd_sorted)

False

In [50]:
%%time

(
    df_posts_pd_sorted.reset_index(drop=True)
    .equals(
        df_posts_dd_to_pd_sorted.reset_index(drop=True)
    )
)

CPU times: user 2.26 s, sys: 169 ms, total: 2.43 s
Wall time: 2.42 s


True

# How to create a new dask column

In [126]:
import dask.array as da


In [147]:
%%time

ddf_posts = LoadPosts(
    bucket_name=d_config_text_and_meta['bucket_name'],
    folder_path=d_config_text_and_meta['folder_posts_text_and_meta'],
    columns=cols_post_count,
    df_format='dask',
    read_fxn='dask',
    verbose=True,
    unique_check=False,
    col_new_manual_topic=col_manual_labels,
).read_and_apply_transformations()

ddf_posts.shape

01:50:01 | INFO | "Reading raw data..."
01:50:01 | INFO | "  Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 54392.99it/s]
01:50:01 | INFO | "  Read fxn: <function read_parquet at 0x7fd5c5c29b00>"
01:50:01 | INFO | "  df format: dask"
01:50:01 | INFO | "  Reading without .compute(): <function read_parquet at 0x7fd5c5c29b00>"
01:50:01 | INFO | "  Applying transformations..."
01:50:01 | ERROR | "Error creating manual topic... 'combined_topic_and_rating'"


CPU times: user 50.8 ms, sys: 0 ns, total: 50.8 ms
Wall time: 253 ms


(Delayed('int-c94c27a3-06ad-456e-bc51-efd0094db53d'), 3)

In [148]:
ddf_posts.shape

(Delayed('int-8eacdf8b-d193-4148-9ab6-dfc706c813f4'), 3)

In [169]:
# ddf_posts.index

In [150]:
%%time

ddf_posts.index.compute()

CPU times: user 3.98 s, sys: 1.49 s, total: 5.47 s
Wall time: 3.27 s


Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,      9,
            ...
            361944, 361945, 361946, 361947, 361948, 361949, 361950, 361951, 361952, 361953], dtype='int64', length=8439672)

In [164]:
%%time

ddf_posts.index.compute().nunique()

CPU times: user 4.14 s, sys: 1.46 s, total: 5.6 s
Wall time: 3.45 s


409991

In [165]:
%%time

len(ddf_posts)

CPU times: user 831 ms, sys: 260 ms, total: 1.09 s
Wall time: 408 ms


8439672

In [168]:
%%time

ddf_posts = ddf_posts.reset_index(drop=True)

CPU times: user 3.24 ms, sys: 0 ns, total: 3.24 ms
Wall time: 3.17 ms


#### Reset index
lol... reset_index doesn't do what we'd expect it to do... freaking dask...

https://github.com/dask/dask/issues/3788
> ddf has three partitions and thus the reset_index builds 3 [0,1,3] indexes for each partition.
> 
> I'm going to close for now as this is quite a bit old but feel free to reopen if you are still having trouble




In [170]:
%%time

ddf_posts.index.compute().nunique()

CPU times: user 4.61 s, sys: 1.61 s, total: 6.22 s
Wall time: 3.9 s


409991

In [171]:
%%time

len(ddf_posts)

CPU times: user 4.25 s, sys: 1.59 s, total: 5.84 s
Wall time: 3.69 s


8439672

Reset index with a range

In [174]:
%%time
ddf_posts.index = da.from_array(len(ddf_posts))

AttributeError: 'Array' object has no attribute 'divisions'

In [151]:
ddf_posts.tail()

Unnamed: 0,subreddit_name,subreddit_id,post_id
361949,memesenespanol,t5_1009a3,t3_p0b1v4
361950,memesenespanol,t5_1009a3,t3_pgurz7
361951,memesenespanol,t5_1009a3,t3_psau1m
361952,memesenespanol,t5_1009a3,t3_peuwnu
361953,memesenespanol,t5_1009a3,t3_p74s9j


In [152]:
ddf_posts.where(ddf_posts['subreddit_name'] == 'memesenespanol').compute()

Unnamed: 0,subreddit_name,subreddit_id,post_id
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
361949,memesenespanol,t5_1009a3,t3_p0b1v4
361950,memesenespanol,t5_1009a3,t3_pgurz7
361951,memesenespanol,t5_1009a3,t3_psau1m
361952,memesenespanol,t5_1009a3,t3_peuwnu


In [153]:
%%time

da.where(ddf_posts['subreddit_name'] == 'memesenespanol', 'yay', 'nay')

CPU times: user 2.51 s, sys: 568 ms, total: 3.08 s
Wall time: 1.7 s


Unnamed: 0,Array,Chunk
Bytes,12 B,12.0 B
Shape,(),()
Count,83 Tasks,1 Chunks
Type,numpy.ndarray,
Array Chunk Bytes 12 B 12.0 B Shape () () Count 83 Tasks 1 Chunks Type numpy.ndarray,,

Unnamed: 0,Array,Chunk
Bytes,12 B,12.0 B
Shape,(),()
Count,83 Tasks,1 Chunks
Type,numpy.ndarray,


In [154]:
%%time

da.where(ddf_posts['subreddit_name'] == 'memesenespanol', 'yay', 'nay').compute()

CPU times: user 6.87 s, sys: 2.18 s, total: 9.05 s
Wall time: 5.45 s


array(['nay', 'nay', 'nay', ..., 'yay', 'yay', 'yay'], dtype='<U3')

In [156]:
%%time

ddf_posts['test_col'] = da.where(ddf_posts['subreddit_name'] == 'memesenespanol', 'yay', 'nay')

ValueError: Number of partitions do not match (1 != 27)

In [157]:
%%time

ddf_posts['test_col'] = np.where(ddf_posts['subreddit_name'] == 'memesenespanol', 'yay', 'nay')

TypeError: Column assignment doesn't support type numpy.ndarray

In [158]:
%%time

ddf_posts.assign(test_col=da.where(ddf_posts['subreddit_name'] == 'memesenespanol', 'yay', 'nay'))

ValueError: Number of partitions do not match (1 != 27)

In [159]:
np.where(ddf_posts['subreddit_name'] == 'memesenespanol', 'yay', 'nay')

array(['nay', 'nay', 'nay', ..., 'yay', 'yay', 'yay'], dtype='<U3')

### `.assign()` doesn't seem to work  either

In [139]:
%%time

ddf_posts.assign(
    test_col=lambda x: np.where(x['subreddit_name'] == 'memesenespanol', 'yay', 'nay')
)

ValueError: Length of values (8439672) does not match length of index (2)

In [161]:
%%time

ddf_posts.assign(
    test_col=lambda x: pd.Series(np.where(x['subreddit_name'] == 'memesenespanol', 'yay', 'nay'), index=x.index.compute())
)

ValueError: cannot reindex from a duplicate axis

In [144]:
%%time

ddf_posts_new = ddf_posts.assign(
    test_col=lambda x: da.where(x['subreddit_name'] == 'memesenespanol', 
                                'yay', 
                                'nay')
)

CPU times: user 2.53 s, sys: 594 ms, total: 3.12 s
Wall time: 1.69 s


In [145]:
ddf_posts_new.head()

ValueError: Length of values (8439672) does not match length of index (292752)

In [146]:
ddf_posts.tail()

ValueError: Length of values (8439672) does not match length of index (361954)

Try using a `mask` instead of `np.where` or `da.where`

In [137]:
%%time

ddf_posts['test_col_'] = (
    ddf_posts['subreddit_name'] == 'memesenespanol', 'yay', 'nay')
)

ValueError: Number of partitions do not match (1 != 27)

# Other generic stats

In [18]:
value_counts_and_pcts(
    df_subs['geo_relevant_countries'],
    top_n=25,
    # reset_index=True,
)

Unnamed: 0,geo_relevant_countries-count,geo_relevant_countries-percent,geo_relevant_countries-pct_cumulative_sum
,15050,78.1%,78.1%
India,970,5.0%,83.2%
Germany,787,4.1%,87.3%
Mexico,400,2.1%,89.3%
United Kingdom,392,2.0%,91.4%
Brazil,381,2.0%,93.3%
Canada,235,1.2%,94.6%
France,213,1.1%,95.7%
Italy,178,0.9%,96.6%
Spain,100,0.5%,97.1%
