# Purpose

### 2022-07-14
In this notebook we'll get the subreddits relevant to a country and apply the new automated QA process using CA models that predict rating and primary topic.

See this dashboard for more information about the model coverage & filters.
https://app.mode.com/reddit/reports/b99c94984018


# Imports & notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

# Register bigquery magic (only needed for laptop/local, not colab)
# %load_ext google.cloud.bigquery

In [2]:
# colab auth for BigQuery, google drive, & google sheets (gspread)
from google.colab import auth, files, drive
from google.auth import default
import sys  # need sys for mounting gdrive path

auth.authenticate_user()
print('Authenticated')

Authenticated


## Install custom library

### Append google drive path so we can install library from there

In [3]:
# Attach google drive so we can import the custom python utility functions.
# If drive.mount() fails, you can also:
#   MANUALLY CLICK ON "Mount Drive"
import sys


g_drive_root = '/content/drive'

try:
    drive.mount(g_drive_root, force_remount=True)
    print('   Authenticated & mounted Google Drive')

except Exception as err_mount:
    # Don't swallow the first failure silently: show it before trying the fallback
    print(f"   drive.mount() failed: {err_mount!r}")
    try:
        drive._mount(g_drive_root, force_remount=True)
        print('   Authenticated & mounted Google Drive')
    except Exception as err_fallback:
        print(err_fallback)
        # Chain explicitly so the traceback points at the fallback failure as the cause
        raise Exception(
            'You might need to manually mount google drive to colab'
        ) from err_fallback

l_paths_to_append = [
    f'{g_drive_root}/MyDrive/Colab Notebooks',

    # need to append the path to subclu so that colab can import things properly
    f'{g_drive_root}/MyDrive/Colab Notebooks/subreddit_clustering_i18n'
]
for path_ in l_paths_to_append:
    # Remove any existing entry first so that re-running this cell
    #  doesn't leave duplicate entries in sys.path
    if path_ in sys.path:
        sys.path.remove(path_)
    print(f" Appending path: {path_}")
    sys.path.append(path_)

Mounted at /content/drive
   Authenticated & mounted Google Drive
 Appending path: /content/drive/MyDrive/Colab Notebooks
 Appending path: /content/drive/MyDrive/Colab Notebooks/subreddit_clustering_i18n


### Install library

In [4]:
# install subclu & libraries needed to read parquet files from GCS & spreadsheets
#  make sure to use the [colab] `extra` because it includes colab-specific libraries
# The path is built from `g_drive_root`, so the google drive must be mounted first.
module_path = f"{g_drive_root}/MyDrive/Colab Notebooks/subreddit_clustering_i18n/[colab]"

# `-e` installs the library in editable mode from the google drive folder.
# NOTE(review): `$module_path` is presumably expanded by IPython from the python
#  variable above, and the surrounding quotes keep the space in "Colab Notebooks"
#  from splitting the path into two shell arguments -- confirm before changing the quoting.
!pip install -e $"$module_path" --quiet

## Regular Imports

In [5]:
import os
from datetime import datetime

from google.cloud import bigquery

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib_venn import venn2_unweighted, venn3_unweighted
from tqdm import tqdm

# auth for google sheets
import gspread


# Get credentials with `default()` (imported from google.auth) and use them
#  to authorize the gspread client that opens/creates the google sheets below.
creds_, _ = default()
gc = gspread.authorize(creds_)

# Set the default GCP project through the environment.
# NOTE(review): the %%bigquery cells in this notebook pass
#  `--project data-science-prod-218515` explicitly, which is the value commented out
#  here -- confirm that the two different projects are intended.
# os.environ['GOOGLE_CLOUD_PROJECT'] = 'data-science-prod-218515'
os.environ['GOOGLE_CLOUD_PROJECT'] = 'data-prod-165221'

## Custom imports

In [6]:
# subclu imports
import subclu
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric, reorder_array,
)
from subclu.models.clustering_utils import (
    create_dynamic_clusters,
    convert_distance_or_ab_to_list_for_fpr,
    reshape_df_to_get_1_cluster_per_row,
    get_primary_topic_mix_cols,
    create_dynamic_clusters_clean,
)

from subclu.models.reshape_clusters_v041 import (
    keep_only_target_labels,
    get_table_for_optimal_dynamic_cluster_params,
    get_dynamic_cluster_summary,
)
from subclu.models.reshape_clusters_v050 import (
    apply_qa_filters_for_fpr
)

setup_logging()
notebook_display_config()
print_lib_versions([gspread, pd, np])

python		v 3.7.13
===
gspread		v: 4.0.1
pandas		v: 1.3.5
numpy		v: 1.21.6


# Checklist to re-run for a country:

- change country name for google sheet name
- change country initial in google sheet
    - add google sheet KEY (after created)
- change country name in geo-relevance query

## Create google sheet for country outputs


In [7]:
%%time

country_name_sheet_ = 'Spain'
target_abbrev_ = 'ES'
GSHEET_KEY = '1X1-cO6QS-RhFyT_00z_Fyq1qpHaNn7JQsB0IxRvGHYk'  #'1vr8O_Jw7gLFoaNJ4t9Uxyg0MnFbnOI6laL_q1_7AUMY'
GSHEET_NAME = f'i18n {country_name_sheet_} subreddits and clusters - model v0.5.0'


d_wsh_names = {
    'qa_ready': {
        'name': 'subs_after_ca_qa',
    },
    'clusters_t2t_fpr_raw': {
        'name': f'fpr_clusters_ca_qa_{target_abbrev_}_{target_abbrev_}',
    },
    'clusters_t2t_list_raw': {
        'name': f'clusters_list_ca_qa_{target_abbrev_}_{target_abbrev_}',
    },
    # 'clusters_t2t_fpr_after_qa': {
    #     'name': f'fpr_clusters_after_qa_{target_abbrev_}_{target_abbrev_}',
    # },
    # 'sub_raw': {
    #     'name': 'raw_data_per_subreddit',
    # },
}

if GSHEET_KEY is not None:
    sh = gc.open_by_key(GSHEET_KEY)
    print(f"Opening google worksheet: {GSHEET_NAME} ...")
else:
    print(f"** Creating google worksheet: {GSHEET_NAME} ...")
    sh = gc.create(GSHEET_NAME)

# create worksheets:
for _, d_ in d_wsh_names.items():
    sh_name = d_['name']
    try:
        d_['worksheet'] = sh.worksheet(sh_name)
        print(f"  Opening tab/sheet: {sh_name} ...")
    except Exception as e:
        print(f"  ** Creating tab/sheet: {sh_name} ...")
        d_['worksheet'] = sh.add_worksheet(sh_name, rows=5, cols=5)

print(f"https://docs.google.com/spreadsheets/d/{sh.id}")
if GSHEET_KEY is None:
    print(f"\n*** New sheet ID (assign it to GSHEET_KEY variable): ***\n{sh.id}\n")

Opening google worksheet: i18n Spain subreddits and clusters - model v0.5.0 ...
  Opening tab/sheet: subs_after_ca_qa ...
  Opening tab/sheet: fpr_clusters_ca_qa_ES_ES ...
  Opening tab/sheet: clusters_list_ca_qa_ES_ES ...
https://docs.google.com/spreadsheets/d/1X1-cO6QS-RhFyT_00z_Fyq1qpHaNn7JQsB0IxRvGHYk
CPU times: user 47.5 ms, sys: 3 ms, total: 50.5 ms
Wall time: 922 ms


# Load data from BigQuery

## Load subreddit geo-relevance & cultural relevance metadata

This data is already in BigQuery, so read it straight from there. We'll use it to select the subs that are geo-relevant to the target country (Spain for this run).

Also add the latest ratings so that we can filter based on those.

English-speaking countries don't have ambassador subs right now, so we should be able to create a standard template and replace the country name for these queries.

### SQL geo & cultural

In [17]:
%%time
%%bigquery df_geo --project data-science-prod-218515 

-- Get country-relevant subreddits for FPRs + flags from CA QA
--  The result is loaded into the `df_geo` dataframe.
--  Change TARGET_COUNTRY when re-running this notebook for another country.
DECLARE TARGET_COUNTRY STRING DEFAULT "Spain";
DECLARE MIN_COUNTRY_STANDARDIZED_RELEVANCE NUMERIC DEFAULT 2.3;
DECLARE MIN_USERS_PCT_L28_REL NUMERIC DEFAULT 0.14;

-- Partition used for the lookup tables joined below (two days before the run date)
DECLARE PARTITION_DT DATE DEFAULT (CURRENT_DATE() - 2);

-- Check sensitive topics in case labels have changed since CA QA step
DECLARE SENSITIVE_TOPICS DEFAULT [
    'Addiction Support'
    , 'Activism'
    , 'Culture, Race, and Ethnicity', 'Fitness and Nutrition'
    , 'Gender', 'Mature Themes and Adult Content', 'Medical and Mental Health'
    , 'Military'
    , "Men's Health", 'Politics', 'Sexual Orientation'
    , 'Trauma Support', "Women's Health"
];


SELECT
    geo.subreddit_id
    , ars.users_l7
    , geo.geo_country_code
    , geo.country_name
    , geo.subreddit_name
    , geo.geo_relevance_default
    , geo.relevance_combined_score
    , geo.users_percent_by_subreddit_l28
    , geo.users_percent_by_country_standardized
    , nt.primary_topic
    , nt.rating_short
    , qa.predicted_rating
    , qa.predicted_topic
    , slo.allow_discovery
    , slo.over_18
    , qa.combined_filter_detail
    , qa.combined_filter
    , qa.combined_filter_reason
    , qa.taxonomy_action

    , geo.relevance_percent_by_subreddit
    , geo.relevance_percent_by_country_standardized

FROM `reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags` AS qa
    -- NOTE: the WHERE clause below filters on `geo.country_name`, which drops rows
    --  without a match in `geo`, so this LEFT JOIN behaves like an INNER JOIN
    LEFT JOIN (
        SELECT *
        FROM `reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220705`
        WHERE country_name = TARGET_COUNTRY
    ) AS geo
        ON geo.subreddit_id = qa.subreddit_id

    -- NOTE: this table is joined on the lower-cased subreddit name, not on subreddit_id
    LEFT JOIN (
        SELECT *
        FROM `data-prod-165221.all_reddit.all_reddit_subreddits`
        WHERE DATE(pt) = PARTITION_DT
    ) AS ars
        ON qa.subreddit_name = LOWER(ars.subreddit_name)

    LEFT JOIN (
        SELECT * FROM `data-prod-165221.cnc.shredded_crowdsource_topic_and_rating`
        WHERE pt = PARTITION_DT
    ) AS nt
        ON qa.subreddit_id = nt.subreddit_id
    LEFT JOIN (
        SELECT *
        FROM `data-prod-165221.ds_v2_postgres_tables.subreddit_lookup`
        -- Get latest partition
        WHERE dt = PARTITION_DT
    ) AS slo
        ON qa.subreddit_id = slo.subreddit_id

WHERE 1=1
    -- Fixed partition of the CA QA flags table (unlike PARTITION_DT, it doesn't move with the run date)
    AND qa.pt = "2022-07-16"
    -- Pick subreddits relevant to target country under at least one metric/threshold
    --   Use the numeric values in case the defined threshold change
    AND geo.country_name = TARGET_COUNTRY
    AND (
        geo_relevance_default = TRUE
        OR users_percent_by_subreddit_l28 >= MIN_USERS_PCT_L28_REL
        OR users_percent_by_country_standardized >= MIN_COUNTRY_STANDARDIZED_RELEVANCE
        -- Try the combined score to include a few more relevant subreddits
        OR relevance_combined_score >= 0.175
    )

    -- Keep only subs we should recommend
    AND (
        qa.combined_filter = 'recommend'
        -- We can still use allow_discover=f for seeds
        OR (
            qa.combined_filter = 'remove'
            AND qa.combined_filter_reason = 'allow_discovery_f'
        )
    )
    AND qa.subreddit_name != 'profile'
    AND COALESCE(slo.type, '') IN ('private', 'public', 'restricted')
    AND COALESCE(slo.verdict, 'f') != 'admin-removed'
    -- NOTE(review): `is_spam` & `quarantine` are not table-qualified;
    --  presumably they come from `slo` -- confirm and qualify them
    AND COALESCE(is_spam, FALSE) = FALSE
    AND COALESCE(slo.over_18, 'f') = 'f'
    AND COALESCE(quarantine, FALSE) = FALSE
    -- Re-check the current rating & primary topic: only "E"-rated subs
    --  with a non-sensitive primary topic are kept
    AND COALESCE(nt.rating_short, '') = "E"
    AND COALESCE(nt.primary_topic, '') NOT IN UNNEST(SENSITIVE_TOPICS)

ORDER BY geo.relevance_combined_score DESC, geo.users_percent_by_subreddit_l28 DESC
;

CPU times: user 287 ms, sys: 26.3 ms, total: 313 ms
Wall time: 11.6 s


### Check df with geo + language information

In [18]:
print(df_geo.shape)

(225, 21)


In [19]:
df_geo.iloc[:4, :11]

Unnamed: 0,subreddit_id,users_l7,geo_country_code,country_name,subreddit_name,geo_relevance_default,relevance_combined_score,users_percent_by_subreddit_l28,users_percent_by_country_standardized,primary_topic,rating_short
0,t5_2s049,5115,ES,Spain,catalunya,True,0.859134,0.823917,9.476904,Place,E
1,t5_2tjlk,889,ES,Spain,catalan,True,0.83726,0.796575,7.686603,Learning and Education,E
2,t5_11rkix,51456,ES,Spain,lmdshow,True,0.809596,0.761995,10.88828,Internet Culture and Memes,E
3,t5_2sau7,1237,ES,Spain,catalonia,True,0.790836,0.738544,8.907599,Place,E


In [20]:
value_counts_and_pcts(
    df_geo,
    ['rating_short', 'combined_filter_detail']
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,percent,cumulative_percent
rating_short,combined_filter_detail,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
E,recommend-predictions_clean,132,58.7%,58.7%
E,recommend-predictions_missing,70,31.1%,89.8%
E,recommend-missing_topic,15,6.7%,96.4%
E,remove-allow_discovery_f,8,3.6%,100.0%


In [21]:
value_counts_and_pcts(
    df_geo,
    ['rating_short', 'combined_filter_detail', 'allow_discovery']
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,percent,cumulative_percent
rating_short,combined_filter_detail,allow_discovery,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E,recommend-predictions_clean,,117,52.0%,52.0%
E,recommend-predictions_missing,,64,28.4%,80.4%
E,recommend-predictions_clean,t,15,6.7%,87.1%
E,recommend-missing_topic,,14,6.2%,93.3%
E,remove-allow_discovery_f,f,8,3.6%,96.9%
E,recommend-predictions_missing,t,6,2.7%,99.6%
E,recommend-missing_topic,t,1,0.4%,100.0%


In [22]:
value_counts_and_pcts(
    df_geo['allow_discovery']
)

Unnamed: 0,allow_discovery-count,allow_discovery-percent,allow_discovery-pct_cumulative_sum
,195,86.7%,86.7%
t,22,9.8%,96.4%
f,8,3.6%,100.0%


## Load model labels (clusters)

The clusters now live in a Big Query table and have standardized names, so pull the data from there.

### SQL labels


In [23]:
%%time
%%bigquery df_labels --project data-science-prod-218515 

-- select subreddit clusters from bigQuery
--  The result is loaded into the `df_labels` dataframe.
--  There is no WHERE clause: labels for all subreddits in the table are pulled
--  and they get filtered to the target subreddits later in python.

SELECT
    sc.subreddit_id
    , sc.subreddit_name

    -- Exclude clusters that are overly broad... these don't provide
    --  meaningful recommendations
    -- subreddit_id & subreddit_name are also excluded from sc.* because
    --  they're already selected explicitly above (avoids duplicate columns)
    , sc.* EXCEPT(
        subreddit_id, subreddit_name, primary_topic, __index_level_0__
        , k_0010_label, k_0012_label, k_0020_label, k_0025_label, k_0030_label, k_0040_label
        , k_0049_label
        -- , k_0050_label, k_0052_label, k_0060_label, k_0066_label
        , k_0010_majority_primary_topic, k_0012_majority_primary_topic, k_0020_majority_primary_topic
        , k_0025_majority_primary_topic, k_0030_majority_primary_topic, k_0040_majority_primary_topic
        , k_0049_majority_primary_topic
        -- , k_0050_majority_primary_topic, k_0052_majority_primary_topic
        -- , k_0060_majority_primary_topic, k_0066_majority_primary_topic
    )
FROM `reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_full` sc
;

CPU times: user 42.8 s, sys: 767 ms, total: 43.6 s
Wall time: 1min 29s


### Check label outputs

In [24]:
print(df_labels.shape)
df_labels.iloc[:4, :9]

(81970, 136)


Unnamed: 0,subreddit_id,subreddit_name,model_sort_order,posts_for_modeling_count,k_0050_label,k_0052_label,k_0060_label,k_0066_label,k_0070_label
0,t5_6fz1n2,repacklaba1,38789,196.0,18,18,22,23,24
1,t5_6fz1pv,unfitgirla1,38787,176.0,18,18,22,23,24
2,t5_6fz2qw,repacklab10,38792,182.0,18,18,22,23,24
3,t5_6fz31o,unfitgirl10,38788,178.0,18,18,22,23,24


In [25]:
counts_describe(df_labels.iloc[:, :9])

Unnamed: 0,dtype,count,unique,unique-percent,null-count,null-percent
subreddit_id,object,81970,81970,100.00%,0,0.00%
subreddit_name,object,81970,81970,100.00%,0,0.00%
model_sort_order,int64,81970,81970,100.00%,0,0.00%
posts_for_modeling_count,float64,81970,2703,3.30%,0,0.00%
k_0050_label,int64,81970,50,0.06%,0,0.00%
k_0052_label,int64,81970,52,0.06%,0,0.00%
k_0060_label,int64,81970,60,0.07%,0,0.00%
k_0066_label,int64,81970,66,0.08%,0,0.00%
k_0070_label,int64,81970,70,0.09%,0,0.00%


# Reshape data
Apply reshaping fxns so that we can export the data in a format that's good for QA.

## Keep only labels for Target subreddits


In [26]:
%%time
# Combine the model labels (`df_labels`) with the country-relevant subs (`df_geo`)
#  so that only the target subreddits remain.
df_labels_target = keep_only_target_labels(
    df_labels=df_labels,
    df_geo=df_geo,
    col_sort_order='model_sort_order',
    l_ix_subs=['subreddit_id', 'subreddit_name'],
    l_cols_to_front=None,
    # NOTE(review): this is a list with the *string* 'None', not the value None.
    #  Presumably it's a placeholder so that no geo columns get dropped -- confirm
    #  against keep_only_target_labels().
    geo_cols_to_drop=['None'],
)

0 <- subs to drop b/c they're not in model
(225, 155) <- df_labels_target.shape
CPU times: user 423 ms, sys: 3.29 ms, total: 426 ms
Wall time: 427 ms


In [27]:
counts_describe(df_labels_target.iloc[:, :15])

Unnamed: 0,dtype,count,unique,unique-percent,null-count,null-percent
model_sort_order,int64,225,225,100.00%,0,0.00%
subreddit_id,object,225,225,100.00%,0,0.00%
subreddit_name,object,225,225,100.00%,0,0.00%
primary_topic,object,209,27,12.92%,16,7.11%
rating_short,object,225,1,0.44%,0,0.00%
over_18,object,51,1,1.96%,174,77.33%
posts_for_modeling_count,float64,225,151,67.11%,0,0.00%
k_0050_label,int64,225,28,12.44%,0,0.00%
k_0052_label,int64,225,30,13.33%,0,0.00%
k_0060_label,int64,225,34,15.11%,0,0.00%


## Run loop to find "optimal" min_num of subreddits for dynamic clusters


We want to balance two things:
- prevent orphan subreddits
- prevent clusters that are too large to be meaningful

In order to do this at a country level, we'll be better off starting with the smallest cluster size and rolling up until we have at least N subreddits in one cluster.

Find optimal `min_subreddits_in_cluster` based on:
- `orphan count`, 
- `number of clusters`,
- & other info

The optimal number might be different for each country, and even within a country it might differ depending on when we filter NSFW subs.

In [28]:
col_new_cluster_val = 'cluster_label'
col_new_cluster_name = 'cluster_label_k'
col_new_cluster_prim_topic = 'cluster_majority_primary_topic'
col_new_cluster_topic_mix = 'cluster_topic_mix'

### Loop

In [29]:
%%time

# Build one summary row per candidate `min_subreddits_in_cluster` value and also
#  get back the value that the function picks as optimal.
df_optimal_min_check, n_min_subs_in_cluster_optimal = get_table_for_optimal_dynamic_cluster_params(
        df_labels_target=df_labels_target,
        col_new_cluster_val=col_new_cluster_val,
        col_new_cluster_name=col_new_cluster_name,
        col_new_cluster_prim_topic=col_new_cluster_prim_topic,
        col_new_cluster_topic_mix=col_new_cluster_topic_mix,
        # candidate values: 3, 4, ..., 10 (np.arange excludes the end value)
        min_subs_in_cluster_list=np.arange(3, 11),
        verbose=False,
        return_optimal_min_subs_in_cluster=True,
)

  0%|          | 0/8 [00:00<?, ?it/s]
  0%|          | 0/60 [00:00<?, ?it/s][A
  5%|▌         | 3/60 [00:00<00:01, 29.96it/s][A
 10%|█         | 6/60 [00:00<00:01, 28.57it/s][A
 15%|█▌        | 9/60 [00:00<00:01, 27.62it/s][A
 20%|██        | 12/60 [00:00<00:01, 26.46it/s][A
 25%|██▌       | 15/60 [00:00<00:01, 26.44it/s][A
 30%|███       | 18/60 [00:00<00:01, 26.18it/s][A
 35%|███▌      | 21/60 [00:00<00:01, 25.73it/s][A
 40%|████      | 24/60 [00:00<00:01, 25.52it/s][A
 45%|████▌     | 27/60 [00:01<00:01, 25.95it/s][A
 50%|█████     | 30/60 [00:01<00:01, 25.16it/s][A
 55%|█████▌    | 33/60 [00:01<00:01, 26.19it/s][A
 60%|██████    | 36/60 [00:01<00:00, 26.25it/s][A
 65%|██████▌   | 39/60 [00:01<00:00, 26.55it/s][A
 72%|███████▏  | 43/60 [00:01<00:00, 28.25it/s][A
 77%|███████▋  | 46/60 [00:01<00:00, 27.93it/s][A
 82%|████████▏ | 49/60 [00:01<00:00, 27.91it/s][A
 87%|████████▋ | 52/60 [00:01<00:00, 27.06it/s][A
 92%|█████████▏| 55/60 [00:02<00:00, 26.94it/s][A
100%|

CPU times: user 28.5 s, sys: 607 ms, total: 29.1 s
Wall time: 31.2 s


In [30]:
# Sanity check: the optimal value returned by the loop should match the first
#  candidate (in table order) that has the fewest orphan subreddits.
fewest_orphans_ = df_optimal_min_check['num_orphan_subreddits'].min()
rows_fewest_orphans_ = df_optimal_min_check[
    df_optimal_min_check['num_orphan_subreddits'] == fewest_orphans_
]
assert (
    n_min_subs_in_cluster_optimal
    == rows_fewest_orphans_['min_subreddits_in_cluster'].iloc[0]
)

In [31]:
df_optimal_min_check

Unnamed: 0,subs_to_cluster_count,min_subreddits_in_cluster,cluster_count,num_orphan_subreddits,num_subreddits_per_cluster_min,num_subreddits_per_cluster_mean,num_subreddits_per_cluster_median,num_subreddits_per_cluster_max,num_clusters_with_mature_primary_topic,cluster_ids_with_orphans
0,225,3,62,9,1,3.629032,4.0,8,0,"0019, 0025, 0027, 0037, 0040, 0041, 0045, 0048, 0049"
1,225,4,53,7,1,4.245283,5.0,8,0,"0025, 0037, 0040, 0041, 0045, 0048, 0049"
2,225,5,47,7,1,4.787234,6.0,10,0,"0037, 0038, 0040, 0041, 0045, 0048, 0049"
3,225,6,44,7,1,5.113636,5.0,10,0,"0037, 0038, 0040, 0041, 0045, 0048, 0049"
4,225,7,43,7,1,5.232558,5.0,10,0,"0024, 0037, 0040, 0041, 0045, 0048, 0049"
5,225,8,39,6,1,5.769231,5.0,16,0,"0037, 0040, 0041, 0045, 0048, 0049"
6,225,9,38,6,1,5.921053,4.5,16,0,"0037, 0040, 0041, 0045, 0048, 0049"
7,225,10,35,6,1,6.428571,5.0,18,0,"0037, 0040, 0041, 0045, 0048, 0049"


### Display loop results

In [32]:
def highlight_below_threshold(val, threshold=1):
    """Return the CSS used to flag a cell whose value is at or below `threshold`.

    Args:
        val: cell value; it's compared against `threshold` with `<=`.
        threshold: inclusive upper bound for highlighting (default: 1).

    Returns:
        A CSS string (bold purple text on a yellow background) when
        `val <= threshold`, otherwise an empty string (no extra styling).
    """
    css_flagged = "color:purple; font-weight: bold; background-color:yellow;"
    return css_flagged if val <= threshold else ''
# Show the optimal value picked by the loop above
print(n_min_subs_in_cluster_optimal)

col_num_orph_subs = 'num_orphan_subreddits'
# col_num_subs_mean = 'num_subreddits_per_cluster_mean'
col_num_subs_median = 'num_subreddits_per_cluster_median'

# Style the table and flag the rows where the orphan count is at or below
#  the default threshold of highlight_below_threshold().
# The `subset` column uses spaces instead of underscores because
#  `rename_cols_for_display=True` renames the columns before the highlight is applied.
style_df_numeric(
    df_optimal_min_check,
    rename_cols_for_display=True,
    l_bar_simple=[col_num_orph_subs,
                  col_num_subs_median,]
).applymap(highlight_below_threshold, subset=[col_num_orph_subs.replace('_', ' ')])

8


Unnamed: 0,subs to cluster count,min subreddits in cluster,cluster count,num orphan subreddits,num subreddits per cluster min,num subreddits per cluster mean,num subreddits per cluster median,num subreddits per cluster max,num clusters with mature primary topic,cluster ids with orphans
,,,,,,,,,,
0.0,225.0,3.0,62.0,9.0,1.0,3.63,4.0,8.0,0.0,"0019, 0025, 0027, 0037, 0040, 0041, 0045, 0048, 0049"
1.0,225.0,4.0,53.0,7.0,1.0,4.25,5.0,8.0,0.0,"0025, 0037, 0040, 0041, 0045, 0048, 0049"
2.0,225.0,5.0,47.0,7.0,1.0,4.79,6.0,10.0,0.0,"0037, 0038, 0040, 0041, 0045, 0048, 0049"
3.0,225.0,6.0,44.0,7.0,1.0,5.11,5.0,10.0,0.0,"0037, 0038, 0040, 0041, 0045, 0048, 0049"
4.0,225.0,7.0,43.0,7.0,1.0,5.23,5.0,10.0,0.0,"0024, 0037, 0040, 0041, 0045, 0048, 0049"
5.0,225.0,8.0,39.0,6.0,1.0,5.77,5.0,16.0,0.0,"0037, 0040, 0041, 0045, 0048, 0049"
6.0,225.0,9.0,38.0,6.0,1.0,5.92,4.5,16.0,0.0,"0037, 0040, 0041, 0045, 0048, 0049"
7.0,225.0,10.0,35.0,6.0,1.0,6.43,5.0,18.0,0.0,"0037, 0040, 0041, 0045, 0048, 0049"


## Get dynamic clusters (apply optimal num from above)

side bar: about 57% of subreddits in Australia only had a single primary topic as their `topic_mix`, so combining `primary topic` might not give us as much info as we hoped.

At the same time, for 43% of subs we might get additional detail by combining the primary topics.


In [33]:
print(f"Optimal n: {n_min_subs_in_cluster_optimal}")

# Names for the columns used/created by create_dynamic_clusters()
n_mix_start = 4
l_ix = ['subreddit_id', 'subreddit_name']
# NOTE: same value that was already assigned in the cell that defines the other `col_new_cluster_*` names
col_new_cluster_topic_mix = 'cluster_topic_mix'
col_subreddit_topic_mix = 'subreddit_full_topic_mix'
col_full_depth_mix_count = 'subreddit_full_topic_mix_count'
suffix_new_topic_mix = '_topic_mix_nested'
col_new_cluster_val_int = 'cluster_label_int'

# Create the dynamic clusters using the optimal minimum number of
#  subreddits per cluster found by the loop above.
df_labels_target_dynamic_raw = create_dynamic_clusters(
    df_labels_target,
    agg_strategy='aggregate_small_clusters',
    min_subreddits_in_cluster=n_min_subs_in_cluster_optimal,
    l_cols_labels_input=None,
    col_new_cluster_val=col_new_cluster_val,
    col_new_cluster_name=col_new_cluster_name,
    col_new_cluster_prim_topic=col_new_cluster_prim_topic,
    n_mix_start=n_mix_start,
    col_new_cluster_topic_mix=col_new_cluster_topic_mix,
    col_subreddit_topic_mix=col_subreddit_topic_mix,
    col_full_depth_mix_count=col_full_depth_mix_count,
    suffix_new_topic_mix=suffix_new_topic_mix,
    l_ix=l_ix,
    verbose=True,
)

17:06:00 | INFO | "Concat'ing nested cluster labels..."


Optimal n: 8


17:06:01 | INFO | "Getting topic mix at different depths..."
17:06:01 | INFO | "  Assigning base topic mix cols"
17:06:01 | INFO | "  Creating deepest base topic mix col..."
17:06:01 | INFO | "  Iterating through additional subs with multiple topics..."
100%|██████████| 60/60 [00:05<00:00, 11.05it/s]
17:06:07 | INFO | "Initializing values for strategy: aggregate_small_clusters"
17:06:07 | INFO | "  Looping to roll-up clusters from smallest to largest..."
100%|██████████| 65/65 [00:02<00:00, 23.32it/s]
17:06:10 | INFO | "(225, 294) <- output shape"


In [34]:
style_df_numeric(
    get_dynamic_cluster_summary(
        df_labels_target_dynamic_raw,
        return_dict=False,
    ),
    rename_cols_for_display=True,
)

Unnamed: 0,cluster count,num orphan subreddits,num subreddits per cluster min,num subreddits per cluster mean,num subreddits per cluster median,num subreddits per cluster max,num clusters with mature primary topic,cluster ids with orphans
,,,,,,,,
0.0,39.0,6.0,1.0,5.77,5.0,16.0,0.0,"0037, 0040, 0041, 0045, 0048, 0049"


### Minor QA checks

In [35]:
# # check column order
# style_df_numeric(
#     df_labels_target_dynamic_raw.iloc[70:74, -22:],
#     rename_cols_for_display=True,
#     int_labels=['total_users_in', 'num_of_countries_', 'users_in_subreddit_from_country_l28',
#                     'by_country_rank',
#                     ],
#     pct_cols=['b_users_percent_by_subreddit',
#                   'c_users_percent_by_country',
#                   'users_percent_by_country_avg',
#                   ],
#     pct_labels='',
# )

In [36]:
value_counts_and_pcts(
    df_labels_target_dynamic_raw[col_new_cluster_topic_mix],
    top_n=9,
)

Unnamed: 0,cluster_topic_mix-count,cluster_topic_mix-percent,cluster_topic_mix-pct_cumulative_sum
Gaming,69,30.7%,30.7%
Place,45,20.0%,50.7%
Internet Culture and Memes | Funny/Humor,16,7.1%,57.8%
Music,13,5.8%,63.6%
Hobbies,11,4.9%,68.4%
"Business, Economics, and Finance",10,4.4%,72.9%
"Reading, Writing, and Literature",9,4.0%,76.9%
Gaming | Tabletop Games,9,4.0%,80.9%
Podcasts and Streamers,8,3.6%,84.4%


In [37]:
# how many final clusters have multiple topics?
value_counts_and_pcts(
    df_labels_target_dynamic_raw[col_new_cluster_topic_mix].str.count('\|')
)

Unnamed: 0,cluster_topic_mix-count,cluster_topic_mix-percent,cluster_topic_mix-pct_cumulative_sum
0,200,88.9%,88.9%
1,25,11.1%,100.0%


In [38]:
# how many SUBREDDITS have multiple topics? (when we check the deepest clusters)
#  these two calls are equivalent
#  NOTE(review): the commented-out call counts '|' separators, which is presumably
#   one less than the number of topics counted by `col_full_depth_mix_count`
#   (N topics -> N-1 separators) -- confirm before swapping one for the other.

# value_counts_and_pcts(
#     df_labels_target_dynamic_raw[col_subreddit_topic_mix].str.count(r'\|')
# )

value_counts_and_pcts(
    df_labels_target_dynamic_raw[col_full_depth_mix_count]
)

Unnamed: 0,subreddit_full_topic_mix_count-count,subreddit_full_topic_mix_count-percent,subreddit_full_topic_mix_count-pct_cumulative_sum
1,143,63.6%,63.6%
2,52,23.1%,86.7%
3,22,9.8%,96.4%
4,4,1.8%,98.2%
5,2,0.9%,99.1%
6,1,0.4%,99.6%
8,1,0.4%,100.0%


In [39]:
style_df_numeric(
    df_labels_target_dynamic_raw
    [df_labels_target_dynamic_raw[col_full_depth_mix_count] >= 5]
    .iloc[-5:, :9]
    ,
    rename_cols_for_display=True,
)

Unnamed: 0,subreddit id,subreddit name,cluster label int,cluster topic mix,primary topic,rating short,subreddit full topic mix,over 18,geo relevance default
,,,,,,,,,
40.0,t5_hro2c,picturepunches,32.0,Gaming,Internet Culture and Memes,E,Gaming | Sports | Podcasts and Streamers | Internet Culture and Memes | Funny/Humor,f,True
41.0,t5_5ahutt,culturepop,32.0,Gaming,Art,E,Gaming | Sports | Podcasts and Streamers | Internet Culture and Memes | Place | Art,-,True
43.0,t5_ylxdt,spanishmeme,32.0,Gaming,Funny/Humor,E,"Gaming | Technology | Reading, Writing, and Literature | Funny/Humor | Art",-,False
184.0,t5_2wz8g,rivers,32.0,Place,Outdoors and Nature,E,"World News | Science | Technology | Business, Economics, and Finance | History | Activism | Food and Drink | Learning and Education",-,True


## Re-assign orphan subreddits (optional)

If there are orphan subreddits (see summary above), check them out to see if we can re-assign them without too much work. If we can't, skip this step and move on to the next country.

In [40]:
# check subs around orphan sub
#  For each cluster that has an orphan, display the orphan together with the
#  `n_plus_minus_` rows before and after it, so we can see its nearest neighbors.
n_plus_minus_ = 2
orphan_clusters_ = [
    cluster_id_ for cluster_id_ in get_dynamic_cluster_summary(
        df_labels_target_dynamic_raw,
        return_dict=True,
    )['cluster_ids_with_orphans'].split(', ')
    # ''.split(', ') returns [''], so drop empty IDs when there are no orphans
    if cluster_id_
]
n_rows_ = len(df_labels_target_dynamic_raw)

print(f"{len(orphan_clusters_)} <- Clusters with orphan subs")
for oc_ in orphan_clusters_:
    print(f"\nCLUSTER: {oc_}")
    # Get the row POSITION (not the index label) of the first sub in the cluster,
    #  because the window below is sliced with .iloc
    ix_orphan_ = next(
        (
            pos_ for pos_, is_match_
            in enumerate(df_labels_target_dynamic_raw[col_new_cluster_val] == oc_)
            if is_match_
        ),
        None,
    )
    if ix_orphan_ is None:
        print(f"  No subreddits found for cluster: {oc_}")
        continue

    n_min_ = max(0, ix_orphan_ - n_plus_minus_)
    # .iloc slices exclude the end position, so cap at the number of rows.
    #  Capping at the max index would always cut off the last row.
    n_max_ = min(n_rows_, ix_orphan_ + 1 + n_plus_minus_)
    display(
        df_labels_target_dynamic_raw.iloc[n_min_:n_max_, :13]
    )

6 <- Clusters with orphan subs

CLUSTER: 0037


Unnamed: 0,subreddit_id,subreddit_name,cluster_label_int,cluster_topic_mix,primary_topic,rating_short,subreddit_full_topic_mix,over_18,geo_relevance_default,relevance_percent_by_subreddit,relevance_percent_by_country_standardized,model_sort_order,posts_for_modeling_count
186,t5_34ubfa,ciencias_castellano,33,Religion and Spirituality,Science,E,Internet Culture and Memes | Meta/Reddit | Science,,False,True,False,61566,15.0
187,t5_5xioyj,moderacion,33,Religion and Spirituality,Meta/Reddit,E,Internet Culture and Memes | Meta/Reddit,,False,True,False,62045,9.0
188,t5_11f3q1,tinderes,37,Family and Relationships,Family and Relationships,E,Family and Relationships,,False,True,True,66242,6.0
189,t5_2rstq,riverside,38,Place,Place,E,Place,,False,False,True,66767,81.0
190,t5_3328n,menorca,38,Place,Travel,E,Place,,False,True,False,67763,10.0



CLUSTER: 0040


Unnamed: 0,subreddit_id,subreddit_name,cluster_label_int,cluster_topic_mix,primary_topic,rating_short,subreddit_full_topic_mix,over_18,geo_relevance_default,relevance_percent_by_subreddit,relevance_percent_by_country_standardized,model_sort_order,posts_for_modeling_count
203,t5_2ucv9,caminodesantiago,38,Place,Place,E,Place | Travel | Outdoors and Nature,,False,False,True,67822,146.0
204,t5_gmkqb,fuerteventura,38,Place,,E,Place,f,False,True,True,67869,5.0
205,t5_tigm8,spainfire,40,"Business, Economics, and Finance","Business, Economics, and Finance",E,"Business, Economics, and Finance",f,True,True,True,70386,14.0
206,t5_3c23m,crtgaming,41,Technology,Gaming,E,Technology | Music,,False,False,True,70977,1495.0
207,t5_3n7y6,dazn_ca,42,Technology,Sports,E,Technology | Television | Sports,,False,False,True,72224,23.0



CLUSTER: 0041


Unnamed: 0,subreddit_id,subreddit_name,cluster_label_int,cluster_topic_mix,primary_topic,rating_short,subreddit_full_topic_mix,over_18,geo_relevance_default,relevance_percent_by_subreddit,relevance_percent_by_country_standardized,model_sort_order,posts_for_modeling_count
204,t5_gmkqb,fuerteventura,38,Place,,E,Place,f,False,True,True,67869,5.0
205,t5_tigm8,spainfire,40,"Business, Economics, and Finance","Business, Economics, and Finance",E,"Business, Economics, and Finance",f,True,True,True,70386,14.0
206,t5_3c23m,crtgaming,41,Technology,Gaming,E,Technology | Music,,False,False,True,70977,1495.0
207,t5_3n7y6,dazn_ca,42,Technology,Sports,E,Technology | Television | Sports,,False,False,True,72224,23.0
208,t5_31d70,gnulinuxesp,42,Technology,Technology,E,Technology,,True,True,False,72453,41.0



CLUSTER: 0045


Unnamed: 0,subreddit_id,subreddit_name,cluster_label_int,cluster_topic_mix,primary_topic,rating_short,subreddit_full_topic_mix,over_18,geo_relevance_default,relevance_percent_by_subreddit,relevance_percent_by_country_standardized,model_sort_order,posts_for_modeling_count
207,t5_3n7y6,dazn_ca,42,Technology,Sports,E,Technology | Television | Sports,,False,False,True,72224,23.0
208,t5_31d70,gnulinuxesp,42,Technology,Technology,E,Technology,,True,True,False,72453,41.0
209,t5_2jfmjx,crotonplants,45,Home and Garden,Home and Garden,E,Home and Garden,,True,False,False,75785,10.0
210,t5_3e0dj,britishsuperbikes,48,Sports,,E,Gaming | Cars and Motor Vehicles | Sports,,True,False,False,78547,19.0
211,t5_5tbzkj,interiorismo,49,Hobbies,Home and Garden,E,Hobbies | Home and Garden,,True,False,True,79536,10.0



CLUSTER: 0048


Unnamed: 0,subreddit_id,subreddit_name,cluster_label_int,cluster_topic_mix,primary_topic,rating_short,subreddit_full_topic_mix,over_18,geo_relevance_default,relevance_percent_by_subreddit,relevance_percent_by_country_standardized,model_sort_order,posts_for_modeling_count
208,t5_31d70,gnulinuxesp,42,Technology,Technology,E,Technology,,True,True,False,72453,41.0
209,t5_2jfmjx,crotonplants,45,Home and Garden,Home and Garden,E,Home and Garden,,True,False,False,75785,10.0
210,t5_3e0dj,britishsuperbikes,48,Sports,,E,Gaming | Cars and Motor Vehicles | Sports,,True,False,False,78547,19.0
211,t5_5tbzkj,interiorismo,49,Hobbies,Home and Garden,E,Hobbies | Home and Garden,,True,False,True,79536,10.0
212,t5_3bm54o,spainreps,50,Fashion,Fashion,E,Fashion,,True,True,True,80239,90.0



CLUSTER: 0049


Unnamed: 0,subreddit_id,subreddit_name,cluster_label_int,cluster_topic_mix,primary_topic,rating_short,subreddit_full_topic_mix,over_18,geo_relevance_default,relevance_percent_by_subreddit,relevance_percent_by_country_standardized,model_sort_order,posts_for_modeling_count
209,t5_2jfmjx,crotonplants,45,Home and Garden,Home and Garden,E,Home and Garden,,True,False,False,75785,10.0
210,t5_3e0dj,britishsuperbikes,48,Sports,,E,Gaming | Cars and Motor Vehicles | Sports,,True,False,False,78547,19.0
211,t5_5tbzkj,interiorismo,49,Hobbies,Home and Garden,E,Hobbies | Home and Garden,,True,False,True,79536,10.0
212,t5_3bm54o,spainreps,50,Fashion,Fashion,E,Fashion,,True,True,True,80239,90.0
213,t5_4cdlnz,repbudgetfashion,50,Fashion,Fashion,E,Fashion,,False,False,True,80245,122.0


In [41]:
# # check other subs that are in the same cluster as orphan sub (at broadest level)
# l_cols_orphan_check = (
#     [
#         'subreddit_id',
#         col_new_cluster_topic_mix, 
#         # col_new_cluster_val,  # this can be really long and makes comparing harder
#         # col_subreddit_topic_mix,
#         'subreddit_name', 
#         col_new_cluster_name
#     ] +
#     l_cols_labels[:-5]
# )

# style_df_numeric(
#     df_labels_target_dynamic_raw
#     [df_labels_target_dynamic_raw['k_0013_label'] == 11]
#     [l_cols_orphan_check]
#     .iloc[3:14, :50]
#     ,
#     l_bar_simple=[c for c in l_cols_orphan_check[4:] if c.endswith('_label')],
#     rename_cols_for_display=True,

# )

In [42]:
# label_k_to_reassign_ = 'k_0320_label'
# label_val_to_reassign_ = '0011-0018-0032-0043-0046-0058-0062-0087-0244'
# subreddit_id_orphan_ = 't5_2tt7r'

# mask_orphan_and_new_group = (
#     (df_labels_target_dynamic_raw['subreddit_id'] == subreddit_id_orphan_) |
#     (
#         (df_labels_target_dynamic_raw[col_new_cluster_name] == label_k_to_reassign_) &
#         (df_labels_target_dynamic_raw[col_new_cluster_val] == label_val_to_reassign_)
#     )
# )

# # assign is similar to what we do in the dynamic function
# label_k_new_ = 'k_0118_label'
# label_val_new_col_ = f"{label_k_new_}_nested"
# new_prim_topic_col_ = label_k_new_.replace('_label', '_majority_primary_topic')
# c_update_topic_mix_ = label_k_new_.replace('_label', suffix_new_topic_mix)

# df_labels_target_dynamic_raw.loc[
#     mask_orphan_and_new_group,
#     col_new_cluster_name
# ] = label_k_new_

# df_labels_target_dynamic_raw.loc[
#     mask_orphan_and_new_group,
#     col_new_cluster_val
# ] = df_labels_target_dynamic_raw[mask_orphan_and_new_group][label_val_new_col_]

# df_labels_target_dynamic_raw.loc[
#     mask_orphan_and_new_group,
#     col_new_cluster_prim_topic
# ] = df_labels_target_dynamic_raw[mask_orphan_and_new_group][new_prim_topic_col_]

# df_labels_target_dynamic_raw.loc[
#     mask_orphan_and_new_group,
#     col_new_cluster_topic_mix
# ] = df_labels_target_dynamic_raw[mask_orphan_and_new_group][c_update_topic_mix_]

# del mask_orphan_and_new_group, label_k_to_reassign_, label_val_to_reassign_
# del label_k_new_, label_val_new_col_, new_prim_topic_col_

In [43]:
# # check again, num of orphans should be lower than before
# style_df_numeric(
#     get_dynamic_cluster_summary(
#         df_labels_target_dynamic_raw,
#         return_dict=False,
#     ),
#     rename_cols_for_display=True,
# )

In [44]:
# value_counts_and_pcts(
#     df_labels_target_dynamic_raw,
#     ['cluster_label'],
#     top_n=None,
#     return_df=True
# )['count'].describe()

## Get cluster for humans (list of subs in a cluster in a cell)
Here we get 1 cluster per row. 
Use cases:
- It makes it easier to quickly check NSFW clusters that we'll filter out
- we'll append the list of subreddit names from here to the final table for QA (makes it easier to evaluate whether the cluster makes sense).


In [45]:
# Names for the aggregate columns of the one-row-per-cluster table
col_subs_in_cluster_count = 'subs_in_cluster_count'
col_list_cluster_names = 'list_cluster_subreddit_names'

# Reshape the per-subreddit labels so that each row describes a single cluster
df_cluster_for_humans = reshape_df_to_get_1_cluster_per_row(
    df_labels_target_dynamic_raw,
    # existing cluster columns in the input frame
    col_new_cluster_name=col_new_cluster_name,
    col_new_cluster_val=col_new_cluster_val,
    col_new_cluster_val_int=col_new_cluster_val_int,
    col_new_cluster_topic=col_new_cluster_topic_mix,
    # column names passed for the per-cluster count & lists
    col_counterpart_count=col_subs_in_cluster_count,
    col_list_cluster_names=col_list_cluster_names,
    col_list_cluster_ids='list_cluster_subreddit_ids',
    get_one_column_per_sub_id=False,
    verbose=False,
)

(39, 7)  <- df.shape


In [46]:
# Peek at rows 3-8 and hide the last column to keep the display narrow
df_cluster_for_humans.iloc[3:9].iloc[:, :-1]

Unnamed: 0,cluster_label,cluster_label_k,cluster_topic_mix,cluster_label_int,subs_in_cluster_count,list_cluster_subreddit_names
3,0016,k_0050_label,Internet Culture and Memes,16,2,"shitpost, yo_elvr"
4,0017,k_0050_label,Internet Culture and Memes,17,4,"sorrylag, folagoro, rangugamer, valzoak"
5,0017-0017-0021-0022-0023-0025-0026-0029-0030-0031-0038-0040-0047-0053-0058-0080-0090-0122-0149-0181-0182,k_0603_label,Internet Culture and Memes | Funny/Humor,182,16,"wismichu, ibai, radiopirata, ilutv, abby, lmdshow, memesesp, dazbrrr, shitpostesp, caliebre, auronplay, ubius, chusommontero, iamcristinini, orslokx, renezz"
6,0018,k_0050_label,Podcasts and Streamers,18,2,"clips, youtubeespanol"
7,0019-0019-0023-0024-0025-0027-0028-0031-0032,k_0094_label,Gaming,32,10,"deepspacegame, picturepunches, culturepop, laresistencia, spanishmeme, emulacionandroide, leaksandrumors, nintendoswitchindie, monsterhunterps2, fortnitees"
8,0021,k_0050_label,Podcasts and Streamers,21,6,"wonhosolo, hanabie, rosalia, dragrace_espana, dragracelatam, dragracees"


In [47]:
# Peek at the final nine clusters, again hiding the last column
df_cluster_for_humans.iloc[-9:].iloc[:, :-1]

Unnamed: 0,cluster_label,cluster_label_k,cluster_topic_mix,cluster_label_int,subs_in_cluster_count,list_cluster_subreddit_names
30,0038-0039-0046-0050-0053-0057-0059-0068-0070-0074-0093-0098-0114-0130-0150-0200-0228-0305-0375-0451-0453-0529-0598-0626-0665-0745-0749-0938-1084-1126-1316-1449-1517-1533-1705-1822-1909-2101-2156-2300-2386-2504-2632-2713-2913-2921-3022-3...,k_4250_label,Place,3292,11,"tenerife, ibiza, seville, mallorca, malaga, barcelona, madrid, valencia, alicante, granada, girona"
31,0040,k_0050_label,"Business, Economics, and Finance",40,1,spainfire
32,0041,k_0050_label,Technology,41,1,crtgaming
33,0042,k_0050_label,Technology,42,2,"dazn_ca, gnulinuxesp"
34,0045,k_0050_label,Home and Garden,45,1,crotonplants
35,0048,k_0050_label,Sports,48,1,britishsuperbikes
36,0049,k_0050_label,Hobbies,49,1,interiorismo
37,0050,k_0050_label,Fashion,50,3,"spainreps, repbudgetfashion, repbudgetsneakers"
38,0050-0052-0060-0066-0070,k_0070_label,Hobbies,70,10,"heroquest, swlegion, middleearthminiatures, killteam, warhammerinstructions, marvelcrisisprotocol, knittingandcrochet, crochet_espanol, nintendostitch, numismatology"


### Check clusters that have mature topics

Update for v0.5.0: With the new modeling QA/filtering, there should be few or no mature clusters

In [48]:
# Flag clusters whose topic mix mentions "mature" (case-insensitive, literal match).
# `na=False` keeps the mask strictly boolean: a missing topic mix would otherwise
# produce NaN in the mask, and indexing a dataframe with such a mask raises.
mask_mature_clusters_ = (
    df_cluster_for_humans[col_new_cluster_topic_mix]
    .str.lower()
    .str.contains('mature', regex=False, na=False)
)
print(f"{mask_mature_clusters_.sum()} <- clusters with 'Mature' in topic mix")

0 <- clusters with 'Mature' in topic mix


In [49]:
# check the first few clusters
# Preview up to five of the flagged clusters (last column hidden)
df_cluster_for_humans.loc[mask_mature_clusters_].iloc[:5, :-1]

Unnamed: 0,cluster_label,cluster_label_k,cluster_topic_mix,cluster_label_int,subs_in_cluster_count,list_cluster_subreddit_names


In [50]:
# (
#     df_cluster_for_humans
#     [mask_mature_clusters_]
#     .iloc[-12:, :]
# )

In [51]:
# sorted(_L_MATURE_CLUSTERS_TO_EXCLUDE_FROM_QA_)

### Add the flag to exclude from QA & the list of sub names to df-raw

Update for v0.5.0:
<br>This QA already happened upstream in the QA table (see dashboard for details).

In [52]:
# Column name & marker value for the exclude-from-QA flag
col_exclude_from_qa, val_exclude_from_qa = 'exclude_from_qa', 'exclude from QA'

# Initialize the flag column as empty for every row
df_labels_target_dynamic_raw[col_exclude_from_qa] = ''


## Create new df_clean 

- Add list of subreddits to target-CLEAN, b/c we'll need it for rating final
- Add new columns & update order


### Copy baseline cols for clean

In [53]:
col_model_sort_order = 'model_sort_order'

# The helper returns two frames; the raw frame is rebound to its first output
(
    df_labels_target_dynamic_raw,
    df_labels_target_dynamic_clean,
) = create_dynamic_clusters_clean(
    df_dynamic_raw=df_labels_target_dynamic_raw,
    col_new_cluster_name=col_new_cluster_name,
    col_new_cluster_val=col_new_cluster_val,
    col_new_cluster_prim_topic=col_new_cluster_prim_topic,
    col_new_cluster_topic_mix=col_new_cluster_topic_mix,
    col_subreddit_topic_mix=col_subreddit_topic_mix,
)

# Orphan check for df-CLEAN: swap spaces in its column names for underscores
# before passing it to the summary helper
display(
    style_df_numeric(
        get_dynamic_cluster_summary(
            df_labels_target_dynamic_clean.rename(columns=lambda c: c.replace(' ', '_')),
            return_dict=False,
        ),
        rename_cols_for_display=True,
    )
    .set_caption("Summary for df-CLEAN")
)


# print(f"{len(l_cols_clean_final_for_qa)} <- expected final col count")
print(f"{df_labels_target_dynamic_raw.shape} <- df raw shape")
print(f"{df_labels_target_dynamic_clean.shape} <- df clean shape")

Unnamed: 0,cluster count,num orphan subreddits,num subreddits per cluster min,num subreddits per cluster mean,num subreddits per cluster median,num subreddits per cluster max,num clusters with mature primary topic,cluster ids with orphans
,,,,,,,,
0.0,39.0,6.0,1.0,5.77,5.0,16.0,0.0,"0037, 0040, 0041, 0045, 0048, 0049"


(225, 297) <- df raw shape
(225, 46) <- df clean shape


In [54]:
# [c for c in df_labels_target_dynamic_clean.columns if 'cluster' in c]

In [55]:
# df_labels_target_dynamic_clean.iloc[:5, 28:40]

In [56]:
# df_labels_target_dynamic_raw.iloc[:5, 30:40]

In [57]:
# df_labels_target_dynamic_clean.iloc[:5, -10:]

## Create target to target list for FPR example

In [47]:
%%time
print(datetime.utcnow())

# Build the target-to-target list for FPR. df-CLEAN is passed with the spaces
# in its column names swapped for underscores.
df_target_to_target_list = convert_distance_or_ab_to_list_for_fpr(
    df_labels_target_dynamic_clean.rename(columns=lambda c: c.replace(' ', '_')),
    convert_to_ab=True,
    col_counterpart_count='counterpart_count',
    col_list_cluster_names='list_cluster_subreddit_names',
    col_list_cluster_ids='list_cluster_subreddit_ids',
    l_cols_for_seeds=None,
    l_cols_for_clusters=None,
    col_new_cluster_val=col_new_cluster_val,
    col_new_cluster_name=col_new_cluster_name,
    col_new_cluster_prim_topic=col_new_cluster_prim_topic,
    verbose=True,
)
# The recorded output of this `%%time` cell shows no value for a trailing bare
# `.shape` expression, so print the shape explicitly instead.
print(f"{df_target_to_target_list.shape} <- df_target_to_target_list shape")

2022-07-15 16:07:07.056558
['subreddit_id', 'subreddit_name', 'model_sort_order', 'primary_topic', 'cluster_label', 'cluster_label_k', 'cluster_majority_primary_topic']
['subreddit_id', 'subreddit_name', 'cluster_label']
  (1945, 9) <- df_ab.shape raw
  (1728, 9) <- df_ab.shape after removing matches to self
  Groupby cols:
    ['model_sort_order', 'subreddit_id_seed', 'subreddit_name_seed', 'cluster_label', 'cluster_label_k', 'primary_topic', 'cluster_majority_primary_topic']
  (197, 9) <- df_a_to_b.shape
CPU times: user 27.7 ms, sys: 984 µs, total: 28.7 ms
Wall time: 30.7 ms


In [48]:
# df_target_to_target_list.drop('list_cluster_subreddit_ids', axis=1).iloc[:5, :11]

### Check missing subreddits

We expect some orphans, but are there more subs excluded than expected?

In [49]:
# Subreddits present in df-CLEAN that never appear as a seed in the FPR list
l_subs_missing_from_fpr = list(
    set(df_labels_target_dynamic_clean['subreddit name']) -
    set(df_target_to_target_list['subreddit_name_seed'])
)
# A bare `len(...)` in the middle of a cell is evaluated and discarded (only the
# last expression is displayed), so print the count to make it visible.
print(f"{len(l_subs_missing_from_fpr)} <- subreddits missing from FPR list")

# Show the first columns for the missing subreddits
(
    df_labels_target_dynamic_clean[
        df_labels_target_dynamic_clean['subreddit name'].isin(l_subs_missing_from_fpr)
    ].iloc[:, :9]
)

Unnamed: 0,subreddit id,subreddit name,cluster label int,cluster topic mix,not country relevant,rated E,relevant to cluster/ other subreddits in cluster,safe to show in relation to cluster,country relevance notes
5,t5_2v8rv,musicaenespanol,14,Music,,True,,,
14,t5_2sfpz,wirefoxterriers,15,Animals and Pets,,True,,,
34,t5_26div5,rangugamer,17,Internet Culture and Memes,,True,,,
54,t5_2z6wh,imaginarysports,22,Anime,,True,,,
58,t5_4viev2,silksong,23,Art,,True,,,
75,t5_2re6h,endlessocean,26,Gaming,,True,,,
123,t5_3lur8,tunicgame,150,Gaming,,True,,,
136,t5_4xj98s,gmecanada,153,"Business, Economics, and Finance",,True,,,
137,t5_4wjblz,nzgme,153,"Business, Economics, and Finance",,True,,,
175,t5_t1xw3,bancars,32,Place,,True,,,


# Export data

Note that we have to use `fillna('')`

Otherwise, we'll get errors because the gspread library doesn't know how to handle `np.nan` or `pd.NA` (nulls).

## Define variables to create/access google sheet doc & worksheets

Moved the definition to the top of the sheet so it's easier to automate.

## Save: Clean sheet to rate

In [50]:
# List the worksheet keys available in `d_wsh_names` before exporting
d_wsh_names.keys()

dict_keys(['qa_ready', 'clusters_t2t_fpr_raw', 'clusters_t2t_list_raw'])

In [51]:
%%time
print(datetime.utcnow())

# Write df-CLEAN to the 'qa_ready' worksheet: header row first, then the data
# rows with nulls replaced by empty strings
d_wsh_names['qa_ready']['worksheet'].update(
    [
        df_labels_target_dynamic_clean.columns.tolist(),
        *df_labels_target_dynamic_clean.fillna('').values.tolist(),
    ]
)

2022-07-15 16:07:07.224384
CPU times: user 14.8 ms, sys: 1.08 ms, total: 15.9 ms
Wall time: 658 ms


## Save: df cluster for humans

In [52]:
%%time
print(datetime.utcnow())

# Write the one-row-per-cluster table to its worksheet. The header row uses
# spaces instead of underscores; data rows have nulls replaced by ''.
d_wsh_names['clusters_t2t_list_raw']['worksheet'].update(
    [
        [c.replace('_', ' ') for c in df_cluster_for_humans.columns],
        *df_cluster_for_humans.fillna('').values.tolist(),
    ]
)

2022-07-15 16:07:07.913499
CPU times: user 7.59 ms, sys: 20 µs, total: 7.61 ms
Wall time: 244 ms


## Save: target raw dynamic


In [53]:
# Columns to leave out of the raw export: the creation date plus every
# column whose name ends in '_nested'
l_cols_to_drop = ['table_creation_date']
l_cols_to_drop.extend(
    c for c in df_labels_target_dynamic_raw.columns
    if c.endswith('_nested')
)
print(len(l_cols_to_drop))
# df_labels_target_dynamic_raw.columns.to_list()

133


In [54]:
# %%time
# print(datetime.utcnow())
# (
#     d_wsh_names['sub_raw']['worksheet']
#     .update([df_labels_target_dynamic_raw.drop(l_cols_to_drop, axis=1).columns.values.tolist()] + 
#              df_labels_target_dynamic_raw.drop(l_cols_to_drop, axis=1).fillna('').values.tolist())
# )

## Save: FPR target-2-target as list

Even though the data isn't fully ready, we want to have the output ready to make sure it's in the format that we need.

UPDATE: stop exporting this for now because it adds noise and could create confusion between it and the final QA sheet.

In [55]:
# First five rows, first eight columns of the FPR list
df_target_to_target_list.head(5).iloc[:, :8]

Unnamed: 0,subreddit_id_seed,subreddit_name_seed,cluster_label,cluster_label_k,primary_topic,cluster_majority_primary_topic,counterpart_count,list_cluster_subreddit_names
0,t5_3di14p,frankiewitchfingers,0013,k_0050_label,Music,Music,2,"primaverasound, madcoolfestival"
1,t5_2vznd,primaverasound,0013,k_0050_label,Music,Music,2,"madcoolfestival, frankiewitchfingers"
2,t5_9c154,madcoolfestival,0013,k_0050_label,Music,Music,2,"primaverasound, frankiewitchfingers"
3,t5_23z4s3,framehero,0014-0014,k_0052_label,Gaming,Music,9,"weirdspotifyplaylists, tameimpala, heardle, pcmusic, reggaeton, flamenco, untaggedbeats, operacirclejerk, musicaenespanol"
4,t5_2qoa7,flamenco,0014-0014,k_0052_label,Music,Music,9,"weirdspotifyplaylists, tameimpala, heardle, pcmusic, reggaeton, framehero, untaggedbeats, operacirclejerk, musicaenespanol"


In [56]:
%%time
print(datetime.utcnow())

# NOTE(review): the markdown for this section says this export was stopped,
# but this cell still writes to the worksheet -- confirm which is intended.
# Header row first, then data rows with nulls replaced by ''.
d_wsh_names['clusters_t2t_fpr_raw']['worksheet'].update(
    [
        df_target_to_target_list.columns.tolist(),
        *df_target_to_target_list.fillna('').values.tolist(),
    ]
)

2022-07-15 16:07:08.310524
CPU times: user 10.7 ms, sys: 1.93 ms, total: 12.7 ms
Wall time: 493 ms


### We can read the data back to confirm it's as expected

In [57]:
# Read the 'qa_ready' worksheet back as a dataframe and show a small slice
(
    pd.DataFrame(d_wsh_names['qa_ready']['worksheet'].get_all_records())
    .head(5)
    .iloc[:, :15]
)

Unnamed: 0,subreddit id,subreddit name,cluster label int,cluster topic mix,not country relevant,rated E,relevant to cluster/ other subreddits in cluster,safe to show in relation to cluster,country relevance notes,rating or cluster notes,link to sub,subs in cluster count,list cluster subreddit names,posts for modeling count,users l7
0,t5_2vznd,primaverasound,13,Music,,True,,,,,www.reddit.com/r/primaverasound,,,525,5192
1,t5_9c154,madcoolfestival,13,Music,,True,,,,,www.reddit.com/r/madcoolfestival,,,77,1285
2,t5_3di14p,frankiewitchfingers,13,Music,,True,,,,,www.reddit.com/r/frankiewitchfingers,,,7,60
3,t5_4pzwar,weirdspotifyplaylists,14,Music,,True,,,,,www.reddit.com/r/weirdspotifyplaylists,,,330,28311
4,t5_2t1l9,tameimpala,14,Music,,True,,,,,www.reddit.com/r/tameimpala,,,549,24779


# Appendix


## Additional checks on cluster depth

In [58]:
# Number of distinct cluster labels in the raw dynamic table
print(df_labels_target_dynamic_raw['cluster_label'].nunique())

# Subreddit counts for the top 10 cluster labels
display(
    value_counts_and_pcts(
        df_labels_target_dynamic_raw,
        ['cluster_label'],
        count_type='subreddit',
        rename_cols_for_display=True,
        top_n=10,
    )
)

# Summary statistics of the per-cluster counts (no top-n limit)
(
    value_counts_and_pcts(
        df_labels_target_dynamic_raw,
        ['cluster_label'],
        return_df=True,
        top_n=None,
    )
    ['count']
    .describe()
)

37


Unnamed: 0_level_0,subreddit count,percent of subreddit,cumulative percent of subreddit
cluster label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0017-0017-0021-0022-0023-0025-0026-0029-0030-0031-0038-0040-0047-0053-0058-0080-0090-0122-0149-0181-0182,16,7.4%,7.4%
0026-0026-0031-0034-0036-0038-0040-0045-0047,12,5.5%,12.9%
0031-0032-0038-0041-0044-0047-0049,12,5.5%,18.4%
0027-0027-0033-0036-0038-0041-0043-0048-0050-0053-0064-0067-0078-0086-0097-0133-0150,11,5.1%,23.5%
0038-0039-0046-0050-0053-0057-0059-0068-0070-0074-0093-0098-0114-0130-0150-0200-0228-0305-0375-0451-0453-0529-0598-0626-0665-0745-0749-0938-1084-1126-1316-1449-1517-1533-1705-1822-1909-2101-2156-2300-2386-2504-2632-2713-2913-2921-3022-3028-3099-3157-3292,11,5.1%,28.6%
0031-0032-0038-0041-0044-0047-0049-0055-0057-0060-0071-0076-0089-0100-0113-0154-0177-0238-0291-0350-0352-0410-0463-0487-0516-0580-0584-0730-0840-0870-1017-1119-1176-1190,11,5.1%,33.6%
0028-0028-0034-0037-0039-0042-0044-0049-0051-0054-0065-0068-0079-0087-0098-0135-0153,11,5.1%,38.7%
0027-0027-0033-0036-0038-0040-0042-0047-0049-0052-0062-0065-0076-0084-0095-0131-0147-0194-0241-0289-0290-0337,11,5.1%,43.8%
0014-0014,10,4.6%,48.4%
0027-0027-0033-0036-0038-0040-0042-0047-0049-0052-0062-0065-0076-0084-0095-0131,10,4.6%,53.0%


count    37.000000
mean      5.864865
std       4.321536
min       1.000000
25%       2.000000
50%       5.000000
75%      10.000000
max      16.000000
Name: count, dtype: float64

### How deep are the clusters?



In [59]:
# Number of distinct `k` levels at which subreddits got their final cluster
print(df_labels_target_dynamic_raw[col_new_cluster_name].nunique())

# Subreddit counts per `k` level, sorted by index
value_counts_and_pcts(
    df_labels_target_dynamic_raw,
    [col_new_cluster_name],
    count_type='subreddit',
    rename_cols_for_display=True,
    sort_index=True,
    top_n=None,
)

11


Unnamed: 0_level_0,subreddit count,percent of subreddit,cumulative percent of subreddit
cluster label k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
k_0050_label,82,37.8%,37.8%
k_0052_label,10,4.6%,42.4%
k_0070_label,10,4.6%,47.0%
k_0080_label,12,5.5%,52.5%
k_0094_label,22,10.1%,62.7%
k_0266_label,10,4.6%,67.3%
k_0300_label,22,10.1%,77.4%
k_0603_label,16,7.4%,84.8%
k_0700_label,11,5.1%,89.9%
k_2016_label,11,5.1%,94.9%


In [60]:
# style_df_numeric(
#     df_labels_target.tail(10),
#     # rename_cols_for_display=True,
#     l_bar_simple=[c for c in df_labels_target.columns if '_label' in c]
# )