# Purpose

### 2022-07-18
In this notebook we'll run a batch of countries through a new FPR output process. 
Instead of saving data to google sheets, we'll save:
- FPR outputs to a GCS bucket (JSON)
- FPR summary to BigQuery
- FPR details to BigQuery

We can then use the summary & details in a Mode dashboard for inspection (if needed).

See this dashboard for more information about the model coverage & filters.
https://app.mode.com/reddit/reports/b99c94984018


# Imports & notebook setup

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported library
#  code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Register bigquery magic (only needed for laptop/local, not colab)
# %load_ext google.cloud.bigquery

In [2]:
# colab auth for BigQuery, google drive, & google sheets (gspread)
from google.colab import auth, files, drive
from google.auth import default
import sys  # need sys for mounting gdrive path

# Authenticate the current Colab user.
#  The confirmation below is only printed if the call returns without raising.
auth.authenticate_user()
print('Authenticated')

Authenticated


## Install custom library

### Append google drive path so we can install library from there

In [3]:
# Attach google drive & make the custom library importable from it.
# If drive.mount() fails, you can also:
#   MANUALLY CLICK ON "Mount Drive"
import sys


g_drive_root = '/content/drive'

try:
    drive.mount(g_drive_root, force_remount=True)
    print('   Authenticated & mounted Google Drive')

except Exception as mount_error:
    # The public mount call failed: retry once with the private helper before giving up.
    # Each handler uses its own variable name so the first error is not shadowed.
    try:
        drive._mount(g_drive_root, force_remount=True)
        print('   Authenticated & mounted Google Drive')
    except Exception as fallback_error:
        print(fallback_error)
        # Chain explicitly so the traceback shows the underlying mount failure as the cause
        raise Exception(
            'You might need to manually mount google drive to colab'
        ) from fallback_error

l_paths_to_append = [
    f'{g_drive_root}/MyDrive/Colab Notebooks',

    # need to append the path to subclu so that colab can import things properly
    f'{g_drive_root}/MyDrive/Colab Notebooks/subreddit_clustering_i18n'
]
for path_ in l_paths_to_append:
    # Drop an existing entry first so re-running this cell keeps a single
    #  copy of each path, placed at the end of sys.path
    if path_ in sys.path:
        sys.path.remove(path_)
    print(f" Appending path: {path_}")
    sys.path.append(path_)

Mounted at /content/drive
   Authenticated & mounted Google Drive
 Appending path: /content/drive/MyDrive/Colab Notebooks
 Appending path: /content/drive/MyDrive/Colab Notebooks/subreddit_clustering_i18n


### Install library

In [4]:
# install subclu & libraries needed to read parquet files from GCS & spreadsheets
#  make sure to use the [colab] `extra` because it includes colab-specific libraries
# NOTE: the path contains a space ("Colab Notebooks"), so it needs to stay quoted
#  when it is handed to the shell in the pip command that follows
module_path = f"{g_drive_root}/MyDrive/Colab Notebooks/subreddit_clustering_i18n/[colab]"

!pip install -e $"$module_path" --quiet

## Regular Imports

In [5]:
import os
from datetime import datetime

from google.cloud import bigquery

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib_venn import venn2_unweighted, venn3_unweighted
from tqdm import tqdm


# Set the GCP project through an environment variable; the commented-out line is an alternate project.
# NOTE(review): presumably read by the google.cloud clients as their default project — confirm
# os.environ['GOOGLE_CLOUD_PROJECT'] = 'data-science-prod-218515'
os.environ['GOOGLE_CLOUD_PROJECT'] = 'data-prod-165221'

## Custom imports

In [123]:
# subclu imports
import subclu
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric, reorder_array,
)
from subclu.models.clustering_utils import (
    create_dynamic_clusters,
    convert_distance_or_ab_to_list_for_fpr,
    get_primary_topic_mix_cols,
)
from subclu.utils.hydra_config_loader import LoadHydraConfig
from subclu.models.reshape_clusters_v050 import (
    get_table_for_optimal_dynamic_cluster_params,
    CreateFPRs,
)

# Notebook setup with the subclu helpers imported above.
#  The last call prints the python, pandas & numpy versions (see the cell output).
setup_logging()
notebook_display_config()
print_lib_versions([pd, np])

python		v 3.7.13
===
pandas		v: 1.3.5
numpy		v: 1.21.6


# Checklist to re-run FPRs

With this new process the main diff between batches would be:
- Update list of countries to run (list of 1+ country_codes)
- Update date of `qa_pt`
- Update path to save outputs (in GCS)


## Load configurations

In [130]:
# Load the FPR config for the mainly English-speaking countries.
# Uncomment `overrides` to change individual values without editing the YAML file.
cfg_fpr_en = LoadHydraConfig(
    config_path="../config",
    config_name='fpr_v050-2022-08-02-english_countries.yaml',
    # overrides=[
    #     f"target_countries={l_target_countries}",
    #     f"partition_dt=2022-07-28",
    # ],
)

# Print the key names first, then display the full config as the cell output
print(list(cfg_fpr_en.config_dict.keys()))
cfg_fpr_en.config_dict

['description', 'output_bucket', 'gcs_output_path', 'add_outputs_to_bq', 'cluster_labels_table', 'partition_dt', 'qa_table', 'qa_pt', 'geo_relevance_table', 'geo_min_users_percent_by_subreddit_l28', 'geo_min_country_standardized_relevance', 'target_countries']


{'add_outputs_to_bq': True,
 'cluster_labels_table': 'reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_full',
 'description': 'Base config to test FPR creation',
 'gcs_output_path': 'i18n_topic_model_batch/fpr/runs',
 'geo_min_country_standardized_relevance': 2.6,
 'geo_min_users_percent_by_subreddit_l28': 0.14,
 'geo_relevance_table': 'reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725',
 'output_bucket': 'i18n-subreddit-clustering',
 'partition_dt': '2022-08-02',
 'qa_pt': '2022-08-02',
 'qa_table': 'reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags',
 'target_countries': ['CA', 'GB', 'AU']}

In [148]:
# Load the FPR config for the mainly non-English-speaking countries.
# Uncomment `overrides` to change individual values without editing the YAML file.
cfg_fpr_not_en = LoadHydraConfig(
    config_path="../config",
    config_name='fpr_v050-2022-08-03-non_english.yaml',
    # overrides=[
    #     f"partition_dt=2022-07-28",
    # ],
)

# Print the key names first, then display the full config as the cell output
print(list(cfg_fpr_not_en.config_dict.keys()))
cfg_fpr_not_en.config_dict

['description', 'output_bucket', 'gcs_output_path', 'add_outputs_to_bq', 'cluster_labels_table', 'partition_dt', 'qa_table', 'qa_pt', 'geo_relevance_table', 'geo_min_users_percent_by_subreddit_l28', 'geo_min_country_standardized_relevance', 'target_countries']


{'add_outputs_to_bq': True,
 'cluster_labels_table': 'reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_full',
 'description': "Base config to test FPR creation. IE (Ireland) is here because it's a small(er) country",
 'gcs_output_path': 'i18n_topic_model_batch/fpr/runs',
 'geo_min_country_standardized_relevance': 2.4,
 'geo_min_users_percent_by_subreddit_l28': 0.14,
 'geo_relevance_table': 'reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725',
 'output_bucket': 'i18n-subreddit-clustering',
 'partition_dt': '2022-08-02',
 'qa_pt': '2022-08-02',
 'qa_table': 'reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags',
 'target_countries': ['IE',
  'AT',
  'CH',
  'BR',
  'MX',
  'AR',
  'CO',
  'CR',
  'PA',
  'RO',
  'NL',
  'GR',
  'BE',
  'PL',
  'TR',
  'PH',
  'SE',
  'NO',
  'DK',
  'FI',
  'ZA']}

In [150]:
# Sanity check: the countries in the loaded config should be exactly the
#  countries we intend to run in this batch
s_countries_old_config = set(cfg_fpr_not_en.config_dict['target_countries'])
print(len(s_countries_old_config))

s_countries_actual = {
    'MX', 'BR', 'TR', 'PH',
    'RO', 'GR', 'AT', 'IE', 'AR', 'CO',
    'BE', 'CH', 'PL', 'ZA', 'CR', 'PA',
    'SE', 'NO', 'DK', 'FI', 'NL',
    # 'ES', 'IT'  # IT & ES already done
}

print(len(s_countries_actual))
# In the intended list but missing from the config
print("countries to add\n  ", s_countries_actual.difference(s_countries_old_config))

# In the config but no longer in the intended list
print("countries to REMOVE\n  ", s_countries_old_config.difference(s_countries_actual))
print(s_countries_old_config == s_countries_actual)

21
21
countries to add
   set()
countries to REMOVE
   set()
True


# Run `create_fprs()` method

This method should do everything needed to create FPRs.

### Mainly English-speaking countries

In [52]:
%%time
# Every key in the English-countries config is passed to CreateFPRs as a keyword argument
fprs_en = CreateFPRs(**cfg_fpr_en.config_dict)

# Process every country listed in the config
fprs_en.create_fprs()

  0%|          | 0/3 [00:00<?, ?it/s]16:40:38 | INFO | "== Country: CA =="
16:40:38 | INFO | "Getting geo-relevant subreddits in model for CA..."
16:40:54 | INFO | " (2401, 160)  <- df_shape"
16:40:54 | INFO | " (2401, 161) <- Shape AFTER dropping subreddits with covid in title"
16:40:54 | INFO | "Finding optimal N (target # of subs per cluster)..."

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:09<00:39,  9.85s/it][A
 40%|████      | 2/5 [00:19<00:29,  9.77s/it][A
 60%|██████    | 3/5 [00:29<00:19,  9.85s/it][A
 80%|████████  | 4/5 [00:39<00:09,  9.87s/it][A
100%|██████████| 5/5 [00:49<00:00,  9.92s/it]
16:41:43 | INFO | "  8 <-- Optimal N"
16:41:44 | INFO | "Assigning clusters based on optimal N..."
16:41:54 | INFO | "Getting QA and summary at cluster_level..."
16:41:54 | INFO | "(224, 23)  <- df.shape full summary"
16:41:54 | INFO | "Adding metadata to df_top_level_summary..."
16:41:54 | INFO | "Creating FPR output..."
16:41:54 | INFO | "  (2363, 15) <- df_f

CPU times: user 2min 44s, sys: 2.56 s, total: 2min 47s
Wall time: 3min 25s


### Mainly non-English speaking countries

In [53]:
%%time
# Every key in the non-English config is passed to CreateFPRs as a keyword argument
fprs_non_en = CreateFPRs(**cfg_fpr_not_en.config_dict)

# Process every country listed in the config
fprs_non_en.create_fprs()

  0%|          | 0/27 [00:00<?, ?it/s]16:44:03 | INFO | "== Country: IN =="
16:44:03 | INFO | "Getting geo-relevant subreddits in model for IN..."
16:44:14 | INFO | " (799, 160)  <- df_shape"
16:44:14 | INFO | " (799, 161) <- Shape AFTER dropping subreddits with covid in title"
16:44:14 | INFO | "Finding optimal N (target # of subs per cluster)..."

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:19,  4.91s/it][A
 40%|████      | 2/5 [00:09<00:14,  4.87s/it][A
 60%|██████    | 3/5 [00:14<00:09,  4.93s/it][A
 80%|████████  | 4/5 [00:19<00:04,  4.94s/it][A
100%|██████████| 5/5 [00:24<00:00,  4.95s/it]
16:44:39 | INFO | "  6 <-- Optimal N"
16:44:39 | INFO | "Assigning clusters based on optimal N..."
16:44:44 | INFO | "Getting QA and summary at cluster_level..."
16:44:44 | INFO | "(108, 23)  <- df.shape full summary"
16:44:44 | INFO | "Adding metadata to df_top_level_summary..."
16:44:44 | INFO | "Creating FPR output..."
16:44:45 | INFO | "  (691, 15) <- df_fpr

CPU times: user 7min 55s, sys: 5.47 s, total: 8min
Wall time: 12min 19s


# QA manual checks

In [None]:
# NOTE(review): `BREAK` is not defined anywhere in the visible notebook, so this cell raises
#  NameError — presumably an intentional stop so "Run all" does not continue into the QA cells
BREAK

## TEST run on smaller countries & short list

Use this when checking that functions are running properly. Note that these will be saved to a `runs_test` path in GCS and they should have `add_outputs_to_bq=False`.

In [128]:
# Small test config: check that new code works before kicking off the full list of countries.
# The overrides limit the run to a single country and send outputs to the `runs_test` path.
# NOTE(review): the notes for this section say test runs should use `add_outputs_to_bq=False`,
#  but the override below sets it to True — confirm which one is intended.
# NOTE(review): this loads the 2022-08-02 YAML while the full non-English run loads
#  the 2022-08-03 file — confirm this is intended.
cfg_fpr_not_en_test = LoadHydraConfig(
    config_path="../config",
    config_name='fpr_v050-2022-08-02-non_english.yaml',
    overrides=[
        # "partition_dt=2022-08-02",
        "add_outputs_to_bq=True",
        "target_countries=['MX']",
        "gcs_output_path=i18n_topic_model_batch/fpr/runs_test",
    ],
)

# Print the key names first, then display the full config as the cell output
print("Config keys:\n  ", list(cfg_fpr_not_en_test.config_dict.keys()))
cfg_fpr_not_en_test.config_dict

Config keys:
   ['description', 'output_bucket', 'gcs_output_path', 'add_outputs_to_bq', 'cluster_labels_table', 'partition_dt', 'qa_table', 'qa_pt', 'geo_relevance_table', 'geo_min_users_percent_by_subreddit_l28', 'geo_min_country_standardized_relevance', 'target_countries']


{'add_outputs_to_bq': True,
 'cluster_labels_table': 'reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_full',
 'description': "Base config to test FPR creation. IE (Ireland) is here because it's a small(er) country",
 'gcs_output_path': 'i18n_topic_model_batch/fpr/runs_test',
 'geo_min_country_standardized_relevance': 2.4,
 'geo_min_users_percent_by_subreddit_l28': 0.14,
 'geo_relevance_table': 'reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725',
 'output_bucket': 'i18n-subreddit-clustering',
 'partition_dt': '2022-08-02',
 'qa_pt': '2022-08-02',
 'qa_table': 'reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags',
 'target_countries': ['MX']}

In [129]:
%%time
# Every key in the test config is passed to CreateFPRs as a keyword argument
fprs_test = CreateFPRs(**cfg_fpr_not_en_test.config_dict)

# Process every country listed in the test config
fprs_test.create_fprs()

  0%|          | 0/1 [00:00<?, ?it/s]21:36:36 | INFO | "== Country: MX =="
21:36:36 | INFO | "Getting geo-relevant subreddits in model for MX..."
21:36:38 | INFO | " (245, 160)  <- df_shape"
21:36:38 | INFO | " (245, 161) <- Shape AFTER dropping subreddits with covid in title"
21:36:38 | INFO | "Finding optimal N (target # of subs per cluster)..."

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:11,  2.80s/it][A
 40%|████      | 2/5 [00:05<00:08,  2.76s/it][A
 60%|██████    | 3/5 [00:08<00:05,  2.75s/it][A
 80%|████████  | 4/5 [00:11<00:02,  2.76s/it][A
100%|██████████| 5/5 [00:13<00:00,  2.75s/it]
21:36:52 | INFO | "  8 <-- Optimal N"
21:36:52 | INFO | "Assigning clusters based on optimal N..."
21:36:55 | INFO | "Getting QA and summary at cluster_level..."
21:36:55 | INFO | "(39, 23)  <- df.shape full summary"
21:36:55 | INFO | "Adding metadata to df_top_level_summary..."
21:36:55 | INFO | "Creating FPR output..."
21:36:55 | INFO | "  (210, 15) <- df_fpr.s

CPU times: user 17.4 s, sys: 181 ms, total: 17.5 s
Wall time: 28.3 s


### Check test outputs

In [None]:
# Show the output bucket and the df_fpr output path recorded on the test run object
print(fprs_test.output_bucket)
print(fprs_test.gcs_output_path_df_fpr)

In [None]:
# List the keys of the test outputs (a country code such as 'MX' is used as a key below)
fprs_test.fpr_outputs.keys()

In [None]:
# fprs_test.fpr_outputs['MX']['df_fpr'].info()

In [81]:
# fprs_test.fpr_outputs['MX']['df_fpr'].columns.to_list()

In [82]:
# Preview the in-memory FPR dataframe for MX, including its list columns
fprs_test.fpr_outputs['MX']['df_fpr'].head()

Unnamed: 0,run_id,geo_country_code,country_name,subreddit_id_seed,subreddit_name_seed,subreddits_to_rec_count,cluster_subreddit_names_list,cluster_subreddit_ids_list,cluster_label,cluster_label_k,pt,qa_pt,cluster_label_int,qa_table,geo_relevance_table
16,2022-08-03_202826,MX,Mexico,t5_2csckx,cuartetodenos,6,"[cumbia, latinpopheads, dannyelfman, homeshake, marsargo, themarsvolta]","[t5_2x1po, t5_wfesz, t5_31291, t5_33h9w, t5_3brig, t5_2sdzc]",0014-0014-0017,k_0060_label,2022-08-02,2022-08-02,17,reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags,reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725
126,2022-08-03_202826,MX,Mexico,t5_3brig,marsargo,5,"[cumbia, latinpopheads, dannyelfman, homeshake, themarsvolta]","[t5_2x1po, t5_wfesz, t5_31291, t5_33h9w, t5_2sdzc]",0014-0014-0017,k_0060_label,2022-08-02,2022-08-02,17,reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags,reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725
92,2022-08-03_202826,MX,Mexico,t5_2v8rv,musicaenespanol,6,"[cumbia, latinpopheads, dannyelfman, homeshake, marsargo, themarsvolta]","[t5_2x1po, t5_wfesz, t5_31291, t5_33h9w, t5_3brig, t5_2sdzc]",0014-0014-0017,k_0060_label,2022-08-02,2022-08-02,17,reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags,reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725
206,2022-08-03_202826,MX,Mexico,t5_wfesz,latinpopheads,5,"[cumbia, dannyelfman, homeshake, marsargo, themarsvolta]","[t5_2x1po, t5_31291, t5_33h9w, t5_3brig, t5_2sdzc]",0014-0014-0017,k_0060_label,2022-08-02,2022-08-02,17,reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags,reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725
110,2022-08-03_202826,MX,Mexico,t5_31291,dannyelfman,5,"[cumbia, latinpopheads, homeshake, marsargo, themarsvolta]","[t5_2x1po, t5_wfesz, t5_33h9w, t5_3brig, t5_2sdzc]",0014-0014-0017,k_0060_label,2022-08-02,2022-08-02,17,reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags,reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725


In [89]:
# %%time
# for name_, df_ in fprs_test.fpr_outputs['MX'].items():
#     print(f"\n{name_}")
#     try:
#         print(df_.shape)
#         if 'df_labels_target' == name_:
#             continue
#         display(df_.iloc[:5, :15])
#     except Exception as e:
#         print(f"  {df_.keys()}")

## QA load data from QA file to make sure list is still nested

Looks like BQ isn't reading the nested lists at all (they're all null). Is it an error in BQ or in saving the parquet file?

In [83]:
%%time
from dask import dataframe as dd
import pyarrow as pa

# Read the df_fpr parquet files from the test run (run_id 2022-08-03_202826) straight
#  from GCS, so the list columns can be checked independently of BigQuery.
# NOTE: the run_id is hard-coded; update it when checking a different run.
bucket = 'i18n-subreddit-clustering'
df_fpr_test = dd.read_parquet(
    f"gs://{bucket}/i18n_topic_model_batch/fpr/runs_test/2022-08-03_202826/df_fpr/*.parquet"
).compute()

print(df_fpr_test.shape)

(210, 15)
CPU times: user 471 ms, sys: 33.6 ms, total: 505 ms
Wall time: 997 ms


In [84]:
# Check dtypes & non-null counts of the data read back from parquet
df_fpr_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   run_id                        210 non-null    object
 1   geo_country_code              210 non-null    object
 2   country_name                  210 non-null    object
 3   subreddit_id_seed             210 non-null    object
 4   subreddit_name_seed           210 non-null    object
 5   subreddits_to_rec_count       210 non-null    int64 
 6   cluster_subreddit_names_list  210 non-null    object
 7   cluster_subreddit_ids_list    210 non-null    object
 8   cluster_label                 210 non-null    object
 9   cluster_label_k               210 non-null    object
 10  pt                            210 non-null    object
 11  qa_pt                         210 non-null    object
 12  cluster_label_int             210 non-null    int64 
 13  qa_table            

In [85]:
# Eyeball the list columns to confirm they still hold nested values after the parquet round trip
df_fpr_test.head()

Unnamed: 0,run_id,geo_country_code,country_name,subreddit_id_seed,subreddit_name_seed,subreddits_to_rec_count,cluster_subreddit_names_list,cluster_subreddit_ids_list,cluster_label,cluster_label_k,pt,qa_pt,cluster_label_int,qa_table,geo_relevance_table
0,2022-08-03_202826,MX,Mexico,t5_2csckx,cuartetodenos,6,"[cumbia, latinpopheads, dannyelfman, homeshake, marsargo, themarsvolta]","[t5_2x1po, t5_wfesz, t5_31291, t5_33h9w, t5_3brig, t5_2sdzc]",0014-0014-0017,k_0060_label,2022-08-02,2022-08-02,17,reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags,reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725
1,2022-08-03_202826,MX,Mexico,t5_3brig,marsargo,5,"[cumbia, latinpopheads, dannyelfman, homeshake, themarsvolta]","[t5_2x1po, t5_wfesz, t5_31291, t5_33h9w, t5_2sdzc]",0014-0014-0017,k_0060_label,2022-08-02,2022-08-02,17,reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags,reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725
2,2022-08-03_202826,MX,Mexico,t5_2v8rv,musicaenespanol,6,"[cumbia, latinpopheads, dannyelfman, homeshake, marsargo, themarsvolta]","[t5_2x1po, t5_wfesz, t5_31291, t5_33h9w, t5_3brig, t5_2sdzc]",0014-0014-0017,k_0060_label,2022-08-02,2022-08-02,17,reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags,reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725
3,2022-08-03_202826,MX,Mexico,t5_wfesz,latinpopheads,5,"[cumbia, dannyelfman, homeshake, marsargo, themarsvolta]","[t5_2x1po, t5_31291, t5_33h9w, t5_3brig, t5_2sdzc]",0014-0014-0017,k_0060_label,2022-08-02,2022-08-02,17,reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags,reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725
4,2022-08-03_202826,MX,Mexico,t5_31291,dannyelfman,5,"[cumbia, latinpopheads, homeshake, marsargo, themarsvolta]","[t5_2x1po, t5_wfesz, t5_33h9w, t5_3brig, t5_2sdzc]",0014-0014-0017,k_0060_label,2022-08-02,2022-08-02,17,reddit-employee-datasets.david_bermejo.subclu_v0050_subreddit_clusters_c_qa_flags,reddit-employee-datasets.david_bermejo.subclu_subreddit_relevance_beta_20220725


## QA - Check dtype of parquet file for FPR summary

This table should have summary counts & list of some subreddits to check.

In [9]:
%%time
from dask import dataframe as dd
import pyarrow as pa

# Read the QA summary parquet files for run_id 2022-07-30_005122 from GCS.
# NOTE: this reads from the `runs` path (not `runs_test`) and the run_id is hard-coded.
bucket = 'i18n-subreddit-clustering'
df_fpr_qa_summary = dd.read_parquet(
    f"gs://{bucket}/i18n_topic_model_batch/fpr/runs/2022-07-30_005122/df_fpr_qa_summary/*.parquet"
).compute()

print(df_fpr_qa_summary.shape)

(3, 25)
CPU times: user 1 s, sys: 78.9 ms, total: 1.08 s
Wall time: 6.01 s


In [None]:
# df_fpr_qa_summary.head()

In [10]:
# Check dtypes & non-null counts of the QA summary as pandas sees it
df_fpr_qa_summary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 0 to 0
Data columns (total 25 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   pt                                          3 non-null      object
 1   geo_relevance_table                         3 non-null      object
 2   qa_pt                                       3 non-null      object
 3   qa_table                                    3 non-null      object
 4   run_id                                      3 non-null      object
 5   geo_country_code                            3 non-null      object
 6   country_name                                3 non-null      object
 7   relevant_subreddit_id_count                 3 non-null      int64 
 8   bucket                                      3 non-null      object
 9   path_fpr_json                               3 non-null      object
 10  path_df_fpr                   

In [11]:
# Let pyarrow infer an arrow schema from the pandas dataframe, then display it
pd_schema = pa.Schema.from_pandas(df_fpr_qa_summary)
pd_schema

pt: date32[day]
geo_relevance_table: string
qa_pt: string
qa_table: string
run_id: string
geo_country_code: string
country_name: string
relevant_subreddit_id_count: int64
bucket: string
path_fpr_json: string
path_df_fpr: string
path_df_fpr_qa_summary: string
path_df_fpr_cluster_summary: string
seed_subreddit_ids: list<item: string>
  child 0, item: string
seed_subreddit_ids_count: int64
recommend_subreddit_ids: list<item: string>
  child 0, item: string
recommend_subreddit_ids_count: int64
orphan_or_exclude_seed_subreddit_ids_list: list<item: string>
  child 0, item: string
orphan_or_exclude_seed_subreddit_ids_count: int64
orphan_seed_subreddit_ids_list: list<item: string>
  child 0, item: string
orphan_seed_subreddit_ids_count: int64
orphan_recommend_subreddit_ids_list: list<item: string>
  child 0, item: string
orphan_recommend_subreddit_ids_count: int64
clusters_total: int64
clusters_with_recommendations: int64
__index_level_0__: int64
-- schema metadata --
pandas: '{"index_columns"

In [13]:
# List the schema-level metadata keys
pd_schema.metadata.keys()

dict_keys([b'pandas'])

In [18]:
# Raw bytes stored under the b'pandas' key (JSON describing the index & columns, per the output)
pd_schema.metadata[b'pandas']

b'{"index_columns": ["__index_level_0__"], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "pt", "field_name": "pt", "pandas_type": "date", "numpy_type": "object", "metadata": null}, {"name": "geo_relevance_table", "field_name": "geo_relevance_table", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "qa_pt", "field_name": "qa_pt", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "qa_table", "field_name": "qa_table", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "run_id", "field_name": "run_id", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "geo_country_code", "field_name": "geo_country_code", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "country_name", "field_name": "country_name", "pandas_type": "unicode", "numpy_type": 

In [16]:
# Plain-text version of the inferred schema
print(pd_schema.to_string())

pt: date32[day]
geo_relevance_table: string
qa_pt: string
qa_table: string
run_id: string
geo_country_code: string
country_name: string
relevant_subreddit_id_count: int64
bucket: string
path_fpr_json: string
path_df_fpr: string
path_df_fpr_qa_summary: string
path_df_fpr_cluster_summary: string
seed_subreddit_ids: list<item: string>
  child 0, item: string
seed_subreddit_ids_count: int64
recommend_subreddit_ids: list<item: string>
  child 0, item: string
recommend_subreddit_ids_count: int64
orphan_or_exclude_seed_subreddit_ids_list: list<item: string>
  child 0, item: string
orphan_or_exclude_seed_subreddit_ids_count: int64
orphan_seed_subreddit_ids_list: list<item: string>
  child 0, item: string
orphan_seed_subreddit_ids_count: int64
orphan_recommend_subreddit_ids_list: list<item: string>
  child 0, item: string
orphan_recommend_subreddit_ids_count: int64
clusters_total: int64
clusters_with_recommendations: int64
__index_level_0__: int64
-- schema metadata --
pandas: '{"index_columns"

In [24]:
# Explicit arrow schema for the FPR QA summary table.
# Each entry is (column name, arrow type, description). The field list and the
#  schema-level metadata are both derived from this one list, so the column
#  names in the two places cannot drift apart.
# Unlike the schema inferred from pandas, this one has no `__index_level_0__` field.
l_fpr_qa_fields = [
    ('pt', pa.date32(),
     'Partition time for subreddit_lookup & crowdsourced topic & ratings'),
    ('geo_relevance_table', pa.string(),
     'Name of table used for geo-relevance scores'),
    ('qa_pt', pa.string(),
     'PT for QA decisions'),
    ('qa_table', pa.string(),
     'Table with QA logic'),
    ('run_id', pa.string(),
     'Run ID (timestamp) for this FPR'),
    ('geo_country_code', pa.string(),
     'Geo country code for relevance & FPR'),
    ('country_name', pa.string(),
     'Country name (based on geo_country_code)'),
    ('relevant_subreddit_id_count', pa.int64(),
     'Num of subreddits relevant to country'),
    ('bucket', pa.string(),
     'Bucket with FPR output'),
    ('path_fpr_json', pa.string(),
     'Path for FPR JSON output'),
    ('path_df_fpr', pa.string(),
     'Path for dataframe with FPR output'),
    ('path_df_fpr_qa_summary', pa.string(),
     'Path for dataframe with FPR summary (this table!)'),
    ('path_df_fpr_cluster_summary', pa.string(),
     'Path for df with cluster-level summary'),
    ('seed_subreddit_ids', pa.list_(pa.string()),
     'List of seed IDs. If user subscribes to a SEED we will rec posts from "recommend list"'),
    ('seed_subreddit_ids_count', pa.int64(),
     'Count of seed subreddit IDs. Note: EXCLUDES orphans.'),
    ('recommend_subreddit_ids', pa.list_(pa.string()),
     'List of subreddit IDs to recommend in country FPR'),
    ('recommend_subreddit_ids_count', pa.int64(),
     'Count of subreddits to recommend in country FPR. NOTE: excludes orphans & do not recommend'),
    ('orphan_or_exclude_seed_subreddit_ids_list', pa.list_(pa.string()),
     'List of orphan or exclude SEED subreddit IDs'),
    # Description typos fixed here ('oprhan' -> 'orphan', 'subredit' -> 'subreddit')
    ('orphan_or_exclude_seed_subreddit_ids_count', pa.int64(),
     'Count of orphan or exclude SEED subreddit IDs'),
    ('orphan_seed_subreddit_ids_list', pa.list_(pa.string()),
     "List of subreddits that are by themselves or can't be recommended to something else"),
    ('orphan_seed_subreddit_ids_count', pa.int64(),
     "Count of subreddits that can't be recommended"),
    ('orphan_recommend_subreddit_ids_list', pa.list_(pa.string()),
     "List of subreddits that can be recommended, but are orphaned"),
    ('orphan_recommend_subreddit_ids_count', pa.int64(),
     "Count of subreddits that can be recommended but are by themselves"),
    ('clusters_total', pa.int64(),
     "Total num of clusters in country"),
    ('clusters_with_recommendations', pa.int64(),
     "Num of clusters that have 1+ subreddits to recommend"),
]

fpr_qa_schema = pa.schema(
    [(name_, type_) for name_, type_, _ in l_fpr_qa_fields],
    # Descriptions are stored as schema-level metadata, keyed by column name
    metadata={name_: description_ for name_, _, description_ in l_fpr_qa_fields},
)
fpr_qa_schema

pt: date32[day]
geo_relevance_table: string
qa_pt: string
qa_table: string
run_id: string
geo_country_code: string
country_name: string
relevant_subreddit_id_count: int64
bucket: string
path_fpr_json: string
path_df_fpr: string
path_df_fpr_qa_summary: string
path_df_fpr_cluster_summary: string
seed_subreddit_ids: list<item: string>
  child 0, item: string
seed_subreddit_ids_count: int64
recommend_subreddit_ids: list<item: string>
  child 0, item: string
recommend_subreddit_ids_count: int64
orphan_or_exclude_seed_subreddit_ids_list: list<item: string>
  child 0, item: string
orphan_or_exclude_seed_subreddit_ids_count: int64
orphan_seed_subreddit_ids_list: list<item: string>
  child 0, item: string
orphan_seed_subreddit_ids_count: int64
orphan_recommend_subreddit_ids_list: list<item: string>
  child 0, item: string
orphan_recommend_subreddit_ids_count: int64
clusters_total: int64
clusters_with_recommendations: int64
-- schema metadata --
pt: 'Partition time for subreddit_lookup & crowdso

In [26]:
# Write the summary with the explicit schema to check that pandas/pyarrow accept it.
# NOTE: this creates `test_df.parquet` in the current working directory.
df_fpr_qa_summary.to_parquet('test_df.parquet', engine='pyarrow', schema=fpr_qa_schema)

In [71]:
# Inspect only the columns whose name contains '_list'.
#  (Other list-typed columns without '_list' in the name are not selected.)
df_fpr_qa_summary.loc[
    :, [col_ for col_ in df_fpr_qa_summary.columns if '_list' in col_]
].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 0 to 0
Data columns (total 3 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   orphan_or_exclude_seed_subreddit_ids_list  3 non-null      object
 1   orphan_seed_subreddit_ids_list             3 non-null      object
 2   orphan_recommend_subreddit_ids_list        3 non-null      object
dtypes: object(3)
memory usage: 96.0+ bytes


In [72]:
# Show the country identifiers next to every orphan-related column
df_fpr_qa_summary.loc[
    :,
    ['geo_country_code', 'country_name']
    + [col_ for col_ in df_fpr_qa_summary.columns if 'orphan' in col_]
    # + [col_ for col_ in df_fpr_qa_summary.columns if '_list' in col_]
]

Unnamed: 0,geo_country_code,country_name,orphan_or_exclude_seed_subreddit_ids_list,orphan_or_exclude_seed_subreddit_ids_count,orphan_seed_subreddit_ids_list,orphan_seed_subreddit_ids_count,orphan_recommend_subreddit_ids_list,orphan_recommend_subreddit_ids_count
0,AU,Australia,"[t5_2g1csa, t5_6h8k85, t5_2zbdz, t5_3j7kr, t5_446cgi, t5_2g1c73, t5_iioxt, t5_11jldg, t5_398gpn, t5_2p1ob9, t5_2tr5t, t5_2en8q7, t5_3a05c, t5_3lbmec, t5_2xmrc, t5_5h559x, t5_2w2p3, t5_2sz2k, t5_2sa9a, t5_6kspp0, t5_4wzn99, t5_3mh4d, t5_...",47,"[t5_2r434, t5_vv65o, t5_2saa4f, t5_5h559x, t5_3lbmec, t5_3465r, t5_6kspp0, t5_11jldg, t5_3j7iv, t5_3j7kr, t5_398gpn, t5_3mh4d, t5_2zbdz, t5_676p57, t5_3bwpc, t5_446cgi, t5_2p1ob9, t5_30fmq, t5_43efq4, t5_2xmrc, t5_2g1c73, t5_2g1csa, t5_...",42,"[t5_3lbmec, t5_2tr5t]",2
0,CA,Canada,"[t5_2g1csa, t5_2znli, t5_2qnub, t5_2dhvn5, t5_4ppwwv, t5_ybv6h, t5_10buou, t5_2rg21, t5_2rgdp, t5_2uqcm, t5_2sc6k, t5_2qh2a, t5_2rdee, t5_2s91q, t5_5agq0s, t5_e0m2r, t5_2yiq3, t5_2sja0, t5_3iy4z, t5_2zqxq, t5_2xc09b, t5_2qqjc, t5_2qh5i,...",38,"[t5_61so49, t5_4pb52f, t5_58qzn6, t5_j3ec6, t5_2sba1, t5_10buou, t5_2zqxq, t5_2tycb, t5_2qnub, t5_2rg21, t5_2rgdp, t5_55tcfe, t5_2qqjc, t5_ybv6h, t5_2s907, t5_2fi7fh, t5_2zenb, t5_2qh5i, t5_2v0p8, t5_3opn9, t5_2dhvn5, t5_2uqcm, t5_uyeh3...",29,[t5_2xc09b],1
0,GB,United Kingdom,"[t5_3085di, t5_2zpr4w, t5_3e2g7, t5_5omuri, t5_trgqh, t5_2s3ot, t5_4ppwwv, t5_3a9abr, t5_3fs3z, t5_3fw8s, t5_2t218, t5_313vp, t5_58lqpe, t5_3oz0v, t5_305lh, t5_2s16v, t5_2sdh1, t5_3p6ug, t5_378bu, t5_2ti58, t5_3n16h, t5_358dh, t5_31ma0,...",94,"[t5_2s4gm, t5_2t92d, t5_2ti58, t5_2ud6j, t5_6ayxug, t5_h667d, t5_r9j56, t5_3fs3z, t5_4bh6f, t5_257lcm, t5_3bayia, t5_3e2g7, t5_2qhkm, t5_gd2e1, t5_x2m26, t5_2sbyc, t5_2tz24, t5_2rlzle, t5_2sk9w, t5_2t218, t5_2zpr4w, t5_39e4g, t5_58lqpe,...",68,[],0
