# Purpose

### 2023-05-16

Get labeled data to train push-notification models.  Here we'll run a loop to run the same queries over different time horizons.


# Imports & notebook setup

In [1]:
%load_ext google.colab.data_table
%load_ext autoreload
%autoreload 2

In [38]:
# colab auth for BigQuery, google drive, & google sheets (gspread)
from google.colab import auth, files, drive
from google.auth import default


auth.authenticate_user()
print('Authenticated')

Authenticated


### General Imports


In [46]:
# Regular Imports
from datetime import datetime
import logging
import os
import string

from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from tqdm import tqdm
import pandas as pd
import gspread


# Module-level logger for this notebook
logger = logging.getLogger(__name__)


# Authorized gspread client used by the sheet helper functions below.
# `default()` comes from `google.auth` (imported in the auth cell); its first
# element is the credentials object created by the Colab login.
gc = gspread.authorize(default()[0])


# Set env variable needed by some libraries to get data from BigQuery.
# NOTE(review): hard-coded project ID -- confirm it is the intended project
# before running this notebook from a different environment.
os.environ['GOOGLE_CLOUD_PROJECT'] = 'data-prod-165221'

# Official `gspread` Documentation
See here for official documentation: https://docs.gspread.org/en/latest/user-guide.html

---


### Using gspread with pandas
pandas is a popular library for data analysis. The simplest way to get data from a sheet to a pandas DataFrame is with `get_all_records()`:
```python
import pandas as pd

dataframe = pd.DataFrame(worksheet.get_all_records())
```

Here’s a basic example for writing a dataframe to a sheet. With update() we put the header of a dataframe into the first row of a sheet followed by the values of a dataframe:

```python
import pandas as pd

worksheet.update([dataframe.columns.values.tolist()] + dataframe.values.tolist())
```



# Define helper functions

### Open or Create Sheet
These helper functions create new Google Sheets and worksheets (tabs), or open them when they already exist.

The functions will try to open an existing sheet by:
- `GSHEET_KEY` first
- `GSHEET_NAME` second

If it can't find an existing sheet by key or name, it'll create a new one.

### Open or create tabs (worksheets)
Given a list of worksheet names, this function will open (if they exist) or create a new worksheet with the input names.

In [18]:
def open_or_create_sheet(
    gsheet_name: str,
    gsheet_key: str = None,
):
    """Open an existing Google spreadsheet, or create it when it can't be found.

    Lookup order:
      1. If `gsheet_key` is given, open the spreadsheet with that key.
      2. Otherwise, try to open a spreadsheet titled `gsheet_name`.
      3. If no spreadsheet with that title exists, create a new one.

    The module-level authorized gspread client `gc` is used for all calls.

    Args:
        gsheet_name: Title of the spreadsheet to open or create. Only used
            when `gsheet_key` is None.
        gsheet_key: Key (ID) of an existing spreadsheet. When provided, the
            spreadsheet is opened by key and `gsheet_name` is ignored.

    Returns:
        The spreadsheet object returned by the gspread client.

    Raises:
        Any error raised by the gspread client other than
        `gspread.SpreadsheetNotFound` during the lookup by title.
    """
    # Use the function arguments (not notebook globals) so the helper works
    # for whichever sheet the caller asks for.
    if gsheet_key is not None:
        sh = gc.open_by_key(gsheet_key)
        print(f"Opening google worksheet: {sh.title} ...")
    else:
        try:
            # try to open by name:
            sh = gc.open(gsheet_name)
        except gspread.SpreadsheetNotFound:
            # Only a missing sheet triggers creation; auth/network/API errors
            # propagate instead of silently creating a duplicate sheet.
            print(f"** Creating google worksheet: {gsheet_name} ...")
            sh = gc.create(gsheet_name)
            print(f"\n*** Sheet ID (assign it to GSHEET_KEY variable): ***\n{sh.id}\n")
        else:
            print(f"Opening google worksheet: {sh.title} ...")
            print(f"*** Sheet ID (assign it to GSHEET_KEY variable): ***\n{sh.id}\n")
    print(f"Google sheet URL:\nhttps://docs.google.com/spreadsheets/d/{sh.id}")
    return sh


def open_or_create_worksheets(
    sheet_object,
    worksheet_names: list,
) -> dict:
    """Open (or create) the requested tabs and return every tab in the sheet.

    Each name in `worksheet_names` is opened if it exists; otherwise a new
    5x5 worksheet with that name is added. All other worksheets already in
    the spreadsheet are then added to the result as well.

    Args:
        sheet_object: Spreadsheet object exposing `worksheet(name)`,
            `add_worksheet(name, rows=..., cols=...)` and `worksheets()`.
        worksheet_names: Names of the tabs that must exist. May be empty, in
            which case only the existing tabs are returned.

    Returns:
        Dictionary mapping worksheet title -> worksheet object. Requested
        names come first, followed by the remaining existing tabs.

    Raises:
        Any error raised by the spreadsheet other than
        `gspread.WorksheetNotFound` during the lookup of a requested tab.
    """
    d_output = dict()

    for wsh_name in worksheet_names:
        try:
            d_output[wsh_name] = sheet_object.worksheet(wsh_name)
            print(f"  Opening tab/sheet: {wsh_name}")
        except gspread.WorksheetNotFound:
            # Only a missing tab triggers creation; other API errors (quota,
            # permissions, network) propagate to the caller.
            print(f"  ** Creating tab/sheet: {wsh_name}")
            d_output[wsh_name] = sheet_object.add_worksheet(wsh_name, rows=5, cols=5)

    # get & return existing worksheets
    for wsh_ in sheet_object.worksheets():
        wsh_title = wsh_.title
        # Requested tabs were already reported above; don't log them twice.
        if wsh_title not in d_output:
            print(f"  Opening tab/sheet: {wsh_title}")
        d_output[wsh_title] = wsh_

    return d_output



# Get sheet with metadata for one-off PNs.

I (David) created this sheet to clean up data needed from the one-off sending tool here:
- https://channels-dashboard.kubernetes.ue1.snooguts.net/one_offs?page=5

## Load raw sheet

In [19]:
%%time

from google.auth import default
import gspread


# Log in again and rebuild the gspread client used by the sheet helpers
auth.authenticate_user()
print('Authenticated')
gc = gspread.authorize(default()[0])


# Spreadsheet title (kept as an f-string so it can be built from variables)
GSHEET_NAME = f'PN tables & send dates for model training'

# Key of the existing spreadsheet to open.
#   Use None to look the sheet up by title instead (a new sheet is created
#   when no sheet with that title exists). Example key:
#   '1kiiuOqHPJ5chV3zmx4HsXfpOGzO0QcuxW0uHTc85Peg'
GSHEET_KEY = '1kiiuOqHPJ5chV3zmx4HsXfpOGzO0QcuxW0uHTc85Peg'

sh = open_or_create_sheet(GSHEET_NAME, gsheet_key=GSHEET_KEY)

# No tabs need to be created; the helper still returns every existing tab
l_worksheet_names = []
d_worksheets = open_or_create_worksheets(sh, l_worksheet_names)

Authenticated
Opening google worksheet: PN tables & send dates for model training ...
Google sheet URL:
https://docs.google.com/spreadsheets/d/1kiiuOqHPJ5chV3zmx4HsXfpOGzO0QcuxW0uHTc85Peg
  Opening tab/sheet: raw_tables_and_dates
  Opening tab/sheet: Pivot Table 1
  Opening tab/sheet: README
  Opening tab/sheet: zelda PN estimates
CPU times: user 49.8 ms, sys: 4.76 ms, total: 54.5 ms
Wall time: 1.14 s


In [27]:
%%time
# Read every record of the `raw_tables_and_dates` tab into a DataFrame and
# replace the spaces in the column names with underscores.
df_campaigns_train_raw = (
    pd.DataFrame(d_worksheets['raw_tables_and_dates'].get_all_records())
    .rename(columns=lambda col: col.replace(' ', '_'))
)
print(df_campaigns_train_raw.shape)

(38, 11)
CPU times: user 19.7 ms, sys: 0 ns, total: 19.7 ms
Wall time: 658 ms


In [28]:
# Preview the first rows to check the columns loaded from the sheet
df_campaigns_train_raw.head()

Unnamed: 0,date_sent_utc,full_table_name,train_v0,train_v1,day_of_week_sent,date_training,target_country,Expected_Sends,table_name_only,date_training_(extra_lag),CTR_check
0,12/1/2022,reddit-growth-prod.generated_one_offs.20221201...,,,Thursday,11/30/2022,gb,74883,20221201174412_vnicoleheard_gb_taskmasterstand...,,
1,12/2/2022,reddit-growth-prod.generated_one_offs.20221201...,,,Friday,12/1/2022,ca,78986,20221201193322_elizabethpollard_ca_canada_this...,,
2,12/2/2022,reddit-growth-prod.generated_one_offs.20221201...,1.0,,Friday,12/1/2022,de,32358,20221201193812_elizabethpollard_de_de_der_topb...,,
3,12/2/2022,reddit-growth-prod.generated_one_offs.20221201...,,,Friday,12/1/2022,ca,78986,20221201193322_elizabethpollard_ca_canada_this...,,
4,12/14/2022,reddit-growth-prod.generated_one_offs.20221214...,,,Wednesday,12/13/2022,fr,93361,20221214012134_elizabethpollard_fr_ligue1world...,,


## Filter to only pre-selected campaigns

In [29]:
# Keep only the campaigns flagged with 1 in the sheet's `train_v0` column;
# copy so later edits don't touch the raw frame.
is_train_v0 = df_campaigns_train_raw['train_v0'] == 1
df_campaigns_train = df_campaigns_train_raw.loc[is_train_v0].copy()
df_campaigns_train

Unnamed: 0,date_sent_utc,full_table_name,train_v0,train_v1,day_of_week_sent,date_training,target_country,Expected_Sends,table_name_only,date_training_(extra_lag),CTR_check
2,12/2/2022,reddit-growth-prod.generated_one_offs.20221201...,1,,Friday,12/1/2022,de,32358,20221201193812_elizabethpollard_de_de_der_topb...,,
13,3/1/2023,reddit-growth-prod.generated_one_offs.mando_no...,1,,Wednesday,2/28/2023,GB,44444,mando_nosub_relevant_view_contributor,,
14,3/1/2023,reddit-growth-prod.generated_one_offs.mando_su...,1,,Wednesday,2/28/2023,GB,44444,mando_sub_no_view_contributor,,
28,5/5/2023,reddit-growth-prod.generated_one_offs.20230505...,1,,Friday,5/4/2023,de,84169,20230505153239_malenafinguerut_de_rentnerzeige...,,
31,5/10/2023,reddit-growth-prod.generated_one_offs.20230510...,1,,Wednesday,5/9/2023,gb,2486,20230510101039_anaclarapaniago_gb_tearsoftheki...,,
32,5/10/2023,reddit-growth-prod.growth_team_tables.100viewsMay,1,,Wednesday,5/9/2023,??,25000,100viewsMay,,
33,5/10/2023,reddit-growth-prod.generated_one_offs.20230509...,1,,Wednesday,5/9/2023,in,76061,20230509063838_vpoonamsharma_in_indianhiphophe...,,
34,5/10/2023,reddit-growth-prod.generated_one_offs.20230510...,1,,Wednesday,5/9/2023,au,58121,20230510032114_vcharmainedesouza_au_askanaustr...,,
35,5/10/2023,reddit-growth-prod.generated_one_offs.20230509...,1,,Wednesday,5/9/2023,in,29899,20230509203555_vjennifergale_in_indiacricket_m...,,
36,5/10/2023,reddit-growth-prod.generated_one_offs.20230509...,1,,Wednesday,5/9/2023,ca,43734,20230509215916_vfatimananavati_ca_breath_of_th...,,


In [59]:
# Rewrite the date columns as ISO `YYYY-MM-DD` strings so they can be dropped
# straight into the SQL templates.
iso_fmt = "%Y-%m-%d"
for col_name in ('date_sent_utc', 'date_training'):
    parsed = pd.to_datetime(df_campaigns_train[col_name])
    df_campaigns_train[col_name] = parsed.dt.strftime(iso_fmt)

In [61]:
# Show the selected campaigns after the date columns were converted to ISO strings
df_campaigns_train

Unnamed: 0,date_sent_utc,full_table_name,train_v0,train_v1,day_of_week_sent,date_training,target_country,Expected_Sends,table_name_only,date_training_(extra_lag),CTR_check
2,2022-12-02,reddit-growth-prod.generated_one_offs.20221201...,1,,Friday,2022-12-01,de,32358,20221201193812_elizabethpollard_de_de_der_topb...,,
13,2023-03-01,reddit-growth-prod.generated_one_offs.mando_no...,1,,Wednesday,2023-02-28,GB,44444,mando_nosub_relevant_view_contributor,,
14,2023-03-01,reddit-growth-prod.generated_one_offs.mando_su...,1,,Wednesday,2023-02-28,GB,44444,mando_sub_no_view_contributor,,
28,2023-05-05,reddit-growth-prod.generated_one_offs.20230505...,1,,Friday,2023-05-04,de,84169,20230505153239_malenafinguerut_de_rentnerzeige...,,
31,2023-05-10,reddit-growth-prod.generated_one_offs.20230510...,1,,Wednesday,2023-05-09,gb,2486,20230510101039_anaclarapaniago_gb_tearsoftheki...,,
32,2023-05-10,reddit-growth-prod.growth_team_tables.100viewsMay,1,,Wednesday,2023-05-09,??,25000,100viewsMay,,
33,2023-05-10,reddit-growth-prod.generated_one_offs.20230509...,1,,Wednesday,2023-05-09,in,76061,20230509063838_vpoonamsharma_in_indianhiphophe...,,
34,2023-05-10,reddit-growth-prod.generated_one_offs.20230510...,1,,Wednesday,2023-05-09,au,58121,20230510032114_vcharmainedesouza_au_askanaustr...,,
35,2023-05-10,reddit-growth-prod.generated_one_offs.20230509...,1,,Wednesday,2023-05-09,in,29899,20230509203555_vjennifergale_in_indiacricket_m...,,
36,2023-05-10,reddit-growth-prod.generated_one_offs.20230509...,1,,Wednesday,2023-05-09,ca,43734,20230509215916_vfatimananavati_ca_breath_of_th...,,


In [86]:
# Build one dict per campaign; the keys match the `${...}` placeholders of the
# SQL templates (send date + fully-qualified audience table).
l_cols_for_extraction = ['date_sent_utc', 'full_table_name']
l_campaign_pt_and_table = (
    df_campaigns_train
    .loc[:, l_cols_for_extraction]
    .to_dict(orient='records')
)
print(len(l_campaign_pt_and_table))
l_campaign_pt_and_table

11


[{'date_sent_utc': '2022-12-02',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.20221201193812_elizabethpollard_de_de_der_topbeitrag_diese_woche_32358'},
 {'date_sent_utc': '2023-03-01',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.mando_nosub_relevant_view_contributor'},
 {'date_sent_utc': '2023-03-01',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.mando_sub_no_view_contributor'},
 {'date_sent_utc': '2023-05-05',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.20230505153239_malenafinguerut_de_rentnerzeigenaufdingehela_diese_woche_heiß_84169'},
 {'date_sent_utc': '2023-05-10',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.20230510101039_anaclarapaniago_gb_tearsofthekingdombreath_of_zelda_ai_2486'},
 {'date_sent_utc': '2023-05-10',
  'full_table_name': 'reddit-growth-prod.growth_team_tables.100viewsMay'},
 {'date_sent_utc': '2023-05-10',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.20230509063838_vpoonam

# Create SQL query to create or insert data into training table

General set up:
- Declare variables (e.g., PT window start & end)
- CREATE or INSERT statement
- SELECT statement for data to insert (CTEs & other)

NOTE: prefix each SQL string with `r` to make it a raw string. Otherwise Python would interpret the backslashes in the SQL regexes (e.g., `\b`, `\w`) as escape sequences and corrupt the queries.

## Define SQL queries

In [84]:
#@title
# Fully-qualified BigQuery table that accumulates the labeled training rows.
# It fills the `${train_data_table}` placeholder of the templates below.
train_data_table = 'reddit-employee-datasets.david_bermejo.pn_training_data_20230515'

# Script preamble shared by the CREATE and INSERT scripts. It declares:
#   - PT_WINDOW_START: the send date, filled from the `${date_sent_utc}` placeholder
#   - PT_WINDOW_END: PT_WINDOW_START + 5, the end of the window used for receives & clicks
#   - RX_GET_SUBREDDIT_NAME: regex used to extract the subreddit name from the deeplink URI
SQL_DEFINE_VARS = r"""
-- Get labels for receives & clicks

-- Only look at click events 5 days afert send
DECLARE PT_WINDOW_START DATE DEFAULT DATE("${date_sent_utc}");
DECLARE PT_WINDOW_END DATE DEFAULT PT_WINDOW_START + 5;

DECLARE RX_GET_SUBREDDIT_NAME STRING DEFAULT r"(?i)\br\/([a-zA-Z0-9]\w{2,30}\b)";
"""

# Header used when the training table does not exist yet. The opening paren is
# closed by the trailing `);` at the end of SQL_SELECT_DATA.
SQL_CREATE_TABLE = r"""
CREATE TABLE `${train_data_table}`
CLUSTER BY pn_id
AS (
"""

# Header used when the training table already exists. The DELETE inside the SQL
# is commented out (`#` is a SQL comment here), so nothing is removed first.
# NOTE(review): re-running the same campaign will therefore append its rows
# again -- confirm whether duplicates are handled downstream.
SQL_INSERT_INTO_TABLE = r"""
-- Delete data we're trying to re-insert
-- This delete takes too long! almost as long as inserting the data! Should be faster with new pn_id & clustering
# DELETE
#     `${train_data_table}`
# WHERE
#     pn_id = (
#         SELECT DISTINCT
#         CONCAT(
#             CAST(PT_WINDOW_START AS STRING)
#             , "-"
#             , notification_title
#             , "-"
#             , deeplink_uri
#         ) AS pn_id
#         FROM `${full_table_name}`
#     )
# ;

-- Insert latest data
INSERT INTO `${train_data_table}`
(
"""

# Query body shared by the CREATE and INSERT scripts. CTE pipeline:
#   send_long / send_wide       -> sends on PT_WINDOW_START joined to the audience table
#                                  `${full_table_name}` on user, title & campaign type,
#                                  then aggregated per correlation_id + user + notification fields
#   receive_long / receive_wide -> receives (and suppressed receives) for those sends
#                                  between PT_WINDOW_START and PT_WINDOW_END
#   click_long / click_wide     -> clicks for those sends in the same window
#   all_data_wide               -> sends LEFT JOINed to receives & clicks, so `receive` and
#                                  `click` are NULL when no matching event was found
# The final SELECT adds `pt_send` and the `pn_id` campaign identifier.
# The trailing `);` closes the paren opened by SQL_CREATE_TABLE / SQL_INSERT_INTO_TABLE,
# so this string is not a standalone query.
SQL_SELECT_DATA = r"""
WITH
send_long AS (
    SELECT DISTINCT
        a.correlation_id
        , notification_title
        , notification_type
        , deeplink_uri
        , REGEXP_EXTRACT(deeplink_uri, RX_GET_SUBREDDIT_NAME, 1) AS target_subreddit
        , a.user_id
        , a.app_name
        , b.device_id

    FROM `data-prod-165221.channels.pn_sends` a
        INNER JOIN  `${full_table_name}` b
        ON a.user_id = b.user_id
            AND a.notification_title = b.title
            AND a.notification_type = b.campaign_type
    WHERE 1=1
        AND DATE(a.pt) = PT_WINDOW_START
    -- GROUP BY 1,2,3,4,5,6,7
)
, send_wide AS (
    SELECT
        correlation_id
        , user_id
        , notification_title
        , notification_type
        , target_subreddit
        , deeplink_uri

        , 1 AS send
        # , ARRAY_CONCAT_AGG(DISTINCT device_id) AS device_ids
        , COUNT(correlation_id) AS send_count
        , COUNT(DISTINCT (CASE WHEN app_name = 'ios' THEN correlation_id
            ELSE NULL
        END
        )) AS send_ios
        , COUNT(DISTINCT (CASE WHEN app_name = 'android' THEN correlation_id
            ELSE NULL
        END
        )) AS send_android
        , COUNT(CASE WHEN app_name NOT IN ('android', 'ios') THEN correlation_id
            ELSE NULL
        END
        ) AS send_other

    FROM send_long
    GROUP BY 1,2,3,4,5,6
)
, receive_long as (
    SELECT
        a.correlation_id
        , a.user_id
        , a.app_name
        -- If ANY receive was supressed, count ALL as supressed
        , SUM(IF(a.is_suppressed = True, 1, 0)) AS supressed_count
    FROM `data-prod-165221.channels.pn_receives` AS a
        INNER JOIN send_wide AS b
        ON a.user_id = b.user_id
            AND a.correlation_id = b.correlation_id
    WHERE 1=1
        AND DATE(pt) between PT_WINDOW_START and PT_WINDOW_END

    GROUP BY 1,2,3
)
, receive_wide AS (
    SELECT
        user_id
        , correlation_id

        , 1 AS receive
        , COUNT(correlation_id) AS receive_count
        , SUM(supressed_count) AS suppressed_count

        , COUNT(DISTINCT (CASE WHEN app_name = 'ios' THEN correlation_id
            ELSE NULL
        END)) AS receive_ios
        , COUNT(DISTINCT (CASE WHEN app_name = 'android' THEN correlation_id
            ELSE NULL
        END)) AS receive_android
        , COUNT(CASE WHEN app_name NOT IN ('android', 'ios') THEN correlation_id
            ELSE NULL
        END
        ) AS receive_other

        , COUNT(
            DISTINCT(
                CASE WHEN app_name = 'ios' AND supressed_count >= 1 THEN correlation_id
                ELSE NULL
                END
            )
        ) AS suppressed_ios
        , COUNT(
            DISTINCT(
                CASE WHEN app_name = 'android' AND supressed_count >= 1 THEN correlation_id
                ELSE NULL
                END
            )
        ) AS suppressed_android

    FROM receive_long
    GROUP By 1,2
)
, click_long as (
    SELECT
        a.correlation_id,
        a.user_id,
        a.app_name
    FROM `data-prod-165221.channels.pn_clicks` AS a
        INNER JOIN send_wide AS b
        ON a.user_id = b.user_id
            AND a.correlation_id = b.correlation_id
    WHERE 1=1
        AND DATE(pt) between PT_WINDOW_START and PT_WINDOW_END

    GROUP BY 1,2,3
)
, click_wide AS (
SELECT
    user_id
    , correlation_id

    , 1 AS click
    , COUNT(correlation_id) AS click_count

    , COUNT(DISTINCT (CASE WHEN app_name = 'ios' THEN correlation_id
        ELSE NULL
    END)) AS click_ios
    , COUNT(DISTINCT (CASE WHEN app_name = 'android' THEN correlation_id
        ELSE NULL
    END)) AS click_android
    , COUNT(CASE WHEN app_name NOT IN ('android', 'ios') THEN correlation_id
        ELSE NULL
    END
    ) AS click_other
FROM click_long
GROUP BY 1,2
)
, all_data_wide AS (
    SELECT
        -- Note that we're getting the target_subreddit from the deeplink URI
        s.correlation_id
        , s.user_id
        , s.target_subreddit
        , s.notification_title
        , s.notification_type
        , s.send
        , r.receive
        , c.click
        , CASE
            WHEN (
                r.receive = 1
                -- Keep receives when we have more receives in (android & iOS) than suppressed_receives
                AND (
                    (COALESCE(receive_ios, 0) + COALESCE(receive_android, 0)) >
                    (COALESCE(suppressed_ios, 0) + COALESCE(suppressed_android, 0))
                )
            ) THEN 1
            WHEN r.receive IS NOT NULL THEN 0
            ELSE NULL
        END AS receive_not_suppressed


        , c.* EXCEPT(correlation_id, user_id, click)
        , r.* EXCEPT(correlation_id, user_id, receive)
        , s.* EXCEPT(correlation_id, user_id, notification_title, notification_type, send, target_subreddit)

    FROM send_wide AS s
        LEFT JOIN receive_wide AS r
            ON s.user_id = r.user_id AND s.correlation_id = r.correlation_id
        LEFT JOIN click_wide AS c
            ON s.user_id = c.user_id AND s.correlation_id = c.correlation_id
)

-- Final select for TABLE
SELECT 
    PT_WINDOW_START AS pt_send
    -- Create new campaign-id column so it's easier to find & delete campaigns
    , CONCAT(
        CAST(PT_WINDOW_START AS STRING)
        , "-"
        , notification_title
        , "-"
        , deeplink_uri
    ) AS pn_id
    , *
FROM all_data_wide
);  -- Close CREATE TABLE parens
"""

# `string.Template` treats every `$` as the start of a placeholder, so a literal
# `$` (e.g. in a regex or a JSON_EXTRACT() path) must be doubled to `$$`.
#
# The concatenation is wrapped in its own parentheses so the escaping covers
# the WHOLE script. A method call binds tighter than `+`, so without the inner
# parentheses only SQL_SELECT_DATA would be escaped and a literal `$` in the
# preamble or in the CREATE/INSERT header would break `Template.substitute()`.
SQL_FULL_CREATE = (
    (SQL_DEFINE_VARS + SQL_CREATE_TABLE + SQL_SELECT_DATA)
    .replace("$.", "$$.")
    .replace("$|", "$$|")
    .replace('$"', '$$"')
)

SQL_FULL_INSERT = (
    (SQL_DEFINE_VARS + SQL_INSERT_INTO_TABLE + SQL_SELECT_DATA)
    .replace("$.", "$$.")
    .replace("$|", "$$|")
    .replace('$"', '$$"')
)

## Run queries for campaigns selected for v1-train 

In [85]:
log_query = False
bigquery_client = bigquery.Client()

# Run one BigQuery script per campaign. The `[1:]` slice skips the first
# campaign in the list.
# NOTE(review): presumably the first campaign was already loaded in an earlier
# run -- confirm before re-running against a fresh table.
for d_table_ in tqdm(l_campaign_pt_and_table[1:]):
    # Use the CREATE script only when the table is genuinely missing. Any other
    # failure (auth, permissions, network) propagates instead of being mistaken
    # for "table not found".
    try:
        bigquery_client.get_table(train_data_table)
    except NotFound:
        print("Table {} is NOT found.".format(train_data_table))
        template = string.Template(SQL_FULL_CREATE)
    else:
        print("Table {} already exists.".format(train_data_table))
        template = string.Template(SQL_FULL_INSERT)

    # Fill in `${date_sent_utc}`, `${full_table_name}` and `${train_data_table}`
    sql = template.substitute(
        **d_table_,
        **{'train_data_table': train_data_table}
    )
    if log_query:
        print(sql)

    print(f"Running query for params:...\n  {d_table_}")
    query_start_time = datetime.utcnow()
    print(f"  {query_start_time} | query START time")

    query_job = bigquery_client.query(sql)
    # Block until the script finishes; raises if the job failed
    query_job.result()
    query_end_time = datetime.utcnow()
    print(f"  {query_end_time} | query END time")
    print(f"  {query_end_time - query_start_time} | query ELAPSED time")

  0%|          | 0/10 [00:00<?, ?it/s]

Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-03-01', 'full_table_name': 'reddit-growth-prod.generated_one_offs.mando_nosub_relevant_view_contributor'}
  2023-05-16 19:33:38.167098 | query START time


 10%|█         | 1/10 [00:51<07:46, 51.83s/it]

  2023-05-16 19:34:29.603644 | query END time
  0:00:51.436546 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-03-01', 'full_table_name': 'reddit-growth-prod.generated_one_offs.mando_sub_no_view_contributor'}
  2023-05-16 19:34:29.784690 | query START time


 20%|██        | 2/10 [01:34<06:13, 46.63s/it]

  2023-05-16 19:35:12.594391 | query END time
  0:00:42.809701 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-05', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230505153239_malenafinguerut_de_rentnerzeigenaufdingehela_diese_woche_heiß_84169'}
  2023-05-16 19:35:13.028378 | query START time


 30%|███       | 3/10 [02:30<05:54, 50.59s/it]

  2023-05-16 19:36:07.900147 | query END time
  0:00:54.871769 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-10', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230510101039_anaclarapaniago_gb_tearsofthekingdombreath_of_zelda_ai_2486'}
  2023-05-16 19:36:08.119253 | query START time


 40%|████      | 4/10 [03:33<05:34, 55.75s/it]

  2023-05-16 19:37:11.551520 | query END time
  0:01:03.432267 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-10', 'full_table_name': 'reddit-growth-prod.growth_team_tables.100viewsMay'}
  2023-05-16 19:37:11.701612 | query START time


 50%|█████     | 5/10 [04:39<04:55, 59.18s/it]

  2023-05-16 19:38:16.819339 | query END time
  0:01:05.117727 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-10', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230509063838_vpoonamsharma_in_indianhiphopheadsgoaind_live_ama_with_nikhil_76061'}
  2023-05-16 19:38:16.974708 | query START time


 60%|██████    | 6/10 [05:38<03:56, 59.14s/it]

  2023-05-16 19:39:15.882554 | query END time
  0:00:58.907846 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-10', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230510032114_vcharmainedesouza_au_askanaustralianpizzamas_vegemite_on_pizza_yn_58121'}
  2023-05-16 19:39:16.032290 | query START time


 70%|███████   | 7/10 [06:19<02:39, 53.19s/it]

  2023-05-16 19:39:56.829337 | query END time
  0:00:40.797047 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-10', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230509203555_vjennifergale_in_indiacricket_mumbai_indians_on_the_rise_29899'}
  2023-05-16 19:39:56.967254 | query START time


 80%|████████  | 8/10 [07:03<01:41, 50.56s/it]

  2023-05-16 19:40:41.739929 | query END time
  0:00:44.772675 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-10', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230509215916_vfatimananavati_ca_breath_of_the_wildaiart_zelda_ai_43734'}
  2023-05-16 19:40:41.910377 | query START time


 90%|█████████ | 9/10 [07:50<00:49, 49.30s/it]

  2023-05-16 19:41:28.279332 | query END time
  0:00:46.368955 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-12', 'full_table_name': 'reddit-employee-datasets.sahil_verma.totk_pn_ml_targeting_20230512'}
  2023-05-16 19:41:28.455940 | query START time


100%|██████████| 10/10 [08:21<00:00, 50.20s/it]

  2023-05-16 19:41:59.759512 | query END time
  0:00:31.303572 | query ELAPSED time





## Inspect output of tables

In [87]:
%%time
%%bigquery df_train_check1

-- Sanity check of the training table: per campaign (pn_id), total the
-- send / receive / click labels and derive click-through rates from them.
SELECT 
    -- pt_send
    pn_id
    , SUM(send) AS send_total
    , SUM(receive) AS receive_total
    , SUM(receive_not_suppressed) AS receive_not_suppressed_total
    , SUM(click) AS click_total
    -- CTR over all receives
    , SAFE_DIVIDE(SUM(click), SUM(receive)) AS ctr_receive
    -- CTR over receives that were not suppressed
    , SAFE_DIVIDE(SUM(click), SUM(receive_not_suppressed)) AS ctr_receive_no_suppressed
FROM `reddit-employee-datasets.david_bermejo.pn_training_data_20230515` 
GROUP BY 1
ORDER BY send_total DESC
;

Query is running:   0%|          |

Downloading:   0%|          |

CPU times: user 136 ms, sys: 11.7 ms, total: 148 ms
Wall time: 2.96 s


In [88]:
# Show the per-campaign totals and CTRs returned by the check query
df_train_check1

Unnamed: 0,pn_id,send_total,receive_total,receive_not_suppressed_total,click_total,ctr_receive,ctr_receive_no_suppressed
0,2022-12-02-Der Top-Beitrag diese Woche 🏆-https...,285817,253583,178170,13229,0.052168,0.074249
1,2023-05-10-Zelda + AI = 🤯-https://www.reddit.c...,154904,142498,108173,11359,0.079713,0.105008
2,2023-05-10-Live! AMA with Nikhil Chinapa-https...,109511,99571,75929,1979,0.019875,0.026064
3,2023-05-10-Mumbai Indians on the rise ⬆️-https...,106270,93574,72302,3390,0.036228,0.046887
4,2023-05-10-Vegemite on pizza: Y/N? 🤔-https://w...,72160,66482,51595,2099,0.031572,0.040682
5,2023-03-01-TheMandolorianTV:-https://www.reddi...,69098,64366,51236,3904,0.060653,0.076196
6,2023-05-12-r/zelda is finally here!-https://ww...,49850,45574,42100,3122,0.068504,0.074157
7,2023-05-05-Diese Woche heiß 🔥-https://www.redd...,46912,42724,32931,3089,0.072301,0.093802
8,2023-03-01-The wait is over-https://www.reddit...,6894,6458,5047,311,0.048157,0.061621


In [91]:
# Grand totals of the label columns across all campaigns
df_train_check1.loc[
    :,
    ['send_total', 'receive_total', 'receive_not_suppressed_total', 'click_total'],
].sum()

send_total                      901416
receive_total                   814830
receive_not_suppressed_total    617483
click_total                      42482
dtype: int64

In [92]:
# Overall CTR = total clicks / total receives across all campaigns. Computed
# from the query result instead of hand-copied totals so it cannot go stale
# when the training table changes.
df_train_check1['click_total'].sum() / df_train_check1['receive_total'].sum()

0.052136028374016666

# Define queries for v2 iteration
Getting the data for the first ~9 campaigns didn't take too long, so expand the list to get even more campaigns to train on

## Load raw sheet

In [93]:
%%time

# Spreadsheet title (kept as an f-string so it can be built from variables)
GSHEET_NAME = f'PN tables & send dates for model training'

# Key of the existing spreadsheet to open.
#   Use None to look the sheet up by title instead (a new sheet is created
#   when no sheet with that title exists). Example key:
#   '1kiiuOqHPJ5chV3zmx4HsXfpOGzO0QcuxW0uHTc85Peg'
GSHEET_KEY = '1kiiuOqHPJ5chV3zmx4HsXfpOGzO0QcuxW0uHTc85Peg'

sh = open_or_create_sheet(GSHEET_NAME, gsheet_key=GSHEET_KEY)

# No tabs need to be created; the helper still returns every existing tab
l_worksheet_names = []
d_worksheets = open_or_create_worksheets(sh, l_worksheet_names)

Opening google worksheet: PN tables & send dates for model training ...
Google sheet URL:
https://docs.google.com/spreadsheets/d/1kiiuOqHPJ5chV3zmx4HsXfpOGzO0QcuxW0uHTc85Peg
  Opening tab/sheet: raw_tables_and_dates
  Opening tab/sheet: Pivot Table 1
  Opening tab/sheet: README
  Opening tab/sheet: zelda PN estimates
CPU times: user 23.7 ms, sys: 2.13 ms, total: 25.8 ms
Wall time: 1.03 s


In [94]:
%%time
# Reload the `raw_tables_and_dates` tab (it now carries the `train_v1` flags)
# and replace the spaces in the column names with underscores.
df_campaigns_train_raw = (
    pd.DataFrame(d_worksheets['raw_tables_and_dates'].get_all_records())
    .rename(columns=lambda col: col.replace(' ', '_'))
)
print(df_campaigns_train_raw.shape)

(39, 12)
CPU times: user 7.82 ms, sys: 0 ns, total: 7.82 ms
Wall time: 255 ms


In [95]:
# Preview the first rows of the reloaded sheet
df_campaigns_train_raw.head()

Unnamed: 0,date_sent_utc,full_table_name,train_v0,train_v1,day_of_week_sent,date_training,target_country,Expected_Sends,table_name_only,date_training_(extra_lag),CTR_check,Unnamed: 12
0,12/1/2022,reddit-growth-prod.generated_one_offs.20221201...,,,Thursday,11/30/2022,gb,74883,20221201174412_vnicoleheard_gb_taskmasterstand...,,,
1,12/2/2022,reddit-growth-prod.generated_one_offs.20221201...,,,Friday,12/1/2022,ca,78986,20221201193322_elizabethpollard_ca_canada_this...,,,
2,12/2/2022,reddit-growth-prod.generated_one_offs.20221201...,1.0,0.0,Friday,12/1/2022,de,32358,20221201193812_elizabethpollard_de_de_der_topb...,,,
3,12/2/2022,reddit-growth-prod.generated_one_offs.20221201...,,,Friday,12/1/2022,ca,78986,20221201193322_elizabethpollard_ca_canada_this...,,,
4,12/14/2022,reddit-growth-prod.generated_one_offs.20221214...,,,Wednesday,12/13/2022,fr,93361,20221214012134_elizabethpollard_fr_ligue1world...,,,


## Filter to only pre-selected campaigns

In [96]:
# Keep only the campaigns flagged with 1 in the sheet's `train_v1` column;
# copy so later edits don't touch the raw frame.
is_train_v1 = df_campaigns_train_raw['train_v1'] == 1
df_campaigns_train1 = df_campaigns_train_raw.loc[is_train_v1].copy()
df_campaigns_train1

Unnamed: 0,date_sent_utc,full_table_name,train_v0,train_v1,day_of_week_sent,date_training,target_country,Expected_Sends,table_name_only,date_training_(extra_lag),CTR_check,Unnamed: 12
11,2/20/2023,reddit-growth-prod.generated_one_offs.20230215...,0.0,1,Monday,2/19/2023,br,82052,20230215191916_vbrunamarreiros_br_brdevconvers...,,,
21,4/18/2023,reddit-growth-prod.generated_one_offs.20230418...,,1,Tuesday,4/17/2023,de,59562,20230418142549_vmalenafinguerut_de_aktienperso...,,,
26,4/25/2023,reddit-growth-prod.generated_one_offs.20230425...,,1,Tuesday,4/24/2023,ca,48060,20230425184211_vfatimananavati_ca_nhledmontono...,,,
29,5/5/2023,reddit-growth-prod.generated_one_offs.20230504...,,1,Friday,5/4/2023,au,65320,20230504235355_vcharmainedesouza_au_eurovision...,,,
30,5/8/2023,reddit-growth-prod.generated_one_offs.20230508...,,1,Monday,5/7/2023,gb,57549,20230508163457_vjennifergale_gb_eurovisionpoph...,,,
31,5/9/2023,reddit-growth-prod.generated_one_offs.20230509...,,1,Tuesday,5/8/2023,us,20203,20230509190939_vjennifergale_us_formuladank_ho...,,,


In [97]:
# Rewrite the date columns as ISO `YYYY-MM-DD` strings so they can be dropped
# straight into the SQL templates.
iso_fmt = "%Y-%m-%d"
for col_name in ('date_sent_utc', 'date_training'):
    parsed = pd.to_datetime(df_campaigns_train1[col_name])
    df_campaigns_train1[col_name] = parsed.dt.strftime(iso_fmt)

In [99]:
# Build one dict per v1 campaign; the keys match the `${...}` placeholders of
# the SQL templates (send date + fully-qualified audience table).
l_cols_for_extraction = ['date_sent_utc', 'full_table_name']
l_campaign_pt_and_table1 = (
    df_campaigns_train1
    .loc[:, l_cols_for_extraction]
    .to_dict(orient='records')
)
print(len(l_campaign_pt_and_table1))
l_campaign_pt_and_table1

6


[{'date_sent_utc': '2023-02-20',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.20230215191916_vbrunamarreiros_br_brdevconversas_quer_fugir_do_carnaval_82052'},
 {'date_sent_utc': '2023-04-18',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.20230418142549_vmalenafinguerut_de_aktienpersonalfinance_live_ama_mit_saidi_59562'},
 {'date_sent_utc': '2023-04-25',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.20230425184211_vfatimananavati_ca_nhledmontonoilershockey_is_this_the_future_of_48060'},
 {'date_sent_utc': '2023-05-05',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.20230504235355_vcharmainedesouza_au_eurovisionperthtriplej_voyager_ama_douze_points_65320'},
 {'date_sent_utc': '2023-05-08',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.20230508163457_vjennifergale_gb_eurovisionpopheads_live_ama_with_jamala_57549'},
 {'date_sent_utc': '2023-05-09',
  'full_table_name': 'reddit-growth-prod.generated_one_offs.2023050919093

## Loop to get training data for new campaigns

In [100]:
log_query = False
bigquery_client = bigquery.Client()

# Run one BigQuery script per v1 campaign, appending to the training table
for d_table_ in tqdm(l_campaign_pt_and_table1):
    # Use the CREATE script only when the table is genuinely missing. Any other
    # failure (auth, permissions, network) propagates instead of being mistaken
    # for "table not found".
    try:
        bigquery_client.get_table(train_data_table)
    except NotFound:
        print("Table {} is NOT found.".format(train_data_table))
        template = string.Template(SQL_FULL_CREATE)
    else:
        print("Table {} already exists.".format(train_data_table))
        template = string.Template(SQL_FULL_INSERT)

    # Fill in `${date_sent_utc}`, `${full_table_name}` and `${train_data_table}`
    sql = template.substitute(
        **d_table_,
        **{'train_data_table': train_data_table}
    )
    if log_query:
        print(sql)

    print(f"Running query for params:...\n  {d_table_}")
    query_start_time = datetime.utcnow()
    print(f"  {query_start_time} | query START time")

    query_job = bigquery_client.query(sql)
    # Block until the script finishes; raises if the job failed
    query_job.result()
    query_end_time = datetime.utcnow()
    print(f"  {query_end_time} | query END time")
    print(f"  {query_end_time - query_start_time} | query ELAPSED time")

  0%|          | 0/6 [00:00<?, ?it/s]

Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-02-20', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230215191916_vbrunamarreiros_br_brdevconversas_quer_fugir_do_carnaval_82052'}
  2023-05-16 21:51:25.649181 | query START time


 17%|█▋        | 1/6 [00:17<01:27, 17.47s/it]

  2023-05-16 21:51:42.743764 | query END time
  0:00:17.094583 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-04-18', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230418142549_vmalenafinguerut_de_aktienpersonalfinance_live_ama_mit_saidi_59562'}
  2023-05-16 21:51:42.925024 | query START time


 33%|███▎      | 2/6 [01:19<02:54, 43.60s/it]

  2023-05-16 21:52:44.632899 | query END time
  0:01:01.707875 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-04-25', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230425184211_vfatimananavati_ca_nhledmontonoilershockey_is_this_the_future_of_48060'}
  2023-05-16 21:52:44.821726 | query START time


 50%|█████     | 3/6 [01:32<01:29, 29.70s/it]

  2023-05-16 21:52:57.782844 | query END time
  0:00:12.961118 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-05', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230504235355_vcharmainedesouza_au_eurovisionperthtriplej_voyager_ama_douze_points_65320'}
  2023-05-16 21:52:58.005807 | query START time


 67%|██████▋   | 4/6 [01:45<00:46, 23.12s/it]

  2023-05-16 21:53:10.813564 | query END time
  0:00:12.807757 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-08', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230508163457_vjennifergale_gb_eurovisionpopheads_live_ama_with_jamala_57549'}
  2023-05-16 21:53:11.003217 | query START time


 83%|████████▎ | 5/6 [01:57<00:19, 19.26s/it]

  2023-05-16 21:53:23.243711 | query END time
  0:00:12.240494 | query ELAPSED time
Table reddit-employee-datasets.david_bermejo.pn_training_data_20230515 already exists.
Running query for params:...
  {'date_sent_utc': '2023-05-09', 'full_table_name': 'reddit-growth-prod.generated_one_offs.20230509190939_vjennifergale_us_formuladank_hot_post_in_rformula1_20203'}
  2023-05-16 21:53:23.409687 | query START time


100%|██████████| 6/6 [02:13<00:00, 22.20s/it]

  2023-05-16 21:53:38.496667 | query END time
  0:00:15.086980 | query ELAPSED time





## Check output after adding new campaigns

In [101]:
%%time
%%bigquery df_train_check2

-- One row per `pn_id`: summed send/receive/click counts plus click-through
--  rates, ordered with the largest send volume first
SELECT
    -- pt_send
    pn_id,
    SUM(send) AS send_total,
    SUM(receive) AS receive_total,
    SUM(receive_not_suppressed) AS receive_not_suppressed_total,
    SUM(click) AS click_total,
    -- SAFE_DIVIDE yields NULL instead of an error when the denominator is 0
    SAFE_DIVIDE(SUM(click), SUM(receive)) AS ctr_receive,
    SAFE_DIVIDE(SUM(click), SUM(receive_not_suppressed)) AS ctr_receive_no_suppressed
FROM `reddit-employee-datasets.david_bermejo.pn_training_data_20230515`
GROUP BY pn_id
ORDER BY send_total DESC
;

Query is running:   0%|          |

Downloading:   0%|          |

CPU times: user 92.8 ms, sys: 10.9 ms, total: 104 ms
Wall time: 2.12 s


In [102]:
# Display the per-`pn_id` aggregates that the `%%bigquery` cell stored in `df_train_check2`
df_train_check2

Unnamed: 0,pn_id,send_total,receive_total,receive_not_suppressed_total,click_total,ctr_receive,ctr_receive_no_suppressed
0,2022-12-02-Der Top-Beitrag diese Woche 🏆-https...,285817,253583,178170,13229,0.052168,0.074249
1,2023-05-10-Zelda + AI = 🤯-https://www.reddit.c...,154904,142498,108173,11359,0.079713,0.105008
2,2023-05-10-Live! AMA with Nikhil Chinapa-https...,109511,99571,75929,1979,0.019875,0.026064
3,2023-05-10-Mumbai Indians on the rise ⬆️-https...,106270,93574,72302,3390,0.036228,0.046887
4,2023-04-25-Is this the future of hockey? 🤖-htt...,89288,80518,62967,4458,0.055367,0.070799
5,2023-05-09-Hot post in r/formula1 🔥-https://ww...,80088,71273,60023,5359,0.07519,0.089282
6,"2023-05-05-Voyager AMA, douze points!-https://...",79253,72078,54407,1506,0.020894,0.02768
7,2023-02-20-Quer fugir do Carnaval?-https://www...,78447,71781,58368,2373,0.033059,0.040656
8,2023-04-18-Live! AMA mit Saidi Sulilatu 📈-http...,74027,67817,51759,1448,0.021352,0.027976
9,2023-05-10-Vegemite on pizza: Y/N? 🤔-https://w...,72160,66482,51595,2099,0.031572,0.040682


In [103]:
# Grand totals over all rows for each of the count columns
df_train_check2.loc[
    :,
    ['send_total', 'receive_total', 'receive_not_suppressed_total', 'click_total'],
].sum()

send_total                      1372949
receive_total                   1242125
receive_not_suppressed_total     957504
click_total                       58739
dtype: int64

In [104]:
# Unweighted mean of the per-row CTRs: every `pn_id` row counts equally
#  regardless of volume, so this is not total clicks / total receives
df_train_check2.loc[:, ['ctr_receive', 'ctr_receive_no_suppressed']].mean()

ctr_receive                  0.046165
ctr_receive_no_suppressed    0.058417
dtype: float64