In [None]:
# `sys` is only needed by the optional install command below, which uses
# `sys.executable` so that pip targets the interpreter of the running kernel.
import sys

# Uncomment to install the BigQuery client library with its pandas extras.
#!{sys.executable} -m pip install google-cloud-bigquery[pandas]

In [None]:
# Load the IPython extension that ships with the google-cloud-bigquery package.
# NOTE(review): the cells visible below submit their queries through
# `bigquery.Client` rather than through a cell magic - confirm this is still needed.
%load_ext google.cloud.bigquery

In [3]:
# ---------------------------------------------------------------------------
# Run parameters. They are substituted into the `<<name>>` placeholders of the
# SQL template below and reused by the later cells of the notebook.
# ---------------------------------------------------------------------------
threshold_date = '2011-08-08'   # orders strictly before this date qualify a customer
predict_date = '2011-12-12'     # reference date for the "bought recently" filter
project_id = 'sandbox-235500'
dataset_id = "CLVDataset"

# Cleaning query.
#   * sub-select `a`: one row per (customer, order date) with the rounded sum of
#     UnitPrice * Quantity, the summed quantity and the customer's latest order date.
#   * sub-select `b`: customers having more than one order date with a positive
#     total before <<threshold_date>>.
#   * final WHERE: latest order at most 90 days before <<predict_date>>, and
#     quantity / value of the row must carry the same sign.
QUERY = '''
SELECT
  customer_id,
  order_date,
  order_value,
  order_qty_articles
FROM
(
  SELECT
    CustomerID AS customer_id,
    PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8)) AS order_date,
    ROUND(SUM(UnitPrice * Quantity), 2) AS order_value,
    SUM(Quantity) AS order_qty_articles,
    (
      SELECT
        MAX(PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8)))
      FROM
        `<<project_id>>.<<dataset_id>>.data_source` tl
      WHERE
        tl.CustomerID = t.CustomerID
    ) latest_order
  FROM
    `<<project_id>>.<<dataset_id>>.data_source` t
  GROUP BY
      CustomerID,
      order_date
) a

INNER JOIN (
  -- Only customers with more than one positive order values before threshold.
  SELECT
    CustomerID
  FROM (
    -- Customers and how many positive order values  before threshold.
    SELECT
      CustomerID,
      SUM(positive_value) cnt_positive_value
    FROM (
      -- Customer with whether order was positive or not at each date.
      SELECT
        CustomerID,
        (
          CASE
            WHEN SUM(UnitPrice * Quantity) > 0 THEN 1
            ELSE 0
          END ) positive_value
      FROM
        `<<project_id>>.<<dataset_id>>.data_source`
      WHERE
        PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8)) < DATE("<<threshold_date>>")
      GROUP BY
        CustomerID,
        SUBSTR(InvoiceDate, 0, 8) )
    GROUP BY
      CustomerID )
  WHERE
    cnt_positive_value > 1
  ) b
ON
  a.customer_id = b. CustomerID
--[START common_clean]
WHERE
  -- Bought in the past 3 months
  DATE_DIFF(DATE("<<predict_date>>"), latest_order, DAY) <= 90
  -- Make sure returns are consistent.
  AND (
    (order_qty_articles > 0 and order_Value > 0) OR
    (order_qty_articles < 0 and order_Value < 0)
  )
'''

# Render the template: every placeholder is swapped for its parameter value,
# in the same order as the original chain of `.replace` calls.
_substitutions = {
    "<<threshold_date>>": threshold_date,
    "<<predict_date>>": predict_date,
    "<<project_id>>": project_id,
    "<<dataset_id>>": dataset_id,
}
for _placeholder, _value in _substitutions.items():
    QUERY = QUERY.replace(_placeholder, _value)

# Show the rendered SQL as the cell output.
QUERY

'\nSELECT\n  customer_id,\n  order_date,\n  order_value,\n  order_qty_articles\nFROM\n(\n  SELECT\n    CustomerID AS customer_id,\n    PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8)) AS order_date,\n    ROUND(SUM(UnitPrice * Quantity), 2) AS order_value,\n    SUM(Quantity) AS order_qty_articles,\n    (\n      SELECT\n        MAX(PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8)))\n      FROM\n        `sandbox-235500.CLVDataset.data_source` tl\n      WHERE\n        tl.CustomerID = t.CustomerID\n    ) latest_order\n  FROM\n    `sandbox-235500.CLVDataset.data_source` t\n  GROUP BY\n      CustomerID,\n      order_date\n) a\n\nINNER JOIN (\n  -- Only customers with more than one positive order values before threshold.\n  SELECT\n    CustomerID\n  FROM (\n    -- Customers and how many positive order values  before threshold.\n    SELECT\n      CustomerID,\n      SUM(positive_value) cnt_positive_value\n    FROM (\n      -- Customer with whether order was positive or not at each date.\n      

In [4]:
from google.cloud import bigquery

client = bigquery.Client()

# Destination table for the cleaned orders. The reference is built from
# `project_id` explicitly (instead of the client's default project) so the
# table written here is the same `<project_id>.<dataset_id>.data_cleaned`
# that the feature query reads afterwards.
table_ref = bigquery.DatasetReference(project_id, dataset_id).table('data_cleaned')

job_config = bigquery.QueryJobConfig()
job_config.destination = table_ref
# Create the table when it is missing and overwrite its content on every run,
# which keeps re-running this cell idempotent.
job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE

query_job = client.query(QUERY, location='US', job_config=job_config)  # API request
rows = iter(query_job.result())  # Waits for query to finish

# Preview at most five rows. `zip` stops as soon as either side is exhausted,
# so a result with fewer than five rows no longer raises StopIteration.
for _, row in zip(range(5), rows):
    print(row)
    

Row(('16525', datetime.date(2011, 5, 10), 124.28, 356), {'customer_id': 0, 'order_date': 1, 'order_value': 2, 'order_qty_articles': 3})
Row(('14397', datetime.date(2011, 5, 10), 165.44, 184), {'customer_id': 0, 'order_date': 1, 'order_value': 2, 'order_qty_articles': 3})
Row(('16357', datetime.date(2011, 5, 10), 137.9, 22), {'customer_id': 0, 'order_date': 1, 'order_value': 2, 'order_qty_articles': 3})
Row(('12901', datetime.date(2011, 5, 10), -208.8, -1200), {'customer_id': 0, 'order_date': 1, 'order_value': 2, 'order_qty_articles': 3})
Row(('13592', datetime.date(2011, 5, 10), 95.94, 38), {'customer_id': 0, 'order_date': 1, 'order_value': 2, 'order_qty_articles': 3})


In [7]:
# Feature / target query, built on top of the `data_cleaned` table.
#   * `tf`: per-customer features computed from orders dated on or before
#     <<threshold_date>> (monetary value, order count, recency, T, basket
#     averages, number of returns).
#   * `tt`: per-customer target, the sum of order values over all records.
#   * final WHERE: keeps customers with 0 < monetary_dnn <= <<max_monetary>>.
#
# Fix: `cnt_returns` used to count rows with `order_value < 1`, which also
# counted small positive orders (between 0 and 1) as returns. Returns are the
# rows with a negative order value, so the test is now `order_value < 0`.
#
# NOTE(review): `order_value_btyd` is computed by the innermost SELECT but is
# not referenced anywhere else in this query - confirm whether a
# `monetary_btyd` feature was meant to be derived from it.
QUERY = '''
SELECT
  tf.customer_id,
  -- For training period
  -- Copying the calculations from Lifetimes where first orders are ignored
  -- See https://github.com/CamDavidsonPilon/lifetimes/blob/master/lifetimes/utils.py#L246
--[START features_target]
  tf.monetary_dnn,
  tf.cnt_orders AS frequency_dnn,
  tf.cnt_orders - 1 AS frequency_btyd,
  tf.recency,
  tf.T,
  ROUND(tf.recency/cnt_orders, 2) AS time_between,
  ROUND(tf.avg_basket_value, 2) AS avg_basket_value,
  ROUND(tf.avg_basket_size, 2) AS avg_basket_size,
  tf.cnt_returns,
  -- Target calculated for overall period
  ROUND(tt.target_monetary, 2) as target_monetary
--[END features_target]
FROM
  -- This SELECT uses only data before threshold to make features.
  (
    SELECT
      customer_id,
      SUM(order_value) AS monetary_dnn,
      DATE_DIFF(MAX(order_date), MIN(order_date), DAY) AS recency,
      DATE_DIFF(DATE('<<threshold_date>>'), MIN(order_date), DAY) AS T,
      COUNT(DISTINCT order_date) AS cnt_orders,
      AVG(order_qty_articles) avg_basket_size,
      AVG(order_value) avg_basket_value,
      SUM(CASE
          WHEN order_value < 0 THEN 1
          ELSE 0 END) AS cnt_returns
    FROM
      -- Makes the order value = 0 if it is the first one
      (
        SELECT
          a.*,
          (CASE
              WHEN a.order_date = c.order_date_min THEN 0
              ELSE a.order_value END) AS order_value_btyd
        FROM
          `<<project_id>>.<<dataset_id>>.data_cleaned` a
        INNER JOIN (
          SELECT
            customer_id,
            MIN(order_date) AS order_date_min
          FROM
            `<<project_id>>.<<dataset_id>>.data_cleaned`
          GROUP BY
            customer_id) c
        ON
          c.customer_id = a.customer_id
      )
    WHERE
      order_date <= DATE('<<threshold_date>>')
    GROUP BY
      customer_id) tf,

  -- This SELECT uses all records to calculate the target (could also use data after threshold )
  (
    SELECT
      customer_id,
      SUM(order_value) target_monetary
    FROM
      `<<project_id>>.<<dataset_id>>.data_cleaned`
      --WHERE order_date > DATE('<<threshold_date>>')
    GROUP BY
      customer_id) tt
WHERE
  tf.customer_id = tt.customer_id
  AND tf.monetary_dnn > 0
  AND tf.monetary_dnn <= <<max_monetary>>

'''

# Upper bound on `monetary_dnn`; kept as a string because it is spliced into
# the SQL text verbatim.
max_monetary = "15000"

# Render the template. `<<predict_date>>` does not occur in this query, so the
# no-op replacement of that placeholder has been dropped.
_substitutions = {
    "<<threshold_date>>": threshold_date,
    "<<project_id>>": project_id,
    "<<dataset_id>>": dataset_id,
    "<<max_monetary>>": max_monetary,
}
for _placeholder, _value in _substitutions.items():
    QUERY = QUERY.replace(_placeholder, _value)

# Show the rendered SQL as the cell output.
QUERY

"\nSELECT\n  tf.customer_id,\n  -- For training period\n  -- Copying the calculations from Lifetimes where first orders are ignored\n  -- See https://github.com/CamDavidsonPilon/lifetimes/blob/master/lifetimes/utils.py#L246\n--[START features_target]\n  tf.monetary_dnn,\n  tf.cnt_orders AS frequency_dnn,\n  tf.cnt_orders - 1 AS frequency_btyd,\n  tf.recency,\n  tf.T,\n  ROUND(tf.recency/cnt_orders, 2) AS time_between,\n  ROUND(tf.avg_basket_value, 2) AS avg_basket_value,\n  ROUND(tf.avg_basket_size, 2) AS avg_basket_size,\n  tf.cnt_returns,\n  -- Target calculated for overall period\n  ROUND(tt.target_monetary, 2) as target_monetary\n--[END features_target]\nFROM\n  -- This SELECT uses only data before threshold to make features.\n  (\n    SELECT\n      customer_id,\n      SUM(order_value) AS monetary_dnn,\n      DATE_DIFF(MAX(order_date), MIN(order_date), DAY) AS recency,\n      DATE_DIFF(DATE('2011-08-08'), MIN(order_date), DAY) AS T,\n      COUNT(DISTINCT order_date) AS cnt_orders,\

In [8]:
feature_table = 'features'

# `project_id` and `dataset_id` are deliberately NOT re-declared here: QUERY
# has already been rendered with the values from the parameter cell, so
# changing them at this point would move the destination table without
# changing the tables the query reads from.

# Destination table for the features, built from `project_id` explicitly so
# it lives next to the `data_cleaned` table the query reads.
table_ref = bigquery.DatasetReference(project_id, dataset_id).table(feature_table)

job_config = bigquery.QueryJobConfig()
job_config.destination = table_ref
# Create the table when it is missing and overwrite its content on every run.
job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE

query_job = client.query(QUERY, location='US', job_config=job_config)  # API request
rows = iter(query_job.result())  # Waits for query to finish

# Preview at most five rows. `zip` stops as soon as either side is exhausted,
# so a result with fewer than five rows no longer raises StopIteration.
for _, row in zip(range(5), rows):
    print(row)

Row(('13461', 884.2, 2, 1, 25, 40, 12.5, 442.1, 418.0, 0, 1445.0), {'customer_id': 0, 'monetary_dnn': 1, 'frequency_dnn': 2, 'frequency_btyd': 3, 'recency': 4, 'T': 5, 'time_between': 6, 'avg_basket_value': 7, 'avg_basket_size': 8, 'cnt_returns': 9, 'target_monetary': 10})
Row(('16442', 437.03, 2, 1, 133, 237, 66.5, 218.51, 177.0, 0, 616.79), {'customer_id': 0, 'monetary_dnn': 1, 'frequency_dnn': 2, 'frequency_btyd': 3, 'recency': 4, 'T': 5, 'time_between': 6, 'avg_basket_value': 7, 'avg_basket_size': 8, 'cnt_returns': 9, 'target_monetary': 10})
Row(('14456', 636.65, 2, 1, 11, 19, 5.5, 318.32, 288.0, 0, 3047.63), {'customer_id': 0, 'monetary_dnn': 1, 'frequency_dnn': 2, 'frequency_btyd': 3, 'recency': 4, 'T': 5, 'time_between': 6, 'avg_basket_value': 7, 'avg_basket_size': 8, 'cnt_returns': 9, 'target_monetary': 10})
Row(('17516', 455.21, 2, 1, 68, 162, 34.0, 227.6, 154.5, 0, 768.08), {'customer_id': 0, 'monetary_dnn': 1, 'frequency_dnn': 2, 'frequency_btyd': 3, 'recency': 4, 'T': 5, 't