In [None]:
# Install cassandra-driver using the pip binary of the iu_py27 conda
# environment (presumably the environment this kernel runs in - confirm).
# NOTE(review): the version is not pinned, so a re-run may pull a different
# driver release than the one this notebook was developed against.
!/opt/conda/envs/iu_py27/bin/pip install cassandra-driver

In [1]:
# packages
import getpass
import os
from datetime import datetime, timedelta

from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
from pyspark.sql.functions import (
    col,
    count,
    countDistinct,
    create_map,
    dayofweek,
    posexplode,
    from_unixtime,
    greatest,
    hour,
    lit,
    max,
    minute,
    when,
)

# Set the variables for cassandra

# Cassandra contact points, keyed by deployment name.
hostnames = {
    'single_node' : '10.241.0.110',
    'cluster' : '10.128.175.71'
}
# Key into `hostnames` selecting which deployment this notebook talks to.
hostname = 'cluster'

# Credentials are taken from the environment so that no secret is stored in
# the notebook source; when CASSANDRA_PASSWORD is unset the user is prompted
# interactively instead.
# NOTE(review): a password was previously committed in this cell in plain
# text - it should be rotated.
username = os.environ.get('CASSANDRA_USERNAME', 'featurestore-superuser')
password = (
    os.environ.get('CASSANDRA_PASSWORD')
    or getpass.getpass('Cassandra password for {0}: '.format(username))
)
auth_provider = PlainTextAuthProvider(
    username=username, password=password
)

# Snapshot day to migrate, formatted as YYYYMMDD to match the `log_date` /
# `dt` filters used in the source queries.
target_date = datetime(2022,7,1).strftime('%Y%m%d')
# Destination table and keyspace in Cassandra.
target_table = "demographics"
target_keyspace = "featurestore"

In [7]:
#====================================================================#
# create target table                                                #
#====================================================================#

# Open a driver connection to the deployment selected by `hostname`.
# The single-node deployment is contacted without credentials; the cluster
# deployment is contacted on port 9042 with `auth_provider`.
if hostname == 'single_node':
    cluster = Cluster([hostnames[hostname]])
elif hostname == 'cluster':
    cluster = Cluster(
        [hostnames[hostname]],
        port=9042,
        auth_provider=auth_provider
    )
else:
    # Fail with a clear message instead of a NameError on `cluster` below.
    raise ValueError(
        "unknown hostname {0!r}; expected one of {1}"
        .format(hostname, sorted(hostnames))
    )

session = cluster.connect()

# The keyspace name comes from `target_keyspace` so that the DDL and the
# Spark write/read (which use `keyspace=target_keyspace`) always agree.
# The literal CQL braces are doubled so that str.format leaves them intact.
# NOTE(review): `durable_writes = false` and replication_factor 1 trade
# durability for speed - confirm this is intended outside of testing.
create_keyspace = """
CREATE KEYSPACE IF NOT EXISTS {target_keyspace}
WITH REPLICATION = {{ 'class' : 'SimpleStrategy', 'replication_factor' : '1' }}
AND durable_writes = false
""".format(target_keyspace=target_keyspace)

session.execute(create_keyspace)

# One row per (log_date, mid): both columns together form the partition key.
create_table = """
CREATE TABLE IF NOT EXISTS {target_keyspace}.{target_table}
( mid text
, log_date text
, age_range text
, sex text
, region text
, occupation text
, PRIMARY KEY ((log_date, mid))
)
"""

session.execute(
    create_table.format(
        target_keyspace=target_keyspace,
        target_table=target_table
    )
)

<cassandra.cluster.ResultSet at 0x7f7aa8645a10>

In [2]:
#====================================================================#
# source tables from IU                                              #
#====================================================================#

# Demographic attributes (age range, sex, region) for the target day.
query1 = """
select
    mid,
    age_range,
    sex,
    region,
    log_date
from
    linewallet_business_pro.w_user_base
where
    log_date='{target_date}'
"""

# Occupation per user for the target day; `dt` is exposed as `log_date`
# so that both sources share the same join key names.
query2 = """
select
    mid,
    occupation,
    dt as log_date
from
    linewallet_pro.wallet_user_segments
where
    dt='{target_date}'
"""

query_params = {'target_date': target_date}
w_user_base = spark.sql(query1.format(**query_params))
wallet_user_segments = spark.sql(query2.format(**query_params))

#====================================================================#
# Join the features to the IU source table                           #
#====================================================================#

# Left outer join: every user of w_user_base is kept, and `occupation`
# is null where wallet_user_segments has no matching (mid, log_date).
join_keys = ['mid', 'log_date']
result = w_user_base.join(wallet_user_segments, join_keys, 'leftouter')

In [11]:
#====================================================================#
# Insert the features to cassandra                                   #
#====================================================================#

# Report the size of the joined source data (this runs a full Spark job).
print(
    "size of the {target_table} in IU : {size}"
    .format(
        target_table = target_table,
        size = format(result.count(), ',')
    )
)
# The START timestamp is taken before sampling, so the reported duration
# covers both extracting the sample and writing it.
print(
    "[{start_time}] Insert START - table : {target_table}"
    .format(
        start_time = datetime.now(),
        target_table = target_table
    )
)

# extract 10 samples
sample = result.head(10)
# Reuse the schema of `result` instead of letting Spark infer one from the
# collected rows: inference fails when a column (e.g. `occupation`) is None
# in every sampled row, and also when the sample is empty.
df_sample = spark.createDataFrame(sample, schema=result.schema)

# write to cassandra
# "overwrite" replaces the current contents of the target table; the
# connector only allows it when `confirm.truncate` is set to true.
(
    df_sample
    .write
    .format("org.apache.spark.sql.cassandra")
    .mode("overwrite")
    .option("confirm.truncate", "true")
    .options(table=target_table, keyspace=target_keyspace)
    .save()
)
print(
    "[{end_time}] Insert DONE - table : {target_table}"
    .format(
        end_time = datetime.now(),
        target_table = target_table
    )
)

#====================================================================#
# Verify the migrated features in cassandra                          #
#====================================================================#

# Read the table back through the connector and display its rows.
df_cassandra = (
    spark
    .read
    .format("org.apache.spark.sql.cassandra")
    .options(table=target_table, keyspace=target_keyspace)
    .load()
)
df_cassandra.show()

size of the demographics in IU : 16,534,604
[2022-08-11 14:44:38.739034] Insert START - table : demographics
[2022-08-11 14:55:59.722925] Insert DONE - table : demographics
+--------------------+--------+---------+---+------+----------+
|                 mid|log_date|age_range|sex|region|occupation|
+--------------------+--------+---------+---+------+----------+
|u0000610c0dccbfd8...|20220701|    20-24|  m|    JP|         2|
|u0000e07e0bebd61e...|20220701|    25-29|  m|    JP|         2|
|u0001ac038cec74a1...|20220701|    25-29|  f|    TH|      null|
|u000227dc7e2be041...|20220701|    45-49|  f|    JP|         5|
|u0002b986c4f4dd6a...|20220701|    35-39|  f|    JP|         3|
|u0002bb891c9de44f...|20220701|    45-49|  f|    TH|      null|
|u0002c31acbb31169...|20220701|      50-|  m|    TH|      null|
|u0002e2b2d78a7ba9...|20220701|      50-|  m|    TH|      null|
|u0002e9193af290e3...|20220701|      50-|  f|    TW|      null|
|u000395fb0a2820d8...|20220701|      50-|  f|    JP|       

In [3]:
#====================================================================#
# Calculate statistics information                                   #
#====================================================================#

# Categorical features for which a label distribution is computed.
base_features_label = ['age_range', 'sex', 'region', 'occupation']
base_features_continuous = []

# The total row count is the same for every feature, so it is computed
# once here instead of launching a count job on each loop iteration.
total_rows = result.count()

for feature in base_features_label:
    # Per-label row count and its share of all rows, tagged with the
    # feature group (target table) and the feature name.
    statistics = (
        result
        .groupBy(feature).count().orderBy(feature)
        .withColumnRenamed(feature, 'label')
        .withColumn('probability', col('count') / lit(total_rows))
        .withColumn('base_feature_group', lit(target_table))
        .withColumn('base_feature', lit(feature))
    )
    statistics.show()

+-------+-------+--------------------+------------------+------------+
|  label|  count|         probability|base_feature_group|base_feature|
+-------+-------+--------------------+------------------+------------+
|    -14| 596976|0.036104644538206054|      demographics|   age_range|
|  15-19| 973014| 0.05884713053908034|      demographics|   age_range|
|  20-24|1632986| 0.09876172419974497|      demographics|   age_range|
|  25-29|1890018| 0.11430681980651003|      demographics|   age_range|
|  30-34|1776041| 0.10741357942409749|      demographics|   age_range|
|  35-39|1509270|  0.0912794766660272|      demographics|   age_range|
|  40-44|1586085| 0.09592518816900604|      demographics|   age_range|
|  45-49|1561801| 0.09445651072139376|      demographics|   age_range|
|    50-|4988504| 0.30170084508827666|      demographics|   age_range|
|unknown|  19909|0.001204080847657434|      demographics|   age_range|
+-------+-------+--------------------+------------------+------------+

+----