In [None]:
# Install the `cassandra-driver` package with the pip executable of the
# `iu_py27` conda environment (shell escape, runs outside the kernel).
# NOTE(review): no version is pinned, so the installed release depends on
# when this cell is run — consider pinning once a known-good version is
# confirmed for this environment.
!/opt/conda/envs/iu_py27/bin/pip install cassandra-driver

In [1]:
# packages
import os
from datetime import datetime, timedelta
from itertools import chain

from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
# NOTE: `max` below is the Spark aggregate and shadows the Python builtin.
from pyspark.sql.functions import (
    col,
    count,
    countDistinct,
    create_map,
    dayofweek,
    from_unixtime,
    greatest,
    hour,
    lit,
    max,
    minute,
    posexplode,
    struct,
    when,
)

# Set the variables for cassandra
# Known Cassandra contact points, keyed by deployment profile.
hostnames = {
    'single_node' : '10.241.0.110',
    'cluster' : '10.128.175.71'
}
# Profile to connect to; must be one of the keys of `hostnames`.
hostname = 'cluster'

# Credentials come from the environment so that no secret is stored in the
# notebook (or in its version history).
#   CASSANDRA_USERNAME - optional, defaults to the feature-store account
#   CASSANDRA_PASSWORD - required for the authenticated 'cluster' profile
# NOTE(review): the password that used to be committed here should be
# rotated, since it remains visible in earlier revisions of this file.
username = os.environ.get('CASSANDRA_USERNAME', 'featurestore-superuser')
password = os.environ.get('CASSANDRA_PASSWORD')
if hostname == 'cluster' and not password:
    raise RuntimeError(
        "CASSANDRA_PASSWORD is not set; export it in the environment of "
        "the notebook kernel before running this cell"
    )
auth_provider = PlainTextAuthProvider(
    username=username, password=password
)

# Date window of the extraction, rendered as 'YYYYMMDD' strings.
# `start_date` lies 27 days before `target_date`.
_date_format = '%Y%m%d'
_window_end = datetime(2022, 7, 1)
_window_start = _window_end - timedelta(days=27)

target_date = _window_end.strftime(_date_format)
start_date = _window_start.strftime(_date_format)

# Destination of the migrated features in Cassandra.
target_keyspace = "featurestore"
target_table = "ladm"

In [None]:
#====================================================================#
# create target table                                                #
#====================================================================#

# Open a driver connection for the configured profile.
# An if/elif/else chain is used so that an unknown `hostname` fails loudly
# instead of leaving `cluster` undefined -- or, in a long-lived kernel,
# silently reusing a `cluster` object left over from an earlier run.
if hostname == 'single_node':
    # The single-node profile is contacted without authentication.
    cluster = Cluster([hostnames[hostname]])
elif hostname == 'cluster':
    cluster = Cluster(
        [hostnames[hostname]],
        port=9042,
        auth_provider=auth_provider
    )
else:
    raise ValueError(
        "unknown hostname {hostname!r}; expected one of {known}".format(
            hostname=hostname, known=sorted(hostnames)
        )
    )

session = cluster.connect()

# The keyspace name is taken from `target_keyspace` so that the DDL and the
# Spark writer always address the same keyspace. Literal CQL braces are
# doubled because the statement goes through str.format.
create_keyspace = """
CREATE KEYSPACE IF NOT EXISTS {target_keyspace}
WITH REPLICATION = {{ 'class' : 'SimpleStrategy', 'replication_factor' : '1' }}
AND durable_writes = false
""".format(target_keyspace=target_keyspace)

session.execute(create_keyspace)

# `create_table` stays a template with a single `{target_table}` placeholder:
# the keyspace is substituted here, and the doubled braces turn into the
# `{target_table}` placeholder that is filled in by the call below.
create_table = """
CREATE TABLE IF NOT EXISTS {target_keyspace}.{{target_table}}
( mid text
, module_name text
, region text
, user_segment text
, view_count int
, click_count int
, ctr float
, log_date text
, PRIMARY KEY ((log_date, mid))
)
""".format(target_keyspace=target_keyspace)

session.execute(
    create_table.format(target_table=target_table)
)

In [6]:
#====================================================================#
# source tables from IU                                              #
#====================================================================#

# mid / user_segment / log_date rows of w_user_base inside the date window.
query1 = """
select
    mid,
    user_segment,
    log_date
from
    linewallet_business_pro.w_user_base
where
    log_date between '{start_date}' and '{target_date}'
"""

# Ladm event rows (mid, module_name, action, region, log_date) of
# w_tslog_base inside the same date window.
query2 = """
select
    mid,
    module_name,
    action,
    region,
    log_date
from
    linewallet_business_pro.w_tslog_base
where
    module_name in ('Ladm', 'LadmV3-2')
    and log_date between '{start_date}' and '{target_date}'
"""

# Both statements are filled in with the same window bounds.
_date_window = {'start_date': start_date, 'target_date': target_date}

w_user_base = spark.sql(query1.format(**_date_window))
w_tslog_base = spark.sql(query2.format(**_date_window))

#====================================================================#
# Generate the features to migrate                                   #
#====================================================================#

# user_segment
# One row per mid. The previous `dropDuplicates(['mid'])` kept an arbitrary
# row per user, so the chosen segment could change from run to run (and
# between Spark actions on the same DataFrame). Taking the maximum of
# struct(log_date, user_segment) selects the row with the latest log_date
# deterministically; ties are broken by user_segment.
# `max` here is pyspark.sql.functions.max, not the Python builtin.
# NOTE(review): assumes the most recent segment inside the window is the
# one wanted -- confirm against the feature definition.
user_segment = (
    w_user_base
    .groupBy('mid')
    .agg(max(struct('log_date', 'user_segment')).alias('latest'))
    .select('mid', col('latest.user_segment').alias('user_segment'))
)

# view & click, then ctr
# `when` without `otherwise` yields null, and `count` skips nulls, so each
# count only includes the rows of the matching action.
# ctr is click_count / view_count and is left null when a user has no
# views; the guard states that explicitly instead of relying on the
# engine's divide-by-zero handling.
view_click = (
    w_tslog_base
    .groupBy('mid')
    .agg(
        count(when(col('action') == 'view', True)).alias('view_count'),
        count(when(col('action') == 'click', True)).alias('click_count'),
    )
    .withColumn(
        'ctr',
        when(
            col('view_count') > 0,
            col('click_count') / col('view_count')
        )
    )
)

#====================================================================#
# Join the features to the IU source table                           #
#====================================================================#

# One output row per mid seen in the event log.
# The previous `dropDuplicates(['mid'])` kept the region of an arbitrary
# event per user, which made the output non-deterministic. The maximum of
# struct(log_date, region) picks the region of the latest log_date instead
# (ties broken by region). `max` is pyspark.sql.functions.max.
# NOTE(review): assumes the most recent region inside the window is the
# one wanted -- confirm.
result = (
    w_tslog_base
    .groupBy('mid')
    .agg(max(struct('log_date', 'region')).alias('latest'))
    .select('mid', col('latest.region').alias('region'))
    .join(user_segment, on=['mid'], how='leftouter')
    .join(view_click, on=['mid'], how='leftouter')
    # Both source module names ('Ladm', 'LadmV3-2') are reported as 'Ladm'.
    .withColumn('module_name', lit('Ladm'))
    # Every row is stamped with the end of the window, not the event date.
    .withColumn('log_date', lit(target_date))
    # Keep the column order the DataFrame had before this change.
    .select(
        'mid',
        'module_name',
        'region',
        'user_segment',
        'view_count',
        'click_count',
        'ctr',
        'log_date'
    )
)

In [7]:
#====================================================================#
# Insert the features to cassandra                                   #
#====================================================================#

# `result` feeds three Spark actions in this cell (count, head, write).
# Persisting it avoids re-running the source scans and joins for each
# action; it is released again once the write has finished.
result.persist()

print(
    "size of the {target_table} in IU : {size}"
    .format(
        target_table = target_table,
        size = format(result.count(), ',')
    )
)
print(
    "[{start_time}] Insert START - table : {target_table}"
    .format(
        start_time = datetime.now(),
        target_table = target_table
    )
)

# extract 10 samples
sample = result.head(10)
df_sample = spark.createDataFrame(sample)

# write to cassandra
# NOTE(review): mode("overwrite") together with confirm.truncate=true
# empties the target table before writing, so rows of earlier log_date
# values are removed even though log_date is part of the primary key --
# confirm that keeping only the latest snapshot is intended.
(
    result
    .write
    .format("org.apache.spark.sql.cassandra")
    .mode("overwrite")
    .option("confirm.truncate", "true")
    .options(table=target_table, keyspace=target_keyspace)
    .save()
)
print(
    "[{end_time}] Insert DONE - table : {target_table}"
    .format(
        end_time = datetime.now(),
        target_table = target_table
    )
)

# Release the cached data; `result` itself remains usable afterwards.
result.unpersist()

#====================================================================#
# Verify the migrated features in cassandra                          #
#====================================================================#

# Read the target table back through the Spark Cassandra data source and
# print its first rows for a visual check.
_cassandra_reader = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(keyspace=target_keyspace, table=target_table)
)
df_cassandra = _cassandra_reader.load()
df_cassandra.show()

Insert DONE - table : ladm
size of the ladm in IU : 35,772,851
size of the ladm in Cassandra : 10
+--------+--------------------+-----------+---+-----------+------+------------+----------+
|log_date|                 mid|click_count|ctr|module_name|region|user_segment|view_count|
+--------+--------------------+-----------+---+-----------+------+------------+----------+
|20220701|u0000436bd98fb322...|          0|0.0|       Ladm|    TH|New/Comeback|         1|
|20220701|u0001549713da2d09...|          0|0.0|       Ladm|    JP|New/Comeback|         1|
|20220701|u000094f4e9fa8774...|          0|0.0|       Ladm|    TH|    Wanderer|         4|
|20220701|u00011af123590a90...|          0|0.0|       Ladm|    JP|   Potential|         2|
|20220701|u00024fea2e6e6f1e...|          0|0.0|       Ladm|    TH|New/Comeback|         1|
|20220701|u0003369605e646e5...|          0|0.0|       Ladm|    TW|    Wanderer|         3|
|20220701|u00005093b7a2f372...|          0|0.0|       Ladm|    TW| Go-straight|    