In [None]:
# Install the DataStax Cassandra Python driver (the `cassandra` package
# imported further down) using the pip binary of the `iu_py27` conda env.
# NOTE(review): the version is unpinned, so a re-run may pull a different
# release; pin it once the version supported by this environment is confirmed.
# NOTE(review): the absolute pip path assumes the kernel runs in `iu_py27`
# - confirm, otherwise the package lands in a different environment.
!/opt/conda/envs/iu_py27/bin/pip install cassandra-driver

In [6]:
# packages
import os
from datetime import datetime, timedelta

from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster
from pyspark.sql.functions import (
    col,
    count,
    countDistinct,
    create_map,
    dayofweek,
    posexplode,
    from_unixtime,
    greatest,
    hour,
    lit,
    max,
    minute,
    when,
)

# Set the variables for cassandra
# `hostnames` maps a deployment label to its contact point; `hostname`
# selects which deployment the rest of the notebook connects to.
hostnames = {
    'single_node' : '10.241.0.110',
    'cluster' : '10.128.175.71'
}
hostname = 'single_node'

# Credentials come from the environment so that no secret is stored in the
# notebook (or in its version-control history). Export CASSANDRA_USERNAME /
# CASSANDRA_PASSWORD before starting the kernel.
# NOTE(review): a password was previously committed in this cell in plain
# text - it should be rotated.
username = os.environ.get('CASSANDRA_USERNAME', 'featurestore-superuser')
password = os.environ.get('CASSANDRA_PASSWORD', '')

# Only the 'cluster' deployment is connected to with an auth provider, so
# only that one requires a password; fail here rather than at connect time.
if hostname == 'cluster' and not password:
    raise RuntimeError(
        "CASSANDRA_PASSWORD must be set in the environment "
        "to connect to the 'cluster' deployment"
    )

auth_provider = PlainTextAuthProvider(
    username=username, password=password
)

# Partition date (formatted as YYYYMMDD) and the Cassandra destination.
target_date = datetime(2022,7,1).strftime('%Y%m%d')
target_table = "mobile_activities"
target_keyspace = "featurestore"

In [None]:
#====================================================================#
# create target table                                                #
#====================================================================#

# Build the cluster handle for the selected deployment. The branches are
# mutually exclusive, and an unknown label is rejected explicitly instead
# of surfacing later as a NameError on `cluster`.
if hostname == 'single_node':
    cluster = Cluster([hostnames[hostname]])
elif hostname == 'cluster':
    cluster = Cluster(
        [hostnames[hostname]],
        port=9042,
        auth_provider=auth_provider
    )
else:
    raise ValueError(
        "unknown hostname {!r}; expected one of {}"
        .format(hostname, sorted(hostnames))
    )

# NOTE(review): the session/cluster are left open for later cells; call
# cluster.shutdown() once the notebook no longer needs them.
session = cluster.connect()

# The keyspace name comes from `target_keyspace` so the DDL always matches
# the keyspace used by the Spark write/read. Literal braces of the CQL map
# are doubled because the statement goes through str.format.
# NOTE(review): SimpleStrategy with replication_factor 1 and
# durable_writes = false keeps a single, non-commit-logged copy - confirm
# this is intended outside of testing.
create_keyspace = """
CREATE KEYSPACE IF NOT EXISTS {target_keyspace}
WITH REPLICATION = {{ 'class' : 'SimpleStrategy', 'replication_factor' : '1' }}
AND durable_writes = false
""".format(target_keyspace=target_keyspace)

session.execute(create_keyspace)

# Only the keyspace is substituted here; `{{target_table}}` survives as
# `{target_table}`, so `create_table` remains a template that is completed
# with .format(target_table=...) below.
# The partition key is the composite (log_date, mid).
create_table = """
CREATE TABLE IF NOT EXISTS {target_keyspace}.{{target_table}}
( mid text
, app_type text
, app_ver text
, region text
, log_date text
, PRIMARY KEY ((log_date, mid))
)
""".format(target_keyspace=target_keyspace)

session.execute(
    create_table.format(target_table=target_table)
)

In [3]:
#====================================================================#
# source tables from IU                                              #
#====================================================================#

# SQL template for the source rows; `{target_date}` is substituted with the
# log_date partition value before the query is handed to Spark.
query1 = """
select
    mid,
    app_type,
    app_ver,
    region,
    log_date
from
    linewallet_business_pro.w_user_base
where
    log_date='{target_date}'
"""

#====================================================================#
# Generate the features of mobile activities                         #
#====================================================================#

# Render the template first, then run it; `result` is the DataFrame every
# later cell works from.
source_sql = query1.format(target_date=target_date)
result = spark.sql(source_sql)

In [8]:
#====================================================================#
# Insert the features to cassandra                                   #
#====================================================================#

# NOTE: result.count() is a Spark action - it executes the source query.
print(
    "size of the {target_table} in IU : {size}"
    .format(
        target_table = target_table,
        size = format(result.count(), ',')
    )
)
print(
    "[{start_time}] Insert START - table : {target_table}"
    .format(
        start_time = datetime.now(),
        target_table = target_table
    )
)

# extract 10 samples
# The source schema is passed explicitly: without it createDataFrame has to
# infer types from the collected rows, which fails when the sample is empty
# or when a column is null in every sampled row.
sample = result.head(10)
df_sample = spark.createDataFrame(sample, schema=result.schema)

# write to cassandra
# "overwrite" truncates the target table before writing (the connector
# requires confirm.truncate for that), so afterwards the table holds only
# the sampled rows.
(
    df_sample
    .write
    .format("org.apache.spark.sql.cassandra")
    .mode("overwrite")
    .option("confirm.truncate", "true")
    .options(table=target_table, keyspace=target_keyspace)
    .save()
)
print(
    "[{end_time}] Insert DONE - table : {target_table}"
    .format(
        end_time = datetime.now(),
        target_table = target_table
    )
)

#====================================================================#
# Verify the migrated features in cassandra                          #
#====================================================================#

# Read the table back through the connector and report how many rows
# landed, then display them for a visual check.
df_cassandra = (
    spark
    .read
    .format("org.apache.spark.sql.cassandra")
    .options(table=target_table, keyspace=target_keyspace)
    .load()
)
print(
    "size of the {target_table} in Cassandra : {size}"
    .format(
        target_table = target_table,
        size = format(df_cassandra.count(), ',')
    )
)
df_cassandra.show()

Insert DONE - table : mobile_activities
size of the mobile_activities in IU : 16,534,604
size of the mobile_activities in Cassandra : 10
+--------+--------------------+--------+-------+------+
|log_date|                 mid|app_type|app_ver|region|
+--------+--------------------+--------+-------+------+
|20220701|u0108cfad4bd5a49d...|     IOS|12.10.0|    ID|
|20220701|u00d3f847f2c6f5dd...|     IOS| 12.9.0|    ID|
|20220701|u00157b35fcd7af4b...| ANDROID|11.19.1|    ID|
|20220701|u007652735cfe4eda...| ANDROID|12.10.1|    ID|
|20220701|u00205b81e0c9f539...| ANDROID|12.10.1|    ID|
|20220701|u0120c75fbeaef4a7...| ANDROID| 12.6.1|    ID|
|20220701|u0044d7c7d7876d24...|     IOS|12.10.0|    ID|
|20220701|u012b0dd82f1e51d1...| ANDROID|12.10.1|    ID|
|20220701|u012c086d0d696ac5...| ANDROID| 12.8.0|    ID|
|20220701|u01446a6b423882a5...|     IOS| 12.9.0|    ID|
+--------+--------------------+--------+-------+------+



In [7]:
#====================================================================#
# Calculate statistics information                                   #
#====================================================================#

# Categorical columns to summarise; no continuous features are listed.
base_features_label = ['app_type', 'app_ver', 'region']
base_features_continuous = []

# The row total is the same for every feature, so the (expensive) count
# action runs once here instead of once per loop iteration.
total_rows = result.count()

for feature in base_features_label:
    # Per-label frequency of `feature`, ordered by label, plus each label's
    # share of all rows and the identifying group / feature names.
    statistics = (
        result
        .groupBy(feature).count().orderBy(feature)
        .withColumnRenamed(feature, 'label')
        .withColumn('probability', col('count') / lit(total_rows))
        .withColumn('base_feature_group', lit(target_table))
        .withColumn('base_feature', lit(feature))
    )
    statistics.show()

+-------+-------+------------------+------------------+------------+
|  label|  count|       probability|base_feature_group|base_feature|
+-------+-------+------------------+------------------+------------+
|ANDROID|7130923|0.4312726812205481| mobile_activities|    app_type|
|    IOS|9403681|0.5687273187794518| mobile_activities|    app_type|
+-------+-------+------------------+------------------+------------+

+-------+-----+--------------------+------------------+------------+
|  label|count|         probability|base_feature_group|base_feature|
+-------+-----+--------------------+------------------+------------+
| 10.0.0|  250|1.511980571170618...| mobile_activities|     app_ver|
| 10.0.1|  903|5.461273823068275E-5| mobile_activities|     app_ver|
| 10.0.2| 1038| 6.27774333150041E-5| mobile_activities|     app_ver|
| 10.1.0|   68|4.112587153584083E-6| mobile_activities|     app_ver|
| 10.1.1| 2095|1.267039718640978...| mobile_activities|     app_ver|
|10.10.0|  739| 4.46941456838035E