Copyright (c) Microsoft Corporation. 
Licensed under the MIT license. 
# Data Engineering

After cleaning the data, we transform it to capture metrics that are relevant for ML modeling. These metrics capture information related to:
* Users & sessions
* Buying behavior
* Product details - brand, category, subcategories, product

Results are written to the delta lake.


## Library Imports


In [None]:
import pyspark
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *

## Read in Data from Delta Lake

In [None]:
# Storage account name and file system (container) name that are interpolated
# into the abfss:// paths used by the cells below.
# Both are empty placeholders here -- presumably to be filled in before running.
data_lake_account_name = ''
file_system_name = ''

In [None]:
# Placeholder only: full_dataset is reassigned to the loaded DataFrame in the
# following cell.
full_dataset = ''

In [None]:
# Location of the cleaned electronics data in the delta lake.
paths = [f'abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/transformed_data/cleaned_data_electronics']

# Pass the single path explicitly rather than star-unpacking the list:
# DataFrameReader.load() takes (path, format, schema) positionally, so any
# additional list entry would be misread as the format/schema arguments.
full_dataset = spark.read.format("delta").load(paths[0])

In [None]:
# Derive calendar month and year from the event timestamp, drop the raw
# category_code column, and keep the remaining columns in a fixed order.
column_order = [
    'user_id', 'year', 'month', 'event_type', 'product_id', 'category_id',
    'category', 'subcategory', 'brand', 'price', 'user_session', 'event_time',
]
cleaned_df = (
    full_dataset
    .withColumn('month', F.month('event_time'))
    .withColumn('year', F.year('event_time'))
    .drop('category_code')
    .select(*column_order)
)

In [None]:
# Persist cleaned_df as an intermediate Delta table, overwriting both the data
# and the schema already stored at that location.
cleaned_df_target = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/intermediate_tables/cleaned_df"
cleaned_df_writer = cleaned_df.write.format("delta").mode("overwrite")
cleaned_df_writer = cleaned_df_writer.option("overwriteSchema", "true")
cleaned_df_writer.save(cleaned_df_target)

In [None]:
# Reload cleaned_df from the intermediate Delta table so the cells below work
# from the stored copy.
cleaned_df_source = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/intermediate_tables/cleaned_df"
cleaned_df = spark.read.format("delta").load(cleaned_df_source)

## Data Transformation

### Growth Indicator

Classify customers as growth (1) or no growth (0) based on the month-over-month change in net revenue.

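1. Growth if there is a >10% net revenue increase
1. No growth if there is a >10% net revenue decrease

Customer-months whose change is within ±10%, or that have no earlier month of purchases to compare against, receive no label and are excluded.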


In [None]:
# Monthly revenue: summed price of purchase events per user, year and month.
purchase_events = cleaned_df.filter(F.col('event_type') == 'purchase')
growth = (
    purchase_events
    .withColumn('revenue', F.col('price').cast('double'))
    .groupBy('user_id', 'year', 'month')
    .sum('revenue')
    .withColumnRenamed('sum(revenue)', 'total_net_revenue')
    .orderBy('user_id', 'year', 'month')
)

In [None]:
# Attach each user's previous revenue figure and the change against it.
from pyspark.sql.window import Window
from pyspark.sql.functions import lag

# One window per user, rows in chronological (year, month) order.
window_specs = Window.partitionBy('user_id').orderBy('user_id', 'year', 'month')

# lag() reads the preceding row of the window; the first row per user gets null.
# NOTE(review): `growth` only has rows for months with purchases, so the
# preceding row is not necessarily the prior calendar month -- confirm intended.
previous_revenue = lag(growth.total_net_revenue).over(window_specs).cast('double')
growth_lag = growth.withColumn('last_month_revenue', previous_revenue)

revenue_change = (growth_lag.total_net_revenue - growth_lag.last_month_revenue).cast('double')
growth_delta = growth_lag.withColumn('delta_net_revenue', revenue_change)

In [None]:
# Label each user-month as growth (1) or no growth (0).
# The label is set only when revenue moved by more than 10% relative to the
# previous revenue figure; rows left unlabeled are filtered out.

pct_change = F.col('delta_net_revenue') / F.col('last_month_revenue').cast('double')
df_growth_a = growth_delta.withColumn('percent_delta_revenue', pct_change)

# No otherwise(): changes within +/-10% (and null changes) stay null.
growth_label = (
    F.when(F.col('percent_delta_revenue') > .1, 1)
    .when(F.col('percent_delta_revenue') < -.1, 0)
)
df_growth = (
    df_growth_a
    .withColumn('growth', growth_label)
    .drop('last_month_revenue', 'delta_net_revenue', 'total_net_revenue', 'percent_delta_revenue')
    .filter(F.col('growth').isNotNull())
)

## Aggregated Metrics

Transform data to produce metrics related to user sessions, buying behavior, and product categories. All features are aggregated on a per-user, per-month basis.

### Session & Buying Metrics

* Number of sessions
* Average session duration
* Average conversion rate
* Average order value
* Average cart abandon rate


In [None]:
# Number of distinct sessions per user per month
user_month_keys = ['user_id', 'year', 'month']
distinct_sessions = F.countDistinct('user_session').alias('sessions_per_user_per_month')
sessions_per_user_per_month = (
    cleaned_df
    .groupBy(*user_month_keys)
    .agg(distinct_sessions)
    .fillna({'sessions_per_user_per_month': 0})
    .orderBy(*user_month_keys)
)

In [None]:
# Average session duration per user per month.
# A session's duration is the difference between the unix timestamps of its
# last and first event within the (user, year, month, session) group.
session_start = F.unix_timestamp(F.min('event_time')).alias('session_start_time')
session_end = F.unix_timestamp(F.max('event_time')).alias('session_end_time')

session_durations = (
    cleaned_df
    .groupBy('user_id', 'year', 'month', 'user_session')
    .agg(session_start, session_end)
    .withColumn('session_duration', F.col('session_end_time') - F.col('session_start_time'))
    .drop('user_session', 'session_start_time', 'session_end_time')
)

mean_duration = F.mean('session_duration').cast('double')
avg_session_duration_per_user_per_month = (
    session_durations
    .groupBy('user_id', 'year', 'month')
    .agg(mean_duration.alias('avg_session_duration_per_user_per_month'))
    .orderBy('user_id', 'year', 'month')
)

#avg_session_duration_per_user_per_month.orderBy(desc('avg_session_duration_per_user_per_month')).show(5)

In [None]:
# Conversion rate per user per month: purchase events divided by view events.
# when() without otherwise() yields null for non-matching rows, and count()
# skips nulls, so each expression counts only its own event type.
views_in_month = F.count(F.when(F.col('event_type') == 'view', True)).alias('num_views')
purchases_in_month = F.count(F.when(F.col('event_type') == 'purchase', True)).alias('num_purchases')
conversion_rate = (F.col('num_purchases') / F.col('num_views')).cast('double')

avg_conversion_rate_per_user_per_month = (
    cleaned_df
    .groupBy('user_id', 'year', 'month')
    .agg(views_in_month, purchases_in_month)
    .fillna({'num_views': 0, 'num_purchases': 0})
    .withColumn('avg_conversion_rate_per_user_per_month', conversion_rate)
    .drop('num_views', 'num_purchases')
    .orderBy('user_id', 'year', 'month')
)

#avg_conversion_rate_per_user_per_month.orderBy(desc('avg_conversion_rate_per_user_per_month')).show(5)

In [None]:
# Average order value per user per month: mean price over purchase events only.
purchases_only = cleaned_df.filter(F.col('event_type') == 'purchase')
mean_purchase_price = F.mean('price').cast('double')
avg_order_value_per_user_per_month = (
    purchases_only
    .groupBy('user_id', 'year', 'month')
    .agg(mean_purchase_price.alias('avg_order_value_per_user_per_month'))
    .orderBy('user_id', 'year', 'month')
)

#avg_order_value_per_user_per_month.show(5)

In [None]:
# avg_cart_abandon_rate
# Share of add-to-cart events that were not followed by a purchase, computed
# per (user, year, month, session, product) and then averaged per user-month.
#
# The pivot values are listed explicitly so that both the 'cart' and the
# 'purchase' column always exist (even if one event type never occurs in the
# data) and Spark does not need an extra pass to discover the distinct values.
abandon_rate_per_session = cleaned_df.filter((col('event_type') == 'purchase') | (col('event_type') == 'cart')) \
    .groupBy('user_id', 'year', 'month', 'user_session', 'product_id') \
    .pivot('event_type', ['cart', 'purchase']).agg(count('product_id')) \
    .fillna({'cart':0, 'purchase':0}) \
    .withColumn('cart_abandon_rate', (col('cart')-col('purchase'))/col('cart'))

# mean() skips null rates, i.e. groups where the division above yielded null.
avg_cart_abandon_rate = abandon_rate_per_session.groupBy('user_id', 'year', 'month') \
    .agg(mean('cart_abandon_rate').cast('double').alias('avg_cart_abandon_rate'))

#avg_cart_abandon_rate.show(5)

### Brand, Subcategory, & Product Metrics

For the top 5 most popular values in each product-related category (brand, subcategory, and product_id), identify the frequency of user clickstream interactions (product views, add to cart, and purchases).

In [None]:
# reusable function
def get_top_5(df, event_type, match_type):
    """Build per-user, per-month interaction features for the top 5 values.

    The 5 values of ``match_type`` with the most ``event_type`` events are
    selected, and for each of them the result carries two columns:
    ``{match_type}_{value}_{suffix}_count`` (number of matching events, 0 when
    none) and ``{match_type}_{value}_{suffix}_binary`` (1 if any, else 0).
    ``suffix`` is 'viewed' for 'view', 'added' for 'cart' and 'purchased' for
    any other event type.

    Args:
        df: DataFrame with user_id, year, month, event_type, user_session and
            the ``match_type`` column.
        event_type: clickstream activity to keep (view, cart, purchase).
        match_type: product-related column to rank (brand, subcategory,
            product_id).

    Returns:
        DataFrame keyed by user_id, year, month with the count and binary
        columns described above. Only user-months having at least one matching
        event for a top value are present.
    """
    matching_events = df.filter(col('event_type') == event_type)

    # Rank values by number of events. Nulls are excluded because a null value
    # cannot be matched by isin() or turned into a column below, and the value
    # itself is used as a tie-breaker so the selection is reproducible.
    top_rows = (
        matching_events
        .filter(col(match_type).isNotNull())
        .groupBy(match_type)
        .agg(count('user_session').alias('event_count'))
        .orderBy(desc('event_count'), match_type)
        .limit(5)
        .collect()
    )
    top_5_list = [row[match_type] for row in top_rows]

    # Pivot only on the known top values: this avoids the extra job Spark runs
    # to discover distinct pivot values. Sorting keeps the column order that an
    # implicit pivot would produce; an empty list falls back to the implicit
    # form.
    pivot_values = sorted(top_5_list) or None
    top_5_df = (
        matching_events
        .where(col(match_type).isin(top_5_list))
        .groupBy('user_id', 'year', 'month')
        .pivot(match_type, pivot_values)
        .agg(count('user_session'))
    )

    # reformat types / naming convention
    suffix = {'view': 'viewed', 'cart': 'added'}.get(event_type, 'purchased')

    # convert to binary & count columns
    for value in top_5_list:
        # Pivot output columns are named after the value's string form, so the
        # same string is used for both the lookup and the rename.
        pivot_col = str(value)
        count_name = f'{match_type}_{value}_{suffix}_count'
        binary_name = f'{match_type}_{value}_{suffix}_binary'
        top_5_df = (
            top_5_df
            .withColumn(binary_name, when(col(pivot_col).isNotNull(), 1).otherwise(0))
            .withColumnRenamed(pivot_col, count_name)
            .fillna({count_name: 0})
        )

    return top_5_df

In [None]:
# One top-5 feature DataFrame per product dimension and clickstream event.
# Events are processed in the order view, cart, purchase, matching the
# viewed / added / purchased variable names.
clickstream_events = ('view', 'cart', 'purchase')

# brands
top_brands_viewed, top_brands_added, top_brands_purchased = [
    get_top_5(cleaned_df, event, 'brand') for event in clickstream_events
]

# subcategories
top_subcategories_viewed, top_subcategories_added, top_subcategories_purchased = [
    get_top_5(cleaned_df, event, 'subcategory') for event in clickstream_events
]

# products
top_products_viewed, top_products_added, top_products_purchased = [
    get_top_5(cleaned_df, event, 'product_id') for event in clickstream_events
]

## Join DataFrames into Single DataFrame

In [None]:
# join dfs
def join_dfs (df_list):
    """Left-join every DataFrame in df_list onto the global df_growth.

    Each join uses the user_id, year and month columns as keys, so the result
    keeps exactly the labelled user-months of df_growth and gains the columns
    of each feature DataFrame in list order.
    """
    join_keys = ['user_id', 'year', 'month']
    result = df_growth
    for feature_df in df_list:
        result = result.join(feature_df, join_keys, how='left')
    return result

# Feature DataFrames to attach to the growth labels, in output column order.
feature_tables = [
    sessions_per_user_per_month,
    avg_session_duration_per_user_per_month,
    avg_conversion_rate_per_user_per_month,
    avg_order_value_per_user_per_month,
    avg_cart_abandon_rate,
    top_brands_viewed, top_brands_added, top_brands_purchased,
    top_subcategories_viewed, top_subcategories_added, top_subcategories_purchased,
    top_products_viewed, top_products_added, top_products_purchased,
]

# The left joins leave nulls where a feature DataFrame has no row for a
# user-month; fillna(0) replaces those nulls with 0.
joined_features = join_dfs(feature_tables)
features_df = joined_features.fillna(0)

# display(features_df.take(15))

## Save Transformed Data to a Delta Table


In [None]:
# Persist the final feature set as a Delta table, overwriting both the data
# and the schema already stored at that location.
features_target = f'abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/transformed_data/transformed_data'
features_writer = features_df.write.format('delta').mode('overwrite')
features_writer = features_writer.option("overwriteSchema", "true")
features_writer.save(features_target)