# Data Prep - Product Entity
For products, much like invoices, we can summarize purchasing statistics for each individual product to get a sense of how it performs under certain conditions (time of year, whether it is at a discount or not).

Additionally, I will add features that describe the position of each product with respect to the whole catalogue, such as the product's popularity in terms of number of purchases and the total amount spent on it, amongst other facts.

In essence, I am implementing an "embedding" of sorts, as I am representing the nominal entity (the name of a product) using a continuous value that has some inherent order to it.

In [1]:
!pip install inflection nb_black >> ../configs/package_installation.txt

In [7]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

<IPython.core.display.Javascript object>

In [8]:
# PySpark dependencies:
import pyspark
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.window import Window

# database utilities:
import pandas as pd

# other relevant libraries:
import warnings
import inflection
import unicodedata
from datetime import datetime, timedelta
import json
import re
import os
from glob import glob
import shutil
import itertools

# setting global parameters for visualizations:
warnings.filterwarnings("ignore")  # hides every Python warning from here on
pd.set_option("display.precision", 4)
# explicit formatter: pandas renders floats with two decimal places
pd.set_option("display.float_format", lambda x: "%.2f" % x)

<IPython.core.display.Javascript object>

# 0. Building Spark Session

In [9]:
# loading the configurations needed for Spark
def init_spark(app_name):
    """Build (or fetch) the Hive-enabled Spark session used by this notebook.

    Args:
        app_name (str): name given to the Spark application

    Returns:
        spark (pyspark.sql.session.SparkSession): the configured session
    """
    # session-level settings, applied in the order they are listed:
    settings = [
        ("spark.files.overwrite", "true"),
        ("spark.sql.repl.eagerEval.enabled", True),
        ("spark.sql.repl.eagerEval.maxNumRows", 5),
        ("spark.sql.legacy.timeParserPolicy", "LEGACY"),
        ("spark.sql.parquet.compression.codec", "gzip"),
    ]

    builder = SparkSession.builder.appName(app_name)
    for key, value in settings:
        builder = builder.config(key, value)

    return builder.enableHiveSupport().getOrCreate()


# init the spark session ("Product Preparation" is used as the Spark application name):
spark = init_spark("Product Preparation")

<IPython.core.display.Javascript object>

In [10]:
# verifying the spark session (a bare expression at the end of a cell is displayed by the notebook):
spark

<IPython.core.display.Javascript object>

# 1. Utility Functions

In [11]:
def get_pivot_product_info(df, feature_col):
    """Pivot one feature by product type, producing one row per customer.

    Args:
        df (pyspark.sql.dataframe.DataFrame): dataframe holding "customer_id",
            "type_of_product" and the feature column
        feature_col (str): name of the column aggregated (with max) in each pivot cell

    Returns:
        pyspark.sql.dataframe.DataFrame: the pivoted dataframe, where every column
            after the first is renamed to "<original name>_<feature_col>"
    """
    pivoted = (
        df.groupby("customer_id")
        .pivot("type_of_product")
        .agg(F.max(F.col(feature_col)))
    )

    # the first column is the grouping key; every other one gets the feature suffix
    for pivot_name in pivoted.columns[1:]:
        pivoted = pivoted.withColumnRenamed(pivot_name, f"{pivot_name}_{feature_col}")

    return pivoted


def save_to_filesystem(df, target_path, parquet_path, filename):
    """Save a pyspark dataframe as one parquet file, similar to writing a local file.

    Spark writes a directory of part files. This helper writes the dataframe as a
    single partition into a temporary directory, moves the first part file it finds
    to ``{target_path}/{filename}`` and then removes the temporary directory.

    Args:
        df (pyspark.sql.dataframe.DataFrame): dataframe to be saved
        target_path (str): path that will store the file
        parquet_path (str): name of the temporary directory (inside target_path)
            that Spark writes to; it is deleted before and after the write
        filename (str): name of the resulting file

    Returns:
        bool: True once the file has been moved into place
    """
    staging_dir = f"{target_path}/{parquet_path}"
    output_file = f"{target_path}/{filename}"

    # Spark refuses to write into an existing directory, so clear leftovers first
    if os.path.exists(staging_dir):
        shutil.rmtree(staging_dir)

    # the format is stated explicitly because the lookup below expects *.parquet,
    # whatever the session's default data source happens to be
    df.coalesce(1).write.format("parquet").save(staging_dir)

    # retrieves file resulting from the saving procedure:
    part_file = glob(f"{staging_dir}/*.parquet")[0]

    # os.replace overwrites an existing output on every platform
    # (os.rename raises on Windows when the destination already exists)
    os.replace(part_file, output_file)

    shutil.rmtree(staging_dir)

    return True


def apply_category_map(category_map):
    """Build a PySpark UDF that converts values through a lookup table.

    Note:
        This function uses the function generator scheme: it returns the UDF,
        which is then applied to a column.

    Args:
        category_map (dict): the hash table or dictionary for converting the values

    Returns:
        the UDF wrapping the lookup; for each value it yields the mapped category,
        or None when the value has no usable entry in the map
    """

    def func(row):
        try:
            return category_map[row]
        except (KeyError, TypeError):
            # KeyError: value absent from the map; TypeError: unhashable value or
            # non-subscriptable map. Anything else is allowed to propagate instead
            # of being silently turned into None.
            return None

    return F.udf(func)


def get_datetime_features(df, time_col):
    """Enrich a pyspark dataframe with calendar features derived from a date column.

    Args:
        df (pyspark.sql.dataframe.DataFrame): the original dataframe that needs to be enriched
        time_col (str): the string name of the column containing the date object

    Returns:
        df (pyspark.sql.dataframe.DataFrame): resulting pyspark dataframe with the added columns
            -> day_of_week, day_of_month, day_of_year, week_of_year, month, quarter, year

    """
    # (new column, extractor) pairs, ordered from day-level up to year-level
    extractors = (
        ("day_of_week", F.dayofweek),
        ("day_of_month", F.dayofmonth),
        ("day_of_year", F.dayofyear),
        ("week_of_year", F.weekofyear),
        ("month", F.month),
        ("quarter", F.quarter),
        ("year", F.year),
    )

    for feature_name, extractor in extractors:
        df = df.withColumn(feature_name, extractor(F.col(time_col)))

    return df


def bulk_aggregate(df, group_col, aggs, target_cols):
    """Wrapper function to apply multiple aggregations when performing group bys

    It builds a SQL statement through string interpolation and runs it with the
    module-level ``spark`` session. The dataframe is registered as the temporary
    view "df", replacing any view already registered under that name.

    Args:
        df (pyspark.sql.dataframe.DataFrame): dataframe with raw data
        group_col (str): the column that will be used for grouping
        aggs (list): list of aggregations that want to be made (must be valid SQL aggregate names)
        target_cols (list): columns in which aggregations will be performed

    Returns:
        df_grouped (pyspark.sql.dataframe.DataFrame): dataframe with the grouped data;
            every aggregated column is named "<agg>_<col>" and rounded to 1 decimal
    """

    # builds one aggregation per element of the cartesian product of the lists
    select_list = ",\n".join(
        f"{agg.upper()}({col}) as {agg}_{col}"
        for agg, col in itertools.product(aggs, target_cols)
    )

    # uses string interpolation to build the full query statement
    query = f"""
    SELECT
        {group_col},
        {select_list}
        FROM df
        GROUP BY {group_col}
    """

    # registers the dataframe as a temporary view
    # (createOrReplaceTempView supersedes the deprecated registerTempTable)
    df.createOrReplaceTempView("df")
    df_grouped = spark.sql(query)

    # rounds the aggregated values only: the grouping key is returned untouched,
    # since rounding it would coerce a non-numeric key
    for column in df_grouped.columns:
        if column != group_col:
            df_grouped = df_grouped.withColumn(column, F.round(F.col(column), 1))

    return df_grouped


######### Text Processing Functions ########
@udf("string")
def normalize_text(text):
    """Helper function to normalize text data to lower-case ASCII words

    Accented characters are first decomposed (NFKD) and reduced to their ASCII base
    letters; the text is then lower-cased, every run of non-alphanumeric characters
    is collapsed into a single space, and leading/trailing spaces are removed.

    Args:
        text (string): the string that needs to be normalized

    Returns:
        text (string): cleaned up string (None is returned unchanged)

    """
    if text is None:
        return text

    # transliterate before filtering: running the regex first would delete accented
    # letters outright ("café" -> "caf") and leave the ASCII step with nothing to do
    text = unicodedata.normalize("NFKD", str(text))
    text = text.encode("ASCII", "ignore").decode("ASCII")

    text = re.sub(r"[^a-zA-Z0-9]+", " ", text.lower())

    return text.strip()


def get_null_columns(df, normalize=False):
    """Print how many null records each column of a PySpark DataFrame holds.

    Args:
        df (pyspark.sql.dataframe.DataFrame): a PySpark Dataframe object
        normalize (bool): when True, each null count is divided by the
            dataframe's row count instead of being shown as an absolute number

    Returns:
        None -> prints to standard out

    """
    # the row count is only computed when it is needed as the denominator
    row_count = df.count() if normalize else None

    projections = []
    for name in df.columns:
        null_total = F.sum(F.when(F.col(name).isNull(), 1).otherwise(0))
        if normalize:
            null_total = null_total / row_count
        projections.append(null_total.alias(name))

    # displaying the results to standard out, one column per line
    df.select(projections).show(1, truncate=False, vertical=True)


@udf("boolean")
def is_set_or_pack(text):
    """Flag descriptions that mention a set, pack or box

    The check is a case-insensitive whole-word match, so "set of 3 cake tins"
    is flagged while "sunset mug" is not. Descriptions that consist solely of
    "set", "set of", "pack", "pack of", "box" or "box of" are still flagged.

    Args:
        text (str): the product description

    Returns:
        boolean: True when one of the keywords appears as a word, False otherwise
            (including when the description is None)

    """
    # "set of" / "pack of" / "box of" are covered by their first word
    keywords = {"set", "pack", "box"}

    if text is None:
        return False

    words = re.findall(r"[a-z0-9]+", str(text).lower())

    return any(word in keywords for word in words)


@udf("integer")
def get_unit_size(text):
    """Read the unit size of a product from the first number in its description

    Args:
        text (str): the product description

    Returns:
        integer: the first run of digits found in the text, or 1 when the text
            is None or contains no digits

    """
    if text is None:
        return 1

    first_number = re.search(r"(\d+)", text)
    if first_number is None:
        return 1

    return int(first_number.group(1))


@udf("boolean")
def has_non_digits_only(text):
    """Function to match entries in the dataset that are purely alphabetic

    Args:
        text (str): string containing the code to be checked

    Returns:
        boolean: True when the text is non-empty and every character is alphabetic;
            False otherwise (including None and the empty string)

    """
    if text is None:
        return False

    # str.isalpha() is False for "", whereas all() over an empty string is
    # vacuously True and would flag empty codes as alphabetic
    return text.isalpha()

<IPython.core.display.Javascript object>

# 2. Loading and Inspecting the Data

In [12]:
# loading the e-commerce table from the processed-data directory:
df_clean = spark.read.parquet("../data/processed/tb_ecommerce.parquet")

<IPython.core.display.Javascript object>

In [13]:
# instantiating the SQL Context:
# SQLContext takes the SparkContext as its first argument; the session is handed
# over explicitly so the context is bound to this notebook's session.
sql_context = SQLContext(spark.sparkContext, sparkSession=spark)

# generating a temporary view of the dataframe, queryable as "tb_ecommerce"
# (createOrReplaceTempView supersedes the deprecated registerDataFrameAsTable):
df_clean.createOrReplaceTempView("tb_ecommerce")

<IPython.core.display.Javascript object>

In [14]:
# checking the table's schema (in the output recorded below, quantity and unit_price are typed as string):
df_clean.printSchema()

root
 |-- invoice_no: string (nullable = true)
 |-- invoice_date: date (nullable = true)
 |-- description: string (nullable = true)
 |-- stock_code: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- unit_price: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- is_missing_customer_id: boolean (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- day_of_year: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- is_commercial_holiday: boolean (nullable = true)
 |-- is_commercial_holiday_week: boolean (nullable = true)
 |-- is_commercial_holiday_month: boolean (nullable = true)
 |-- is_bank_holiday: boolean (nullable = true)
 |-- is_bank_holiday_week: boolean (nullable = true)
 |-- is_bank_holiday_month: boolean (nu

<IPython.core.display.Javascript object>

In [15]:
# visualizing a few entries (init_spark enables eager evaluation with maxNumRows set to 5):
df_clean

invoice_no,invoice_date,description,stock_code,quantity,unit_price,customer_id,country,is_missing_customer_id,day_of_week,day_of_month,day_of_year,week_of_year,month,quarter,year,is_commercial_holiday,is_commercial_holiday_week,is_commercial_holiday_month,is_bank_holiday,is_bank_holiday_week,is_bank_holiday_month,is_cancelled,is_return,is_free_item,total_item_price,retail_price,min_unit_price,avg_unit_price,median_unit_price,max_unit_price,is_discounted_item,has_non_digit,is_postage,is_manual,is_discount,is_fee,days_to_next_commercial_holiday,days_to_next_bank_holiday
536938,2016-12-01,green 3 piece pol...,84997A,24,3.75,14680,united kingdom,False,5,1,336,48,12,4,2016,False,False,True,False,False,True,False,False,False,90.0,3.75,3.39,4.340506329113922,3.75,8.29,False,False,False,False,False,False,25,24
536938,2016-12-01,jumbo bag pink po...,22386,20,1.95,14680,united kingdom,False,5,1,336,48,12,4,2016,False,False,True,False,False,True,False,False,False,39.0,2.08,1.65,2.5887769784172683,2.08,5.06,True,False,False,False,False,False,25,24
536938,2016-12-01,white skull hot w...,21479,72,3.39,14680,united kingdom,False,5,1,336,48,12,4,2016,False,False,True,False,False,True,False,False,False,244.08,4.25,0.0,4.961486486486487,4.25,9.13,True,False,False,False,False,False,25,24
536938,2016-12-01,victorian sewing ...,21258,24,10.95,14680,united kingdom,False,5,1,336,48,12,4,2016,False,False,True,False,False,True,False,False,False,262.79999999999995,12.75,0.0,16.494450549450555,12.75,25.49,True,False,False,False,False,False,25,24
536938,2016-12-01,red retrospot cha...,20724,20,0.85,14680,united kingdom,False,5,1,336,48,12,4,2016,False,False,True,False,False,True,False,False,True,17.0,0.85,0.0,1.1348314606741543,0.85,5.63,False,False,False,False,False,False,25,24


<IPython.core.display.Javascript object>

In [16]:
# generating a few summary values; both totals are read from one aggregation row.
# NOTE(review): these are signed sums, while the per-product shares computed further
# down divide sums of absolute values by them -- confirm that this is intended.
total_revenue, total_items_sold = df_clean.agg(
    F.sum(F.col("total_item_price")).alias("total_revenue"),
    F.sum(F.col("quantity")).alias("total_items_sold"),
).collect()[0]

<IPython.core.display.Javascript object>

# 3. Implementing Product Features
I will follow the hypothesis/features planning below to implement relevant features from a product perspective.

<img src="../reports/figures/Product Entity.png" alt="Product Entity Map" style="width:1182px; height:702px;">

## 3.1 Product Metadata

In [17]:
# product metadata: one row per stock code with its description, price
# statistics and its share of the overall revenue / units
df_product_info = df_clean.groupBy("stock_code").agg(
    F.first(F.col("description")).alias("product_description"),
    F.length(F.first(F.col("description"))).alias("description_length"),
    F.first(F.col("retail_price")).alias("retail_price"),
    # the four price statistics share the same "first absolute value" recipe
    *[
        F.first(F.abs(F.col(f"{stat}_unit_price"))).alias(f"{stat}_product_unit_price")
        for stat in ("min", "max", "avg", "median")
    ],
    # absolute totals, followed by their ratio to the dataset-wide figure
    F.sum(F.abs(F.col("total_item_price"))).alias("total_spent_on"),
    (F.sum(F.abs(F.col("total_item_price"))) / F.lit(total_revenue)).alias(
        "total_contribution_revenue"
    ),
    F.sum(F.abs(F.col("quantity"))).alias("total_units_sold"),
    (F.sum(F.abs(F.col("quantity"))) / F.lit(total_items_sold)).alias(
        "total_contribution_units"
    ),
)

<IPython.core.display.Javascript object>

## 3.2 Sets, packs and boxes

In [18]:
# extracting the type of product (a set, pack or box instead of an individual item)
df_product_info = df_product_info.withColumn(
    "is_set_or_pack", is_set_or_pack(F.col("product_description"))
)

<IPython.core.display.Javascript object>

In [19]:
# unit size for the products: the first number found in the description, or 1 when there is none
df_product_info = df_product_info.withColumn(
    "unit_size", get_unit_size(F.col("product_description"))
)

<IPython.core.display.Javascript object>

In [20]:
# inspecting the schema of the product table built so far
# (in the output recorded below, retail_price is typed as string):
df_product_info.printSchema()

root
 |-- stock_code: string (nullable = true)
 |-- product_description: string (nullable = true)
 |-- description_length: integer (nullable = true)
 |-- retail_price: string (nullable = true)
 |-- min_product_unit_price: double (nullable = true)
 |-- max_product_unit_price: double (nullable = true)
 |-- avg_product_unit_price: double (nullable = true)
 |-- median_product_unit_price: double (nullable = true)
 |-- total_spent_on: double (nullable = true)
 |-- total_contribution_revenue: double (nullable = true)
 |-- total_units_sold: double (nullable = true)
 |-- total_contribution_units: double (nullable = true)
 |-- is_set_or_pack: boolean (nullable = true)
 |-- unit_size: integer (nullable = true)



<IPython.core.display.Javascript object>

## 3.3 Product Price Embedding

In [21]:
# window function for ranking based on specific columns:
# retail_price is typed as string in df_product_info, so it is cast to double in
# order to rank by numeric price; ordering the raw strings would be lexicographic
# (e.g. "10.95" before "2.08"). The window has no partition: all products are
# ranked against each other.
retail_price_window = Window.orderBy(F.col("retail_price").cast("double"))

# percentile position (0 to 1) of every product's retail price:
df_temp = df_product_info.withColumn(
    "retail_price_rank", F.percent_rank().over(retail_price_window)
)

# one score per stock code, rounded to three decimals:
df_product_price = df_temp.groupby("stock_code").agg(
    F.first(F.col("product_description")).alias("product_description"),
    F.round(F.max(F.col("retail_price_rank")), 3).alias("retail_price_score"),
)

<IPython.core.display.Javascript object>

## 3.4 Product Spending Embedding

In [23]:
# unpartitioned windows: products are ordered against the whole catalogue,
# once by the amount spent on them and once by the units sold
spent_on_window = Window.orderBy(F.col("total_spent_on"))
units_sold_window = Window.orderBy(F.col("total_units_sold"))

# percentile position (0 to 1) of each product under both orderings:
df_temp = df_product_info.withColumn(
    "spent_on_rank", F.percent_rank().over(spent_on_window)
).withColumn("units_sold_rank", F.percent_rank().over(units_sold_window))

# one pair of scores per stock code, rounded to three decimals:
df_product_spending = df_temp.groupBy("stock_code").agg(
    F.first(F.col("product_description")).alias("product_description"),
    F.round(F.max(F.col("spent_on_rank")), 3).alias("revenue_score"),
    F.round(F.max(F.col("units_sold_rank")), 3).alias("quantity_sold_score"),
)

<IPython.core.display.Javascript object>

# 4. Preparing the Output

In [25]:
# The score tables hold one row per stock code, so stock_code alone identifies the
# match. product_description is dropped from the right-hand side rather than used
# as a join key: a null description never satisfies an equality join, and the
# description is picked with F.first, so it may not resolve to the same value in
# every branch of the plan. Either case would leave the scores null.
df_product = df_product_info.join(
    df_product_price.drop("product_description"), on="stock_code", how="left"
)

df_product = df_product.join(
    df_product_spending.drop("product_description"), on="stock_code", how="left"
)

# safety net: keep a single row per (stock_code, product_description) pair
df_product = df_product.drop_duplicates(subset=["stock_code", "product_description"])

<IPython.core.display.Javascript object>

# 5. Saving the Dataset

In [None]:
# writing the product table as a single parquet file in the processed step of the pipeline
PROCESSED_DATA_DIR = "../data/processed"


# "tb_product" is the directory Spark writes to; "tb_product.parquet" is the final file:
save_to_filesystem(
    df=df_product,
    target_path=PROCESSED_DATA_DIR,
    parquet_path="tb_product",
    filename="tb_product.parquet",
)