# Data Prep - Invoice Entity
I will compose a view from the raw dataset that is a summary of the multiple characteristics of a single invoice. Thus, we will have the `invoice_no` as the primary key of this dataset.

Throughout this notebook, I will use both Spark's standard API and the `SQLContext` to illustrate when the SQLContext can make grouping structures more readable and portable. Another good reason for using it is that it can be more easily communicated to business and less technical stakeholders that are more used to SQL syntax.  

In [1]:
!pip install inflection nb_black >> ../configs/package_installation.txt

In [1]:
# loading magic commands:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

<IPython.core.display.Javascript object>

In [2]:
# PySpark dependencies:
import pyspark
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.window import Window

# database utilities
import pandas as pd

# other relevant libraries:
import warnings
import inflection
import unicodedata
from datetime import datetime, timedelta
import json
import re
import os
from glob import glob
import shutil
import itertools

# setting global parameters for visualizations:
warnings.filterwarnings("ignore")
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", lambda x: "%.2f" % x)

<IPython.core.display.Javascript object>

# 0. Configuring Spark Session

In [3]:
# loading the configurations needed for Spark
def init_spark(app_name):

    spark = (
        SparkSession.builder.appName(app_name)
        .config("spark.files.overwrite", "true")
        .config("spark.sql.repl.eagerEval.enabled", True)
        .config("spark.sql.repl.eagerEval.maxNumRows", 5)
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    )

    return spark


# init the spark session:
spark = init_spark("Invoice Preparation")

<IPython.core.display.Javascript object>

In [4]:
# verifying the spark session:
spark

<IPython.core.display.Javascript object>

# 1. Utility Functions

In [5]:
def save_to_filesystem(df, target_path, parquet_path, filename):
    """Helper function to save pyspark dataframes as parquets in a way that is similar to writing to local files

    Args:
        df (pyspark.sql.dataframe.DataFrame): dataframe to be saved
        target_path (str): path that will store the file
        filename (str): name of the resulting file

    Returns:
        None
    """
    PARQUET_FILE = f"{target_path}/{parquet_path}"
    OUTPUT_FILE = f"{target_path}/{filename}"

    if os.path.exists(PARQUET_FILE):
        shutil.rmtree(
            PARQUET_FILE
        )  # if the directory already exists, remove it (throws error if not)

    # saves the dataframe:
    df.coalesce(1).write.save(PARQUET_FILE)

    # retrieves file resulting from the saving procedure:
    original_file = glob(f"{PARQUET_FILE}/*.parquet")[0]

    # renames the resulting file and saves it to the target directory:
    os.rename(original_file, OUTPUT_FILE)

    shutil.rmtree(PARQUET_FILE)

    return True


def apply_category_map(category_map):
    """Helper function to convert strings given a map

    Note:
        This function uses the function generator scheme, much like the PySpark code

    Args:
        original_category (str): the original category name
        category_map (dict): the hash table or dictionary for converting the values:

    Returns:
        new_category (str): the resulting category

    """

    def func(row):
        try:
            result = category_map[row]
        except:
            result = None
        return result

    return F.udf(func)


def get_datetime_features(df, time_col):
    """Function to extract time-based features from pyspark dataframes

    Args:
        df (pyspark.sql.dataframe.DataFrame): the original dataframe that needs to be enriched
        time_col (str): the string name of the column containing the date object

    Returns:
        df (pyspark.sql.dataframe.DataFrame): resulting pyspark dataframe with the added features
            -> See list of attribute the source code for the attributes

    """

    # applying date-related functions:

    # day-level attributes:
    df = df.withColumn("day_of_week", F.dayofweek(F.col(time_col)))

    df = df.withColumn("day_of_month", F.dayofmonth(F.col(time_col)))

    df = df.withColumn("day_of_year", F.dayofyear(F.col(time_col)))

    # week-level attributes:
    df = df.withColumn("week_of_year", F.weekofyear(F.col(time_col)))

    # month-level attributes:
    df = df.withColumn("month", F.month(F.col(time_col)))

    df = df.withColumn("quarter", F.quarter(F.col(time_col)))

    # year-level attributes:
    df = df.withColumn("year", F.year(F.col(time_col)))

    return df


def bulk_aggregate(df, group_col, aggs, target_cols):
    """Wrapper function to apply multiple aggregations when performing group bys

    It utilizes the spark's SQL Context and string interpolation to perform the aggregation using SQL syntax.

    Args:
        df (pyspark.sql.dataframe.DataFrame): dataframe with raw data
        group_col (str): the column that will be used for grouping
        aggs (list): list of aggregations that want to be made (must be the same name as pyspark.sql.functions)
        target_cols (str): columns in which aggregations will be performed

    Returns:
        df_grouped (pyspark.sql.dataframe.DataFrame): dataframe with the grouped data
    """

    # buils the cartersian product of the lists
    aggs_to_perform = itertools.product(aggs, target_cols)

    Q_LAYOUT = """
    SELECT
        {},
        {}
        FROM df
        GROUP BY {}
    """

    aggregations = []
    for agg, col in aggs_to_perform:

        # builds the string for aggregation
        statement = f"{agg.upper()}({col}) as {agg}_{col}"
        aggregations.append(statement)

    full_statement = ",\n".join(aggregations)

    # uses string interpolation to build the full query statement
    QUERY = Q_LAYOUT.format(group_col, full_statement, group_col)

    # registers the dataframe as temporary table:
    df.registerTempTable("df")
    df_grouped = spark.sql(QUERY)

    # rounds values:
    for column in df_grouped.columns:
        df_grouped = df_grouped.withColumn(column, F.round(F.col(column), 1))

    return df_grouped


######### Text Processing Functions ########
@udf("string")
def normalize_text(text):
    """Helper function to normalize text data to ASCII and lower case, removing spaces

    Args:
        text (string): the string that needs to be normalized

    Returns:
        text (string): cleaned up string

    """
    regex = r"[^a-zA-Z0-9]+"

    if text is not None:

        text = str(text)
        text = text.lower()
        text = re.sub(regex, " ", text)
        text = text.strip()
        text = str(
            unicodedata.normalize("NFKD", text).encode("ASCII", "ignore"), "utf-8"
        )

    return text


def get_null_columns(df, normalize=False):
    """Helper function to print the number of null records for each column of a PySpark DataFrame.

    Args:
        df (pyspark.sql.dataframe.DataFrame): a PySpark Dataframe object

    Returns:
        None -> prints to standard out

    """

    if normalize:
        total = df.count()

        df_nulls = df.select(
            [
                (F.sum(F.when(F.col(column).isNull(), 1).otherwise(0)) / total).alias(
                    column
                )
                for column in df.columns
            ]
        )

    else:
        df_nulls = df.select(
            [
                F.sum(F.when(F.col(column).isNull(), 1).otherwise(0)).alias(column)
                for column in df.columns
            ]
        )

    # displaying the results to standard out
    df_nulls.show(1, truncate=False, vertical=True)


@udf("boolean")
def is_set_or_pack(text):

    # description entries to match:
    set_descriptions = {"set", "set of", "pack", "pack of", "box", "box of"}

    if text is not None:
        text = str(text)

        if text in set_descriptions:
            return True

        else:
            return False

    else:
        return False


@udf("integer")
def get_unit_size(text):

    if text is not None:
        check_if_digit = len(re.findall(r"(\d+)", text)) > 0

        if check_if_digit:
            set_size = int(re.findall(r"(\d+)", text)[0])
            return set_size

        else:
            return 1

    else:
        return 1


@udf("boolean")
def has_non_digits_only(text):
    """Function to match entries in the dataset that are purely non-digit characters

    Args:
        text (str): string containing the invoice code

    Returns:
        boolean: whether the text contains non-digit characters and is not related to cancellations

    """

    if text is not None:
        condition = all(character.isalpha() for character in text)

        if condition:
            return True

        else:
            return False

    else:
        return False

<IPython.core.display.Javascript object>

# 2. Loading the Data

In [6]:
# loading the raw dataset:
df_clean = spark.read.parquet("../data/processed/tb_ecommerce.parquet")

<IPython.core.display.Javascript object>

In [7]:
# instantiating the SQL Context:
sql_context = SQLContext(spark.sparkContext)

df_clean.createOrReplaceTempView("tb_ecommerce")

<IPython.core.display.Javascript object>

# 3. Implement Invoice features
Just like the raw dataset, I have devised a plan for features in the hypothesis map below.

<img src="../reports/figures/Invoice Entity.png" alt = "Invoice Entity Map" style = "width:1182px; height=702px;">

## 3.1 Invoice statistics 

In [8]:
QUERY_INVOICE_ITEMS = """
SELECT
    -- group unit
    invoice_no,
    
    -- unique information about the invoice
    FIRST(customer_id) as customer_id,
    FIRST(country) as customer_country,
    FIRST(invoice_date) as date,
    CONCAT_WS('; ', COLLECT_LIST(description)) as invoice_items,
    
    -- basket features
    SUM(ABS(quantity)) as basket_size,
    COUNT(DISTINCT description) as basket_diversity,
    
    -- payment features
    ROUND(SUM(ABS(total_item_price)), 2) as total_paid,
    
    -- discounted items features
    ROUND(AVG(CAST(is_discounted_item as LONG)), 3) as pct_sale_items,
    SUM(CAST(is_discounted_item as LONG)) as sale_items,
    SUM(CASE WHEN is_discounted_item == true THEN ABS(quantity) ELSE 0 END) as total_quantity_sale,
    ROUND(SUM(CASE WHEN is_discounted_item == true THEN ABS(total_item_price) ELSE 0 END), 2) as total_paid_sale,
    
    -- returned and cancelled item features
    SUM(CAST(is_cancelled as LONG)) as cancelled_items,
    SUM(CASE WHEN is_cancelled == true THEN ABS(quantity) ELSE 0 END) as total_quantity_cancelled,
    ROUND(SUM(CASE WHEN is_cancelled == true THEN ABS(total_item_price) ELSE 0 END), 2) as total_paid_cancelled,
    SUM(CAST(is_return as LONG)) as returned_items,
    SUM(CASE WHEN is_return == true THEN ABS(quantity) ELSE 0 END) as total_quantity_returned,
    ROUND(SUM(CASE WHEN is_return == true THEN ABS(total_item_price) ELSE 0 END), 2) as total_paid_returned,
    
    -- free items features
    ROUND(AVG(CAST(is_free_item as LONG)), 3) as pct_free_items,
    SUM(CAST(is_free_item as LONG)) as free_items,
    SUM(CASE WHEN is_free_item == true THEN ABS(quantity) ELSE 0 END) as total_quantity_free,
    
    -- fees, postage and manuals
   ROUND(SUM(CASE WHEN is_postage == true THEN ABS(total_item_price) ELSE 0 END), 2) as total_paid_postage,
   ROUND(SUM(CASE WHEN is_manual == true THEN ABS(total_item_price) ELSE 0 END), 2) as total_paid_manual,
   ROUND(SUM(CASE WHEN is_fee == true THEN ABS(total_item_price) ELSE 0 END), 2) as total_paid_fee,
   ROUND(SUM(CASE WHEN is_discount == true THEN ABS(total_item_price) ELSE 0 END), 2) as total_discounts
    
FROM tb_ecommerce
GROUP BY invoice_no
"""

# performing the grouping operation
df_invoice = spark.sql(QUERY_INVOICE_ITEMS)

<IPython.core.display.Javascript object>

Additionally, I will introduce few more features related to important products in a certain invoice, such as the most expensive and cheapest item of the invoice, as well as the most representative item.

## 3.2 Invoice product features

In [9]:
Q_ITEMS_INFORMATION = """
with expensive as (
SELECT 
    invoice_no,
    'product_most_expensive' as window_type,
    description
    FROM (
    SELECT 
        invoice_no,
        description,
        ROW_NUMBER() OVER(PARTITION BY invoice_no, description ORDER BY unit_price DESC) as row_idx
    FROM tb_ecommerce
    ) as temp
    WHERE row_idx = 1
),

cheapest as (

SELECT 
    invoice_no,
    'product_cheapest' as window_type,
    description
    FROM (
    SELECT 
        invoice_no,
        description,
        ROW_NUMBER() OVER(PARTITION BY invoice_no, description ORDER BY unit_price ASC) as row_idx
    FROM tb_ecommerce
    ) as temp
    WHERE row_idx = 1
),

most_spent as (

SELECT 
    invoice_no,
    'product_most_spent' as window_type,
    description
    FROM (
    SELECT 
        invoice_no,
        description,
        ROW_NUMBER() OVER(PARTITION BY invoice_no, description ORDER BY ABS(total_item_price) DESC) as row_idx
    FROM tb_ecommerce
    ) as temp
    WHERE row_idx = 1
),

most_bought as (

SELECT 
    invoice_no,
    'product_most_bought' as window_type,
    description
    FROM (
    SELECT 
        invoice_no,
        description,
        ROW_NUMBER() OVER(PARTITION BY invoice_no, description ORDER BY ABS(quantity) DESC) as row_idx
    FROM tb_ecommerce
    ) as temp
    WHERE row_idx = 1
)

SELECT invoice_no, window_type, description FROM expensive
UNION
SELECT invoice_no, window_type, description FROM cheapest
UNION
SELECT invoice_no, window_type, description FROM most_spent
UNION
SELECT invoice_no, window_type, description FROM most_bought
"""

df_items_info = spark.sql(Q_ITEMS_INFORMATION)

<IPython.core.display.Javascript object>

In [10]:
# pivotting the item information dataframe to get the wide-format we need
df_items_pivot = (
    df_items_info.groupby("invoice_no").pivot("window_type").agg(F.first("description"))
)

<IPython.core.display.Javascript object>

In [11]:
# joining the results back into the invoice dataframe:
df_invoice = df_invoice.join(df_items_pivot, on=["invoice_no"], how="left")

<IPython.core.display.Javascript object>

In [12]:
# joining the subset of previously defined date-related attributes:
cols_to_remove = [
    "description",
    "stock_code",
    "quantity",
    "unit_price",
    "customer_id",
    "country",
    "is_missing_customer_id",
    "total_item_price",
    "retail_price",
    "min_unit_price",
    "avg_unit_price",
    "median_unit_price",
    "max_unit_price",
    "is_discounted_item",
    "has_non_digit",
    "is_postage",
    "is_manual",
    "is_discount",
    "is_fee",
    "is_cancelled",
    "is_return",
    "is_free_item",
    "temp_id",
]

df_dates = df_clean.drop(*cols_to_remove)

<IPython.core.display.Javascript object>

In [13]:
# the resulting dataframe:
df_invoice = df_invoice.join(df_dates, how="left", on=["invoice_no"]).drop_duplicates(
    subset=["invoice_no", "invoice_date"]
)

<IPython.core.display.Javascript object>

## 3.3 Aggregated and relative features
Finally, I will add a few more features related to proportions occupied by certain types of products (free items, returns, et cetera).

In [14]:
df_invoice.printSchema()

root
 |-- invoice_no: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- customer_country: string (nullable = true)
 |-- date: date (nullable = true)
 |-- invoice_items: string (nullable = false)
 |-- basket_size: double (nullable = true)
 |-- basket_diversity: long (nullable = false)
 |-- total_paid: double (nullable = true)
 |-- pct_sale_items: double (nullable = true)
 |-- sale_items: long (nullable = true)
 |-- total_quantity_sale: double (nullable = true)
 |-- total_paid_sale: double (nullable = true)
 |-- cancelled_items: long (nullable = true)
 |-- total_quantity_cancelled: double (nullable = true)
 |-- total_paid_cancelled: double (nullable = true)
 |-- returned_items: long (nullable = true)
 |-- total_quantity_returned: double (nullable = true)
 |-- total_paid_returned: double (nullable = true)
 |-- pct_free_items: double (nullable = true)
 |-- free_items: long (nullable = true)
 |-- total_quantity_free: double (nullable = true)
 |-- total_paid_postage: 

<IPython.core.display.Javascript object>

In [15]:
# pct of discounted items:
df_invoice = df_invoice.withColumn(
    "pct_paid_sale_items", F.round(F.col("total_paid_sale") / F.col("total_paid"), 2)
)

df_invoice = df_invoice.withColumn(
    "pct_paid_shipping_and_fees",
    F.round(
        (F.col("total_paid_postage") + F.col("total_paid_fee")) / F.col("total_paid"), 2
    ),
)

df_invoice = df_invoice.withColumn(
    "pct_paid_manuals", F.round((F.col("total_paid_manual")) / F.col("total_paid"), 2)
)

<IPython.core.display.Javascript object>

## 3.4 Cleaning up the dataset

In [16]:
# let's put columns into a more readable format (sort them alphabetically)

pivot_cols = ["invoice_no", "customer_id", "customer_country", "date", "invoice_items"]

current_cols = df_invoice.columns

remaining_cols = sorted(list(filter(lambda col: col not in pivot_cols, current_cols)))

# adding final set of columns:
cols_selected = pivot_cols + remaining_cols

<IPython.core.display.Javascript object>

In [17]:
# selecting the columns in order:
df_invoice = df_invoice.select(*cols_selected)

<IPython.core.display.Javascript object>

In [18]:
# verifying number of entries:
df_invoice.count()

23191

<IPython.core.display.Javascript object>

In [19]:
# visualizing the results:
df_invoice

invoice_no,customer_id,customer_country,date,invoice_items,basket_diversity,basket_size,cancelled_items,day_of_month,day_of_week,day_of_year,days_to_next_bank_holiday,days_to_next_commercial_holiday,free_items,invoice_date,is_bank_holiday,is_bank_holiday_month,is_bank_holiday_week,is_commercial_holiday,is_commercial_holiday_month,is_commercial_holiday_week,month,pct_free_items,pct_paid_manuals,pct_paid_sale_items,pct_paid_shipping_and_fees,pct_sale_items,product_cheapest,product_most_bought,product_most_expensive,product_most_spent,quarter,returned_items,sale_items,total_discounts,total_paid,total_paid_cancelled,total_paid_fee,total_paid_manual,total_paid_postage,total_paid_returned,total_paid_sale,total_quantity_cancelled,total_quantity_free,total_quantity_returned,total_quantity_sale,week_of_year,year
536938,14680,united kingdom,2016-12-01,green 3 piece pol...,14,464.0,0,1,5,336,24,25,2,2016-12-01,False,True,False,False,True,False,12,0.143,0.0,0.87,0.0,0.714,white skull hot w...,jumbo bag baroque...,jumbo bag pink po...,red 3 piece retro...,4,0,10,0.0,1680.88,0.0,0.0,0.0,0.0,0.0,1466.88,0.0,40.0,0.0,376.0,48,2016
537691,13842,united kingdom,2016-12-06,3 hook photo shel...,20,163.0,0,6,3,341,19,20,4,2016-12-06,False,True,False,False,True,False,12,0.2,0.0,0.13,0.0,0.1,heart of wicker s...,wood s 3 cabinet ...,party bunting,party bunting,4,0,2,0.0,310.57,0.0,0.0,0.0,0.0,0.0,39.15,0.0,64.0,0.0,7.0,49,2016
538184,17880,united kingdom,2016-12-08,empire design ros...,26,314.0,0,8,5,343,17,18,5,2016-12-08,False,True,False,False,True,False,12,0.192,0.0,0.08,0.0,0.077,retrospot giant t...,doormat welcome t...,calendar paper cu...,calendar paper cu...,4,0,2,0.0,458.92,0.0,0.0,0.0,0.0,0.0,35.58,0.0,120.0,0.0,18.0,49,2016
538517,17618,united kingdom,2016-12-10,wooden box of dom...,41,161.0,0,10,7,345,15,16,8,2016-12-10,False,True,False,False,True,False,12,0.151,0.0,0.08,0.0,0.038,grey floral feltc...,feltcraft christm...,clothes pegs retr...,vintage cream dog...,4,0,2,0.0,320.28,0.0,0.0,0.0,0.0,0.0,25.76,0.0,43.0,0.0,8.0,49,2016
538879,15392,united kingdom,2016-12-12,christmas gingham...,19,402.0,0,12,2,347,13,14,12,2016-12-12,False,True,False,False,True,False,12,0.632,0.0,0.04,0.0,0.053,danish rose round...,pack of 72 retros...,card billboard font,card billboard font,4,0,1,0.0,338.98,0.0,0.0,0.0,0.0,0.0,13.6,0.0,316.0,0.0,16.0,50,2016


<IPython.core.display.Javascript object>

# 4. Saving the Dataset

## 4.1 Writing to the filesystem

In [20]:
# saving the enhanced raw data as parquet in the processed step of the pipeline
PROCESSED_DATA_DIR = "../data/processed"


# using the helper function to save the file:
save_to_filesystem(df_invoice, PROCESSED_DATA_DIR, "tb_invoice", "tb_invoice.parquet")

True

<IPython.core.display.Javascript object>