In [52]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
import base64

# SQL query text, kept base64-encoded in the notebook and decoded to a str here.
# The literal below decodes to: select * from invoice_by_account
decoded_query = str(
    base64.b64decode('c2VsZWN0ICogZnJvbSBpbnZvaWNlX2J5X2FjY291bnQ='),
    encoding='utf-8',
)


Define file paths

In [53]:
# Glob pattern (relative to the working directory) matching every invoice file.
invoices_files_names = '../csv_files/invoices/invoice-*.json'

# Anchor the pattern at the directory the notebook kernel is running from.
current_dir = os.getcwd()
invoices_files_path = os.path.join(current_dir, invoices_files_names)

Start ingesting data into Spark

In [54]:
# Obtain the Spark session for this notebook via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName('JSON data transformation')
    .getOrCreate()
)

# Load every file matching the invoice glob as JSON. The 'multiline' option is
# enabled so a single JSON document may span several lines of a file.
invoices_df = (
    spark.read
    .format('json')
    .option('multiline', True)
    .load(invoices_files_path)
)

# Preview the first 10 rows.
invoices_df.show(n=10)

+------------------+-------+-------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          @context|  @type|          accountId|  billingPeriod|              broker|            customer|         description|   minimumPaymentDue|          paymentDue|paymentDueDate|       paymentStatus|            provider|     referencesOrder|     totalPaymentDue|                 url|
+------------------+-------+-------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|http://schema.org/|Invoice|xxxx-xxxx-xxxx-7563|           NULL|{LocalBusiness, A...|{Person, Jean Geo...|                NULL|{Pr

Analyze invoice amounts

In [55]:
# Expand the nested totalPaymentDue struct so each of its fields becomes its
# own top-level column, then print the result.
invoice_amount_df = invoices_df.select("totalPaymentDue.*")
invoice_amount_df.show()

+------------------+------+-------------+
|             @type| price|priceCurrency|
+------------------+------+-------------+
|PriceSpecification|4500.0|          USD|
|PriceSpecification|   0.0|          USD|
|PriceSpecification| 200.0|          USD|
|PriceSpecification|  70.0|          USD|
+------------------+------+-------------+



In [59]:
# Build one row per referenced order for each invoice account.
invoice_by_account = (
    invoices_df
    # explode() emits a separate row for every element of referencesOrder.
    .select(
        F.col("accountId"),
        F.explode(F.col("referencesOrder")).alias("order"),
    )
    # Surface the nested orderedItem fields as top-level columns.
    .withColumn("type", F.col("order.orderedItem.@type"))
    .withColumn("description", F.col("order.orderedItem.description"))
    .withColumn("name", F.col("order.orderedItem.name"))
    # Keep only 'Service' items that actually carry a description.
    .filter("description IS NOT NULL AND type = 'Service'")
)

# Show up to 10 rows without truncating long cell values.
invoice_by_account.show(n=10, truncate=False)

+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+-------+-------------------------------+----+
|accountId          |order                                                                                                                                              |type   |description                    |name|
+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+-------+-------------------------------+----+
|xxxx-xxxx-xxxx-7563|{Order, furnace installation, 2015-12-05, NULL, {Service, furnace installation and tuning, NULL, NULL}, http://purl.org/goodrelations/v1#ByInvoice}|Service|furnace installation and tuning|NULL|
|xxxx-xxxx-xxxx-1234|{Order, furnace installation, 2014-12-02, NULL, {Service, furnace installation, NULL, NULL}, http://purl.org/goodrelati