## Instantiate a SparkSession

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("MyApp").master("local[*]").getOrCreate()

# check if works
dftest = spark.range(100)
print(dftest.count())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/21 15:11:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable

100


                                                                                

## Load dataset

In [55]:
# load dataset
df = spark.read.option("delimiter", ",").option("header", True).csv("../data/data.csv")
df.printSchema()
cardinality = df.count()
print("Cardinality:", cardinality)

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)

Cardinality: 541909


## EDA

In [62]:
print("Any null records?")
(df.select(
        [
            F.round((F.sum(F.col(col).isNull().cast("int"))/F.lit(cardinality)*100), 2)
            .alias(f"{col} [%]")
            for col in df.columns
        ]
    )
    .show()
)

Any null records?
+-------------+-------------+---------------+------------+---------------+-------------+--------------+-----------+
|InvoiceNo [%]|StockCode [%]|Description [%]|Quantity [%]|InvoiceDate [%]|UnitPrice [%]|CustomerID [%]|Country [%]|
+-------------+-------------+---------------+------------+---------------+-------------+--------------+-----------+
|          0.0|          0.0|           0.27|         0.0|            0.0|          0.0|         24.93|        0.0|
+-------------+-------------+---------------+------------+---------------+-------------+--------------+-----------+



In [37]:
print("Distribution of the two continuous attributes")
df.select(["UnitPrice", "Quantity"]).summary().show()

# back_transactions are those with negative Quantity
df.select("*").filter(F.col("Quantity") < 0).createOrReplaceTempView("back_transactions")

print("Distribution of the negative Quantity")
back_transactions = spark.table("back_transactions").select(F.col("Quantity")).summary().show()

print("Distribution of the negative UnitPrice")
df.select("UnitPrice").filter(F.col("UnitPrice") < 0).summary().show()

print("Any negative UnitPrice for back_transactions?")
spark.sql("SELECT COUNT(UnitPrice) FROM back_transactions WHERE UnitPrice < 0").show()

Distribution of the two continuous attributes


                                                                                

+-------+------------------+------------------+
|summary|         UnitPrice|          Quantity|
+-------+------------------+------------------+
|  count|            541909|            541909|
|   mean|4.6111136260897085|  9.55224954743324|
| stddev| 96.75985306117963|218.08115785023438|
|    min|         -11062.06|                -1|
|    25%|              1.25|               1.0|
|    50%|              2.08|               3.0|
|    75%|              4.13|              10.0|
|    max|             99.96|               992|
+-------+------------------+------------------+

Distribution of the negative Quantity
+-------+------------------+
|summary|          Quantity|
+-------+------------------+
|  count|             10624|
|   mean|-45.60721009036145|
| stddev|   1092.2142164236|
|    min|                -1|
|    25%|             -10.0|
|    50%|              -2.0|
|    75%|              -1.0|
|    max|              -990|
+-------+------------------+

Distribution of the negative UnitPri

In [48]:
print("Ghost customers (i.e. null CustomerID) distribution of Quantity and UnitPrice")
(df.select(["UnitPrice", "Quantity", "CustomerID"])
    .filter(F.col("CustomerID").isNull() == True)
    .select(["UnitPrice", "Quantity"])
    .summary()
    .show()
)

Ghost customers (i.e. null CustomerID) distribution of Quantity and UnitPrice
+-------+-----------------+------------------+
|summary|        UnitPrice|          Quantity|
+-------+-----------------+------------------+
|  count|           135080|            135080|
|   mean|8.076576917382749|1.9955729937814628|
| stddev|151.9008162787955| 66.69615267858345|
|    min|        -11062.06|                -1|
|    25%|             1.63|               1.0|
|    50%|             3.29|               1.0|
|    75%|              5.4|               3.0|
|    max|            99.96|                99|
+-------+-----------------+------------------+



## Data engineering
* Create new attributes out of the original dataset

In [73]:
df = df.withColumn("Revenue", df.UnitPrice * df.Quantity)
tot_revenue = df.agg(F.sum("Revenue")).collect()[0][0]
print("Global revenues, including back transactions:",tot_revenue)

Global revenues, including back transactions: 9747747.933999127
