In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Demo').getOrCreate()

In [3]:
from pyspark.sql.functions import when
from pyspark.sql.functions import col
from pyspark.sql import functions as func

In [38]:
currency  = 'dim_currency.csv'
customers = 'dim_customer.csv'
dates = 'dim_date.csv'
product = 'dim_product.csv'
intsales = 'fact_internet_sales.csv'

In [9]:
df = spark.read.csv('fact_internet_sales.csv', header=True, inferSchema=True)
df.select('ProductKey', 'OrderDateKey', 'CustomerKey', 'CurrencyKey').show(5)

+----------+------------+-----------+-----------+
|ProductKey|OrderDateKey|CustomerKey|CurrencyKey|
+----------+------------+-----------+-----------+
|       310|    20101229|      21768|         19|
|       346|    20101229|      28389|         39|
|       346|    20101229|      25863|        100|
|       336|    20101229|      14501|        100|
|       346|    20101229|      11003|          6|
+----------+------------+-----------+-----------+
only showing top 5 rows



In [12]:
dfcurrency = spark.read.csv(currency, header=True, inferSchema=True)
dfcustomers = spark.read.csv(customers, header=True, inferSchema=True)
dfdates = spark.read.csv(dates, header=True, inferSchema=True)
dfproducts = spark.read.csv(product, header=True, inferSchema=True)
dfsales = spark.read.csv(intsales, header=True, inferSchema=True)

In [26]:
# dfcurrency.show(2)
# dfcustomers.show()
# dfdates.show(2)
# dfproducts.show(2)
# dfsales.show(2)

+-----------+--------------------+--------------+
|CurrencyKey|CurrencyAlternateKey|  CurrencyName|
+-----------+--------------------+--------------+
|          1|                 AFA|       Afghani|
|          2|                 DZD|Algerian Dinar|
+-----------+--------------------+--------------+
only showing top 2 rows



#### **Date regulation**

In [13]:
dfdates = dfdates.filter(col('EnglishDayNameOfWeek')=='Sunday')

#### **Product Regulation**
Include only products where:
- `Color` is **Silver**.
- Has **Size** information.
- `Weight` is greater than **10**.

In [14]:
dfproducts = dfproducts.filter((col('Color') == 'Silver') & (col('Size').isNotNull()) & (col('Weight')>10))
dfproducts.select('ProductKey', 'Color', 'Size', 'Weight' ).show(10)

+----------+------+----+------+
|ProductKey| Color|Size|Weight|
+----------+------+----+------+
|       344|Silver|  38| 20.35|
|       345|Silver|  42| 20.77|
|       346|Silver|  44| 21.13|
|       347|Silver|  48| 21.42|
|       352|Silver|  38| 23.35|
|       353|Silver|  38| 23.35|
|       354|Silver|  42| 23.77|
|       355|Silver|  42| 23.77|
|       356|Silver|  46| 24.13|
|       357|Silver|  46| 24.13|
+----------+------+----+------+
only showing top 10 rows



#### **Customer regulations**
Include customers who:
- Have a `YearlyIncome` greater than **100,000.0**.
- Have more than **1 child**.


In [16]:
dfcustomers.show(2)

+-----------+---------+----------+--------+---------+----------+-------------+------+------+--------------------+------------+-------------+---------------+------------+-------------------+
|CustomerKey|FirstName|MiddleName|LastName|NameStyle| BirthDate|MaritalStatus|Suffix|Gender|        EmailAddress|YearlyIncome|TotalChildren|   AddressLine1|AddressLine2|              Phone|
+-----------+---------+----------+--------+---------+----------+-------------+------+------+--------------------+------------+-------------+---------------+------------+-------------------+
|      11000|      Jon|         V|    Yang|    false|1971-10-06|            M|  NULL|     M|jon24@adventure-w...|     90000.0|            2|3761 N. 14th St|        NULL|1 (11) 500 555-0162|
|      11001|   Eugene|         L|   Huang|    false|1976-05-10|            S|  NULL|     M|eugene10@adventur...|     60000.0|            3|     2243 W St.|        NULL|1 (11) 500 555-0110|
+-----------+---------+----------+--------+-------

In [37]:
dfcustomers = dfcustomers.filter((col('YearlyIncome') >100_000) & (col('TotalChildren') == 1))
dfcustomers.select('CustomerKey', 'FirstName', 'LastName', 'YearlyIncome', 'TotalChildren').show()




+-----------+----------+--------+------------+-------------+
|CustomerKey| FirstName|LastName|YearlyIncome|TotalChildren|
+-----------+----------+--------+------------+-------------+
|      11180|     April|   Anand|    160000.0|            1|
|      11257|Jacqueline|  Powell|    120000.0|            1|
|      11258|    Xavier|    Hill|    120000.0|            1|
|      11261| Stephanie| Collins|    130000.0|            1|
|      11283|    Arturo|     Lal|    110000.0|            1|
|      11284|   Theresa| Serrano|    110000.0|            1|
|      11285|    Jeremy|Anderson|    120000.0|            1|
|      11288|     Cindy| Sanchez|    110000.0|            1|
|      11289|     Maria|  Carter|    130000.0|            1|
|      11290|   Katelyn| Sanchez|    130000.0|            1|
|      11291|     Jenna|  Wright|    130000.0|            1|
|      11292|      Seth|Phillips|    150000.0|            1|
|      11467|    Arturo|   Zheng|    170000.0|            1|
|      11759|      Dawn|

#### **Merging data frames**

In [33]:
dff = (dfproducts
          .join(dfsales, on="ProductKey", how="inner")
          .join(dfcustomers, on="CustomerKey", how="inner")
#          .join(dfcurrency, on="CurrencyKey", how="inner")
          .join(dfdates, dfsales.OrderDateKey == dfdates.DateKey, how="inner"))

dff.select('FirstName', 'LastName', 'CustomerKey','YearlyIncome', 'TotalChildren', 'ProductKey', 'Color', 'Size', 'Weight' ).show(truncate=False)


+---------+---------+-----------+------------+-------------+----------+------+----+------+
|FirstName|LastName |CustomerKey|YearlyIncome|TotalChildren|ProductKey|Color |Size|Weight|
+---------+---------+-----------+------------+-------------+----------+------+----+------+
|Willie   |Rai      |14673      |110000.0    |1            |354       |Silver|42  |23.77 |
|Jessica  |Ward     |15587      |110000.0    |1            |354       |Silver|42  |23.77 |
|Katelyn  |Sanchez  |11290      |130000.0    |1            |355       |Silver|42  |23.77 |
|Adam     |Patterson|19696      |120000.0    |1            |592       |Silver|42  |27.77 |
|Jennifer |Peterson |14433      |110000.0    |1            |357       |Silver|46  |24.13 |
|Victor   |Blanco   |15586      |110000.0    |1            |357       |Silver|46  |24.13 |
+---------+---------+-----------+------------+-------------+----------+------+----+------+



#### **Aggregations**

In [34]:
newdff = (dff.groupBy("CustomerKey", "FirstName") 
             .agg(
                 func.sum("TaxAmt").alias("Total TaxAmt"),
                 func.avg("SalesAmount").alias("Average SalesAmount"),
                 func.avg("TotalProductCost").alias("Average TotalProductCost") 
             ))

newdff.show(20, truncate=False)

+-----------+---------+------------+-------------------+------------------------+
|CustomerKey|FirstName|Total TaxAmt|Average SalesAmount|Average TotalProductCost|
+-----------+---------+------------+-------------------+------------------------+
|15587      |Jessica  |165.7136    |2071.4196          |1117.8559               |
|19696      |Adam     |45.1992     |564.99             |308.2179                |
|11290      |Katelyn  |185.5992    |2319.99            |1265.6195               |
|14673      |Willie   |165.7136    |2071.4196          |1117.8559               |
|15586      |Victor   |185.5992    |2319.99            |1265.6195               |
|14433      |Jennifer |185.5992    |2319.99            |1265.6195               |
+-----------+---------+------------+-------------------+------------------------+



In [35]:
finaldf = newdff.orderBy('FirstName', ascending = True)
finaldf.show()


+-----------+---------+------------+-------------------+------------------------+
|CustomerKey|FirstName|Total TaxAmt|Average SalesAmount|Average TotalProductCost|
+-----------+---------+------------+-------------------+------------------------+
|      19696|     Adam|     45.1992|             564.99|                308.2179|
|      14433| Jennifer|    185.5992|            2319.99|               1265.6195|
|      15587|  Jessica|    165.7136|          2071.4196|               1117.8559|
|      11290|  Katelyn|    185.5992|            2319.99|               1265.6195|
|      15586|   Victor|    185.5992|            2319.99|               1265.6195|
|      14673|   Willie|    165.7136|          2071.4196|               1117.8559|
+-----------+---------+------------+-------------------+------------------------+



#### **Data Presentation**

In [36]:
finaldf = finaldf.drop('CustomerKey')
finaldf.show()

+---------+------------+-------------------+------------------------+
|FirstName|Total TaxAmt|Average SalesAmount|Average TotalProductCost|
+---------+------------+-------------------+------------------------+
|     Adam|     45.1992|             564.99|                308.2179|
| Jennifer|    185.5992|            2319.99|               1265.6195|
|  Jessica|    165.7136|          2071.4196|               1117.8559|
|  Katelyn|    185.5992|            2319.99|               1265.6195|
|   Victor|    185.5992|            2319.99|               1265.6195|
|   Willie|    165.7136|          2071.4196|               1117.8559|
+---------+------------+-------------------+------------------------+

