In [1]:
from datetime import datetime

import pyspark.sql.functions as f
from pyspark import SparkConf, SparkContext, HiveContext
from pyspark.sql import SparkSession, SQLContext

In [2]:
# Ask the operator which business date to process. The value is later split on
# "-" into year/month/day, so it is expected in YYYY-MM-DD form.
date_prompt = "Input date you want transform data from HDFS DataLake and save to Hive Storage: "
executionDate = input(date_prompt)

In [3]:
# Echo the entered run date so it is recorded in the notebook output.
executionDate

'2019-09-13'

In [4]:
# Split the requested run date into its string components; they are used
# further down as the year/month/day partition values of the report.
# Validate first so that a malformed value (e.g. "2019/09/13" or an empty
# string) fails here with a clear message instead of raising an IndexError or
# silently producing a wrong partition.
try:
    run_date = datetime.strptime(executionDate.strip(), "%Y-%m-%d")
except ValueError as exc:
    raise ValueError(
        f"executionDate must be a valid date in YYYY-MM-DD format, got {executionDate!r}"
    ) from exc

# Keep the parts as zero-padded strings ("2019", "09", "13"), exactly as the
# plain split produced for well-formed input.
runTime = run_date.strftime("%Y-%m-%d").split("-")
year, month, day = runTime

In [5]:
# Build (or reuse) the Hive-enabled Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Daily Gross Revenue Report")
    # Hive dynamic-partition settings, needed for the partitioned write at the end.
    .config('hive.exec.dynamic.partition', 'true')
    .config('hive.exec.dynamic.partition.mode', 'nonstrict')
    # Location of the Hive warehouse on HDFS.
    .config('spark.sql.warehouse.dir', 'hdfs://localhost:9000/user/hive/warehouse')
    .enableHiveSupport()
    .getOrCreate()
)

23/08/01 21:37:57 WARN Utils: Your hostname, bigdata-etl resolves to a loopback address: 127.0.1.1; using 192.168.85.128 instead (on interface ens33)
23/08/01 21:37:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/01 21:38:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/01 21:38:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
# Read the four datalake tables from HDFS into Spark DataFrames.
# The year/month/day columns (presumably the datalake partition columns -
# TODO confirm) are dropped from every table; products also loses created_at.
date_part_cols = ("year", "month", "day")

orders_df = spark.read.parquet('hdfs://localhost:9000/datalake/orders').drop(*date_part_cols)
order_detail_df = spark.read.parquet('hdfs://localhost:9000/datalake/order_detail').drop(*date_part_cols)
products_df = spark.read.parquet('hdfs://localhost:9000/datalake/products').drop(*date_part_cols, "created_at")
inventory_df = spark.read.parquet('hdfs://localhost:9000/datalake/inventory').drop(*date_part_cols)

                                                                                

In [7]:
# Preview the first five order rows.
orders_df.show(n=5)

                                                                                

+---+--------+----------+----------+
| id|quantity|created_at|product_id|
+---+--------+----------+----------+
|  1|       1|2009-01-25|    331449|
|  2|       1|2019-09-13|    182256|
|  3|       2|2004-05-04|    108399|
|  4|       3|2011-02-20|     81461|
|  5|       3|2007-07-11|    136274|
+---+--------+----------+----------+
only showing top 5 rows



In [8]:
# Preview the first five order-detail rows.
order_detail_df.show(n=5)

[Stage 5:>                                                          (0 + 1) / 1]

+---+------+-----------+--------+-------+
| id| total|    payment|order_id|user_id|
+---+------+-----------+--------+-------+
|  1|710051|credit_card|       1| 209279|
|  2|375643|       cash|       2| 242546|
|  3|975362|       cash|       3| 135215|
|  4|417644|credit_card|       4| 111433|
|  5|481473|credit_card|       5|  44346|
+---+------+-----------+--------+-------+
only showing top 5 rows



                                                                                

In [9]:
# Preview the first five product rows.
products_df.show(n=5)

+---+--------+--------------+------------+------------+
| id|    make|         model|    category|inventory_id|
+---+--------+--------------+------------+------------+
|  1|     BMW|      5 Series|Sedan, Wagon|      999830|
|  2| Mercury| Grand Marquis|       Sedan|      988335|
|  3|   Honda|          CR-V|         SUV|      986788|
|  4|Cadillac|           XT5|         SUV|      986910|
|  5|  Nissan|Titan Crew Cab|      Pickup|      988637|
+---+--------+--------------+------------+------------+
only showing top 5 rows



In [10]:
# Preview the first five inventory rows.
inventory_df.show(n=5)

+---+--------+
| id|quantity|
+---+--------+
|  1|     355|
|  2|     492|
|  3|     269|
|  4|     394|
|  5|     239|
+---+--------+
only showing top 5 rows



In [11]:
# Build the pre-aggregation dataset: orders placed on the requested run date,
# enriched with their payment detail, product attributes and stock level.
#
# The filter uses the date entered at the top of the notebook (rebuilt from the
# parsed year/month/day parts) instead of a hard-coded day, so the rows that
# are selected always match the partition values written to the report.
report_date = "-".join((year, month, day))

# Only the stock quantity is needed from inventory; rename it so it does not
# clash with the ordered quantity coming from orders.
inventory_stock_df = inventory_df.select(
    f.col("quantity").alias("inv_quantity"),
    f.col("id"),
)

pre_df = (
    orders_df
    .filter(orders_df["created_at"] == report_date)
    .join(order_detail_df, orders_df["id"] == order_detail_df["order_id"], "inner")
    .join(products_df, orders_df["product_id"] == products_df["id"], "inner")
    .join(inventory_stock_df, products_df["inventory_id"] == inventory_stock_df["id"], "inner")
)

In [12]:
# Preview the first five joined rows before aggregating.
pre_df.show(n=5)



In [None]:
# Aggregate the joined rows: one output row per combination of the grouping
# keys, with the summed ordered quantity ("Sales") and summed order total
# ("Revenue").
group_keys = ["Make", "Model", "Category", "product_id", "inv_quantity"]
units_sold = f.sum("quantity").alias("Sales")
gross_revenue = f.sum("total").alias("Revenue")

map_df = pre_df.groupBy(*group_keys).agg(units_sold, gross_revenue)

In [None]:
# Shape the final report: keep the descriptive and aggregated columns, stamp
# every row with the run date parts, and add the remaining stock
# (inventory quantity minus units sold).
# NOTE: the column name "LetfOver" is spelled this way on purpose here - it is
# the name written to the report table, so it must not be changed casually.
result_df = map_df.select(
    "Make",
    "Model",
    "Category",
    "Sales",
    "Revenue",
    f.lit(year).alias("year"),
    f.lit(month).alias("month"),
    f.lit(day).alias("day"),
    (f.col("inv_quantity") - f.col("Sales")).alias("LetfOver"),
)

In [None]:
# List the databases visible to this session, to check that the target
# database of the report exists before writing.
df = spark.sql("show databases")
# Display the listing; without this call the cell builds the DataFrame but
# prints nothing.
df.show()

+---------+
|namespace|
+---------+
|  default|
|    haha1|
|  reports|
|     test|
|    test1|
+---------+



In [None]:
# Inspect the metadata of the `test` database and return the info_value
# column of the result as a list of rows.
df = spark.sql("describe database test")
df.select(df["info_value"]).collect()

[Row(info_value='spark_catalog'),
 Row(info_value='test'),
 Row(info_value=''),
 Row(info_value='file:/home/hadoop/Desktop/cars-sale-etl/etl_souce/spark-warehouse/test.db'),
 Row(info_value='hadoop')]

In [None]:
# Persist the report into the Hive table reports.daily_gross_revenue,
# partitioned by year/month/day. Append mode adds the rows to the existing
# table, so re-running the notebook for the same date appends them again.
result_df.write.saveAsTable(
    "reports.daily_gross_revenue",
    format="hive",
    mode="append",
    partitionBy=["year", "month", "day"],
)