**Load the data**

In [0]:
df_oct = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",header=True,inferSchema=True)

df_nov = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",header=True,inferSchema=True)

print(f'There are {df_oct.count()} rows in the oct dataframe')
df_oct.show(5)

print(f'There are {df_nov.count()} rows in the nov dataframe')
df_nov.show(5)


There are 42448764 rows in the oct dataframe
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-10-01 00:00:00|      view|  44600062|2103807459595387724|                NULL|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|
|2019-10-01 00:00:00|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|
|2019-10-01 00:00:01|      view|  17200506|2053013559792632471|furniture.living_...|    NULL|  543.1|519107250|566511c2-e2e3-422...|
|2019-10-01 00:00:01|      view|   1307067|2053013558920217191|  computers.notebook|  lenovo| 251.74|550050854|7c90fc70-0e80-459...|
|2019-10-01 00:00:04|   

In [0]:
df_oct.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



**Operatation**

In [0]:
df_oct.select("event_time","event_type","product_id","brand","price").show(5)

+-------------------+----------+----------+--------+-------+
|         event_time|event_type|product_id|   brand|  price|
+-------------------+----------+----------+--------+-------+
|2019-10-01 00:00:00|      view|  44600062|shiseido|  35.79|
|2019-10-01 00:00:00|      view|   3900821|    aqua|   33.2|
|2019-10-01 00:00:01|      view|  17200506|    NULL|  543.1|
|2019-10-01 00:00:01|      view|   1307067|  lenovo| 251.74|
|2019-10-01 00:00:04|      view|   1004237|   apple|1081.98|
+-------------------+----------+----------+--------+-------+
only showing top 5 rows


In [0]:
df_oct.select("event_type").distinct().show()
df_oct.select("brand").distinct().show()

+----------+
|event_type|
+----------+
|  purchase|
|      cart|
|      view|
+----------+

+---------+
|    brand|
+---------+
| coolfort|
|  caprice|
|    daiwa|
|   casper|
|   sunday|
| marshall|
|     skad|
|      ivt|
| willmark|
|milavitsa|
|  ersport|
|    grohe|
|    daisy|
|  riviera|
|    ballu|
|    trebl|
|   carver|
|     NULL|
|   a-case|
|    kugoo|
+---------+
only showing top 20 rows


In [0]:
df_oct.filter((df_oct.event_type == "purchase") & (df_oct.price > 1000)).show()

+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------+-------+---------+--------------------+
|2019-10-01 02:23:25|  purchase|   1005135|2053013555631882655|electronics.smart...|  apple|1747.79|515384420|7f82b450-6c45-434...|
|2019-10-01 02:31:01|  purchase|   1004238|2053013555631882655|electronics.smart...|  apple| 1206.4|555462711|38c6d3f7-6c32-4fe...|
|2019-10-01 02:31:46|  purchase|   1005124|2053013555631882655|electronics.smart...|  apple|1634.51|518599877|839e2764-11b1-4f8...|
|2019-10-01 02:32:38|  purchase|   1005105|2053013555631882655|electronics.smart...|  apple|1415.48|518463311|8b7b4848-d031-47b...|
|2019-10-01 02:34:23|  purchase|   1005116|2053013555631882655|electronics.s

*Top 10 brand name by total revenue*

In [0]:
from pyspark.sql.functions import sum, count, col

print(f'Top 10 brand name by total revenue')
df_oct.filter(df_oct.event_type == "purchase") \
  .groupBy("brand") \
  .agg(sum("price").alias("total_revenue")) \
  .orderBy("total_revenue", ascending=False) \
  .show(10)


Top 10 brand name by total revenue
+-------+--------------------+
|  brand|       total_revenue|
+-------+--------------------+
|  apple|1.1120926881999876E8|
|samsung| 4.640753260999967E7|
| xiaomi|   9194033.289999982|
|   NULL|   8540601.029999994|
| huawei|   4883421.740000001|
|   acer|  3576719.5199999986|
|     lg|   3387887.959999997|
|lucente|   3124113.370000003|
|   sony|  2478196.6800000006|
|   oppo|   2412959.759999997|
+-------+--------------------+
only showing top 10 rows


*Number of purchases per category*

In [0]:
df_oct.filter(df_oct.event_type == "purchase") \
  .groupBy("category_code") \
  .agg(count("category_code").alias("Purchase_count")) \
  .orderBy(col("Purchase_count").desc()) \
  .show(10)

+--------------------+--------------+
|       category_code|Purchase_count|
+--------------------+--------------+
|electronics.smart...|        338018|
|electronics.audio...|         30503|
|electronics.video.tv|         21565|
|  electronics.clocks|         17906|
|appliances.kitche...|         16148|
|  computers.notebook|         15590|
|appliances.enviro...|         12378|
|appliances.kitche...|         11218|
|  electronics.tablet|          5603|
|auto.accessories....|          4647|
+--------------------+--------------+
only showing top 10 rows


*Exporting the result*

In [0]:
Brand_By_Revenue_df = \
   df_oct.filter(df_oct.event_type == "purchase") \
  .groupBy("brand") \
  .agg(sum("price").alias("total_revenue")) \
  .orderBy("total_revenue", ascending=False)


In [0]:
  Brand_By_Revenue_df.write.mode("overwrite").saveAsTable("Brand_Revenue")

In [0]:
# Check the table
spark.read.table("Brand_Revenue").show()

+--------+--------------------+
|   brand|       total_revenue|
+--------+--------------------+
|   apple|1.1120926881999876E8|
| samsung| 4.640753260999967E7|
|  xiaomi|   9194033.289999982|
|    NULL|   8540601.029999994|
|  huawei|   4883421.740000001|
|    acer|  3576719.5199999986|
|      lg|   3387887.959999997|
| lucente|   3124113.370000003|
|    sony|  2478196.6800000006|
|    oppo|   2412959.759999997|
|  lenovo|  1752638.5300000003|
| indesit|  1250060.9500000004|
|   bosch|  1248729.0900000003|
|      hp|  1227215.9900000002|
|   artel|  1034152.4099999999|
|    asus|   970019.3899999994|
|    beko|   963940.5900000005|
|   haier|   892047.6899999996|
|dauscher|   608437.0199999997|
|   canon|           561658.19|
+--------+--------------------+
only showing top 20 rows
