# Reading DAGs

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Explicit alias for the functions module; cells below reference `F.<name>`.
from pyspark.sql import functions as F

<h1> Topics </h1>

1. Reading Files (parquet)
2. Narrow Operations
   - `filter`
   - `withColumn`: adding/modifying a column
   - `select`: selecting relevant columns
3. Wide Operations
   - Joins
     - Sort Merge Join
     - Broadcast Join
   - GroupBy
     - `count`
     - `sum`
     - `countDistinct`

# Reading File

In [0]:
%run ../PySpark/DatasetSourcePath

In [0]:
# Path of the transactions dataset under the shared dataset root.
# `sourcePath` is presumably defined by the `%run` cell above — confirm.
transactions_file = sourcePath + "/dataset/data_skew/transactions.parquet"
# Same read as `spark.read.parquet(...)`, spelled with the generic reader API.
df_transactions = spark.read.format("parquet").load(transactions_file)

In [0]:
# Display how many partitions back the transactions DataFrame's underlying RDD.
df_transactions.rdd.getNumPartitions()

In [0]:
# Label the job so it is easy to locate in the Spark UI.
# NOTE(review): the description presumably stays in effect for jobs triggered by
# later cells until it is set again — confirm, and relabel later actions if needed.
sc.setJobDescription("Show Action")
# Print the first 5 rows without truncating long column values.
df_transactions.show(n=5, truncate=False)

## Switch to Spark UI 
> **Note** A `batch` refers to a group of rows that are processed together.

In [0]:
# Path of the customers dataset under the shared dataset root.
# `sourcePath` is presumably defined by the `%run` cell earlier — confirm.
customers_file = sourcePath + "/dataset/data_skew/customers.parquet"
# Same read as `spark.read.parquet(...)`, spelled with the generic reader API.
df_customers = spark.read.format("parquet").load(customers_file)

In [0]:
# Print the first 5 customer rows without truncating long column values.
df_customers.show(n=5, truncate=False)

# Narrow Transformations
- `filter` rows where `city = 'boston'`
- `withColumn` to add new columns: `first_name` and `last_name`
- `withColumn` to alter an existing column: add 5 to the `age` column
- `select` the relevant columns

In [0]:
# Chain of the transformations this notebook labels as narrow:
#   1. keep customers whose city is "boston",
#   2. derive `first_name` / `last_name` from the space-separated `name`,
#   3. add 5 to `age`,
#   4. project the columns of interest.
# Functions are referenced through the `F` alias, matching the join and
# groupBy cells further down, instead of relying on wildcard-imported names.
df_narrow_transform = (
    df_customers
    .filter(F.col("city") == "boston")
    # Items 0 and 1 of the split are the first and second space-separated tokens.
    .withColumn("first_name", F.split("name", " ").getItem(0))
    .withColumn("last_name", F.split("name", " ").getItem(1))
    .withColumn("age", F.col("age") + F.lit(5))
    .select("cust_id", "first_name", "last_name", "age", "gender", "birthday")
)

# Trigger execution through the `noop` sink so the plan shows up in the Spark UI.
# NOTE(review): the path argument looks unused by the `noop` format — confirm.
df_narrow_transform.write.format("noop").mode("overwrite").save("/df_narrow_transform.parquet")

In [0]:
# Print the first 7 transformed rows without truncating long column values.
df_narrow_transform.show(n=7, truncate=False)

In [0]:
# Customers older than 50. `age` is cast to int before the comparison.
# Uses the `F` alias for consistency with the join and groupBy cells.
df_customer_gt_50 = (
    df_customers
    .filter(F.col("age").cast("int") > 50)
)
# Trigger execution through the `noop` sink so the plan shows up in the Spark UI.
# NOTE(review): the path argument looks unused by the `noop` format — confirm.
df_customer_gt_50.write.format("noop").mode("overwrite").save("/df_customer_gt_50.parquet")

# Wide Transformations
1. Joins
   - Sort Merge Join
   - Broadcast Join
2. GroupBy
   - `count`
   - `countDistinct`
   - `sum`

## 1. Joins

### Sort Merge Join

In [0]:
# Set the auto-broadcast threshold to -1 so the join below is not automatically
# planned as a broadcast join (this section demonstrates Sort Merge Join).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [0]:
# Inner join of transactions with customers on the shared `cust_id` column.
df_joined = df_transactions.join(df_customers, on="cust_id", how="inner")

In [0]:
# Trigger execution of the join through the `noop` sink so its DAG can be
# inspected in the Spark UI.
# NOTE(review): the path argument looks unused by the `noop` format — confirm.
df_joined.write.format("noop").mode("overwrite").save("/df_joined.parquet")

### Broadcast Join

In [0]:
# Re-enable automatic broadcast joins with a 10 MiB threshold (10485760 bytes).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 10 * 1024 * 1024)

In [0]:
# Inner join on `cust_id`, with the customers side explicitly marked for
# broadcast via the `F.broadcast` hint.
df_broadcast_joined = df_transactions.join(
    F.broadcast(df_customers), on="cust_id", how="inner"
)

In [0]:
# Trigger execution of the broadcast join through the `noop` sink so its DAG
# can be inspected in the Spark UI.
# NOTE(review): the path argument looks unused by the `noop` format — confirm.
df_broadcast_joined.write.format("noop").mode("overwrite").save("../data/test/df_broadcast_joined.parquet")

## 2. GroupBy

### GroupBy Count

In [0]:
# Number of transaction rows per city.
df_city_counts = df_transactions.groupBy("city").count()

In [0]:
# Print 5 of the per-city counts without truncating column values.
df_city_counts.show(n=5, truncate=False)

In [0]:
# Total of `amt` per city, exposed as `txn_amt`.
df_txn_amt_city = df_transactions.groupBy("city").agg(
    F.sum("amt").alias("txn_amt")
)

In [0]:
# Print 5 of the per-city totals without truncating column values.
df_txn_amt_city.show(n=5, truncate=False)

### GroupBy Count Distinct 

In [0]:
# Number of distinct `txn_id` values per city, exposed as `txn_count`.
df_txn_per_city = df_transactions.groupBy("city").agg(
    F.countDistinct("txn_id").alias("txn_count")
)

In [0]:
# Print 5 of the per-city distinct transaction counts without truncating column values.
df_txn_per_city.show(n=5, truncate=False)

In [0]:
# Stop the Spark session at the end of the walkthrough.
# NOTE(review): on a shared or managed cluster this may also end the session for
# other notebooks attached to it — confirm this is intended before running.
spark.stop()