## Big data Test Notebook

This notebook demonstrates how to connect to Spark and Hive, load data, and perform analysis.

## 1. Initialize Spark Session with Hive Support

In [1]:
import os
print(os.environ.get('SPARK_HOME'))
os.environ['SPARK_HOME'] = '/usr/local/spark'
print(os.environ.get('SPARK_HOME'))

/opt/bitnami/spark
/usr/local/spark


In [2]:
# First, stop any existing SparkContext
try:
    from pyspark import SparkContext
    sc = SparkContext.getOrCreate()
    sc.stop()
    print("Stopped existing SparkContext")
except Exception as e:
    print(f"No existing SparkContext to stop or error occurred: {e}")

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, explode, lit, array
import time
import os


# Create a Spark session with explicit cluster configuration
spark = SparkSession.builder \
    .appName("Explicit Spark Job Test") \
    .master("yarn") \
    .config("spark.driver.host", "jupyter") \
    .config("spark.submit.deployMode", "client") \
    .config("spark.driver.memory", "1g") \
    .config("spark.yarn.am.memory", "1g") \
    .config("spark.yarn.am.cores", "1") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", "1") \
    .config("spark.default.parallelism", "10") \
    .config("spark.sql.shuffle.partitions", "10") \
    .enableHiveSupport() \
    .getOrCreate()

print(f"Spark version: {spark.version}")
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")


Stopped existing SparkContext
Spark version: 3.3.0
Spark UI: http://jupyter:4040


### Example 1

In [3]:

# Create a large dataset to force distributed processing
print("Creating large dataset...")
df = spark.range(0, 1000000, 1, 10)  # 1 million rows with 10 partitions

# Add more columns to make the dataset larger
df = df.withColumn("random_value", rand()) \
       .withColumn("double_value", col("id") * 2) \
       .withColumn("array_col", array([lit(i) for i in range(10)]))

# Force materialization with cache
print("Caching dataset...")
df.cache()

# Force evaluation with count
print(f"Dataset count: {df.count()}")

# Perform a complex operation that will definitely use the cluster
print("Running aggregation job...")
result = df.groupBy(df.id % 100).count()

# Force shuffle with repartition
result = result.repartition(20)

# Collect results to trigger job execution
print("Collecting results...")
collected = result.collect()
print(f"Result size: {len(collected)}")

# Perform a join operation (this will trigger another job)
print("Running join operation...")
df2 = spark.range(0, 100, 1, 5)
joined = df.join(df2, df.id % 100 == df2.id, "inner")

# Force execution with show
print("Sample of joined data:")
joined.show(5)

# Sleep to keep the application alive so you can check the UI
print("Job completed. Sleeping for 30 seconds so you can check the Spark UI...")
time.sleep(30)

# Stop the Spark session
print("Stopping Spark session...")
spark.stop()
print("Done.")


Creating large dataset...
Caching dataset...
Dataset count: 1000000
Running aggregation job...
Collecting results...
Result size: 100
Running join operation...
Sample of joined data:
+---+-------------------+------------+--------------------+---+
| id|       random_value|double_value|           array_col| id|
+---+-------------------+------------+--------------------+---+
|  0| 0.8990252057024203|           0|[0, 1, 2, 3, 4, 5...|  0|
|  1| 0.6316103078771831|           2|[0, 1, 2, 3, 4, 5...|  1|
|  2|0.26705630323001883|           4|[0, 1, 2, 3, 4, 5...|  2|
|  3|0.38013733978644937|           6|[0, 1, 2, 3, 4, 5...|  3|
|  4| 0.6930249579784798|           8|[0, 1, 2, 3, 4, 5...|  4|
+---+-------------------+------------+--------------------+---+
only showing top 5 rows

Job completed. Sleeping for 30 seconds so you can check the Spark UI...
Stopping Spark session...
Done.


### Example 2

In [2]:
# # Create a Spark session with explicit cluster configuration
# spark = SparkSession.builder \
#     .appName("Explicit Spark Job Test") \
#     .master("yarn") \
#     .config("spark.driver.host", "jupyter") \
#     .config("spark.submit.deployMode", "client") \
#     .config("spark.driver.memory", "1g") \
#     .config("spark.yarn.am.memory", "1g") \
#     .config("spark.yarn.am.cores", "1") \
#     .config("spark.executor.memory", "1g") \
#     .config("spark.executor.cores", "1") \
#     .config("spark.default.parallelism", "10") \
#     .config("spark.sql.shuffle.partitions", "10") \
#     .enableHiveSupport() \
#     .getOrCreate()

# print(f"Spark version: {spark.version}")
# print(f"Spark UI: {spark.sparkContext.uiWebUrl}")

NameError: name 'SparkSession' is not defined

In [3]:
from pyspark.sql.functions import col, to_date, count
import os

df = spark.read.parquet("hdfs://namenode:9000/data/raw/nyc_trip_data/*.parquet")

df_processed = df.withColumn("date", to_date(col("tpep_pickup_datetime"))) \
                 .groupBy("date") \
                 .agg(count("*").alias("num_trips"))

In [4]:
df.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2024-01-01 00:57:55|  2024-01-01 01:17:43|              1|         1.72|         1|                 N|         186|          79|           2|       17.7|  1.0|    0.5|       0.

In [5]:
df.count()

9554778

In [6]:
from pyspark.sql.types import IntegerType, DecimalType
from pyspark.sql.functions import col, to_date, avg, sum, count

# Transform: Aggregate revenue metrics
fact_revenue = df.withColumn("date", to_date(col("tpep_pickup_datetime"))) \
    .withColumn("payment_type_id", col("Payment_type").cast(IntegerType())) \
    .groupBy("date", "payment_type_id") \
    .agg(
        sum(col("Fare_amount").cast(DecimalType(10, 2))).alias("total_fare"),
        sum(col("Tip_amount").cast(DecimalType(10, 2))).alias("total_tips"),
        avg(col("Fare_amount").cast(DecimalType(10, 2))).alias("avg_fare_per_trip"),
        count("*").alias("transaction_count")
    )

# Dimension table: Payment types (from NYC TLC data dictionary)
dim_payment_type_data = [
    (1, "Credit Card"),
    (2, "Cash"),
    (3, "No Charge"),
    (4, "Dispute"),
    (5, "Unknown"),
    (6, "Voided Trip")
]

dim_payment_type = spark.createDataFrame(dim_payment_type_data, ["payment_type_id", "payment_desc"])

In [7]:
dim_payment_type.show()

+---------------+------------+
|payment_type_id|payment_desc|
+---------------+------------+
|              1| Credit Card|
|              2|        Cash|
|              3|   No Charge|
|              4|     Dispute|
|              5|     Unknown|
|              6| Voided Trip|
+---------------+------------+



In [8]:
fact_revenue.show()

+----------+---------------+----------+----------+-----------------+-----------------+
|      date|payment_type_id|total_fare|total_tips|avg_fare_per_trip|transaction_count|
+----------+---------------+----------+----------+-----------------+-----------------+
|2024-01-13|              2| 255818.15|     16.68|        16.735454|            15286|
|2024-01-13|              3|   3868.72|     21.21|         6.363026|              608|
|2024-01-15|              4|   1213.51|     87.45|         1.072005|             1132|
|2024-01-19|              1|1341188.40| 304211.09|        17.573454|            76319|
|2024-01-23|              3|   3833.31|      0.00|         6.784619|              565|
|2024-01-15|              1|1218745.17| 272509.15|        19.892684|            61266|
|2024-01-18|              2| 266067.65|     23.78|        17.543693|            15166|
|2024-01-22|              4|   1684.57|     71.35|         1.266594|             1330|
|2024-01-22|              3|   4157.00|    

In [73]:
# Stop the Spark session
spark.stop()