## Big data Test Notebook

This notebook demonstrates how to connect to Spark and Hive, load data, and perform analysis.

## 1. Initialize Spark Session with Hive Support

In [1]:
import os
print(os.environ.get('SPARK_HOME'))
os.environ['SPARK_HOME'] = '/usr/local/spark'
print(os.environ.get('SPARK_HOME'))

/opt/bitnami/spark
/usr/local/spark


In [78]:
!pip install pydeequ



In [51]:
# First, stop any existing SparkContext
try:
    from pyspark import SparkContext
    sc = SparkContext.getOrCreate()
    sc.stop()
    print("Stopped existing SparkContext")
except Exception as e:
    print(f"No existing SparkContext to stop or error occurred: {e}")

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, explode, lit, array
import time
import os


# Create a Spark session with explicit cluster configuration
spark = SparkSession.builder \
    .appName("Explicit Spark Job Test") \
    .master("yarn") \
    .config("spark.driver.host", "jupyter") \
    .config("spark.submit.deployMode", "client") \
    .config("spark.driver.memory", "4g") \
    .config("spark.yarn.am.memory", "1g") \
    .config("spark.yarn.am.cores", "1") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .config("spark.default.parallelism", "10") \
    .config("spark.sql.shuffle.partitions", "10") \
    .config("spark.sql.caseSensitive", "false") \
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse") \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

print(f"Spark version: {spark.version}")
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")


Stopped existing SparkContext
Spark version: 3.3.0
Spark UI: http://jupyter:4040


In [52]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, TimestampType, DataType, LongType

unified_schema = StructType([
    StructField("VendorID", LongType(), True), # int, bigint -> long
    StructField("tpep_pickup_datetime", TimestampType(), True),
    StructField("tpep_dropoff_datetime", TimestampType(), True),
    # Đọc passenger_count là double trước cho an toàn (vì có thể là double), sau đó ép về long
    StructField("passenger_count", DoubleType(), True),
    StructField("trip_distance", DoubleType(), True),
    # Đọc RatecodeID là double trước cho an toàn (vì có thể là double), sau đó ép về long
    StructField("RatecodeID", DoubleType(), True),
    StructField("store_and_fwd_flag", StringType(), True),
    StructField("PULocationID", LongType(), True), # int, bigint -> long
    StructField("DOLocationID", LongType(), True), # int, bigint -> long
    StructField("payment_type", LongType(), True), # bigint -> long
    StructField("fare_amount", DoubleType(), True),
    StructField("extra", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True),
    StructField("improvement_surcharge", DoubleType(), True),
    StructField("total_amount", DoubleType(), True),
    StructField("congestion_surcharge", DoubleType(), True),
    # Đọc airport_fee là double, xử lý cả Airport_fee và airport_fee
    # Spark thường tự xử lý case-insensitivity khi áp schema cho Parquet,
    # nhưng chúng ta chỉ định nghĩa 'airport_fee' (viết thường)
    StructField("airport_fee", DoubleType(), True),
])

In [12]:
import time
from collections import defaultdict

hdfs_path_str = "hdfs://namenode:9000/data/raw/nyc_trip/"


# Lấy danh sách file Parquet bằng Hadoop API của Spark
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
URI = spark.sparkContext._jvm.java.net.URI
Path = spark.sparkContext._jvm.org.apache.hadoop.fs.Path
FileSystem = spark.sparkContext._jvm.org.apache.hadoop.fs.FileSystem

fs = FileSystem.get(URI(hdfs_path_str), hadoop_conf)
status_list = fs.listStatus(Path(hdfs_path_str))

parquet_files = []
if status_list:
    for status in status_list:
        file_path = status.getPath().toString()
        if file_path.endswith(".parquet") and not status.isDirectory() and not status.getPath().getName().startswith("_"):
            parquet_files.append(file_path)
else:
    print(f"Không tìm thấy file nào trong {hdfs_path_str}")

print(f"Tìm thấy {len(parquet_files)} file Parquet hợp lệ.")

# Sử dụng defaultdict(set) để tự động tạo set khi gặp key mới
all_column_types = defaultdict(set)
files_with_errors = []
start_time = time.time()

if parquet_files:
    print(f"\nBắt đầu quét schema của tất cả các cột trong các file bằng Spark:")
    for i, file_path in enumerate(parquet_files):
        print(f"  [{i+1}/{len(parquet_files)}] Đang xử lý file: {file_path.split('/')[-1]} ...", end="")
        try:
            schema = spark.read.parquet(file_path).schema
            processed_cols_count = 0
            # Lặp qua tất cả các field (cột) trong schema của file này
            for field in schema.fields:
                col_name = field.name
                if isinstance(field.dataType, DataType): # Kiểm tra cơ bản kiểu dữ liệu hợp lệ
                    spark_type_name = field.dataType.simpleString()
                    # Thêm kiểu dữ liệu vào set tương ứng với tên cột
                    all_column_types[col_name].add(spark_type_name)
                    processed_cols_count += 1

            print(f" -> Đã xử lý {processed_cols_count} cột.")

        except Exception as e:
            # Ghi nhận lỗi nhưng tiếp tục với file tiếp theo
            error_msg = f"{type(e).__name__}: {e}"
            print(f" -> LỖI ĐỌC FILE: {error_msg}")
            files_with_errors.append((file_path, error_msg))

    end_time = time.time()
    print(f"\nHoàn thành quét {len(parquet_files)} file trong {end_time - start_time:.2f} giây.")

# In kết quả tổng hợp
print("\n--- Tổng hợp kiểu dữ liệu cho từng cột ---")
if all_column_types:
    # Sắp xếp tên cột theo alphabet để dễ nhìn
    for col_name in sorted(all_column_types.keys()):
        # Lấy danh sách các kiểu duy nhất và sắp xếp chúng
        sorted_types = sorted(list(all_column_types[col_name]))
        print(f"- {col_name}: {', '.join(sorted_types)}")
else:
     print("Không tìm thấy cột nào hoặc tất cả các file đều lỗi.")

Tìm thấy 60 file Parquet hợp lệ.

Bắt đầu quét schema của tất cả các cột trong các file bằng Spark:
  [1/60] Đang xử lý file: yellow_tripdata_2020-01.parquet ... -> Đã xử lý 19 cột.
  [2/60] Đang xử lý file: yellow_tripdata_2020-02.parquet ... -> Đã xử lý 19 cột.
  [3/60] Đang xử lý file: yellow_tripdata_2020-03.parquet ... -> Đã xử lý 19 cột.
  [4/60] Đang xử lý file: yellow_tripdata_2020-04.parquet ... -> Đã xử lý 19 cột.
  [5/60] Đang xử lý file: yellow_tripdata_2020-05.parquet ... -> Đã xử lý 19 cột.
  [6/60] Đang xử lý file: yellow_tripdata_2020-06.parquet ... -> Đã xử lý 19 cột.
  [7/60] Đang xử lý file: yellow_tripdata_2020-07.parquet ... -> Đã xử lý 19 cột.
  [8/60] Đang xử lý file: yellow_tripdata_2020-08.parquet ... -> Đã xử lý 19 cột.
  [9/60] Đang xử lý file: yellow_tripdata_2020-09.parquet ... -> Đã xử lý 19 cột.
  [10/60] Đang xử lý file: yellow_tripdata_2020-10.parquet ... -> Đã xử lý 19 cột.
  [11/60] Đang xử lý file: yellow_tripdata_2020-11.parquet ... -> Đã xử lý 19 c

In [53]:
from pyspark.sql.functions import col, year, month

input_path = "hdfs://namenode:9000/data/raw/nyc_trip/"

db_name = "raw"
table_name = "nyc_trips_raw"
output_table = f"{db_name}.{table_name}"

# HDFS base path for the 'raw' database (as seen in your psql output)
db_location = f"hdfs://namenode:9000/user/hive/warehouse/{db_name}.db"
output_path = f"{db_location}/{table_name}"

# Partition columns (ensure these exist in the data or are derived below)
partition_columns = ["year", "month"]

# Column containing the date/timestamp information to derive year/month
# Adjust this to the actual column name in your parquet files
timestamp_column = "tpep_pickup_datetime" # Example: common in NYC Taxi dataset

print(f"Spark Session created. Spark UI: {spark.sparkContext.uiWebUrl}")
print(f"Using Hive Metastore: {spark.conf.get('spark.sql.catalogImplementation') == 'hive'}")

# --- Create Hive Database (if not exists) ---
# Ensure the database exists with the correct HDFS location
print(f"Ensuring Hive database '{db_name}' exists at location '{db_location}'...")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {db_name}")
print(f"Using database '{db_name}'")
spark.sql(f"USE {db_name}")

# --- Read Raw Data ---
print(f"Reading Parquet files from: {input_path}")
try:
    df_raw = spark.read.schema(unified_schema).parquet(input_path)
    print("Successfully read raw data. Schema:")
    df_raw.printSchema()
    print(f"Number of rows read: {df_raw.count()}")
except Exception as e:
    print(f"ERROR: Failed to read Parquet files from {input_path}. {e}")
    # spark.stop()
    exit(1)

Spark Session created. Spark UI: http://jupyter:4040
Using Hive Metastore: True
Ensuring Hive database 'raw' exists at location 'hdfs://namenode:9000/user/hive/warehouse/raw.db'...
Using database 'raw'
Reading Parquet files from: hdfs://namenode:9000/data/raw/nyc_trip/
Successfully read raw data. Schema:
root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharg

In [54]:
# --- Data Transformation: Final Casting and Partitioning ---
print("Starting final transformations: casting specific columns and deriving partitions...")
df_transformed = df_raw

# Đảm bảo VendorID là LongType
if "VendorID" in df_transformed.columns:
    print("- Casting 'VendorID' to LongType...")
    df_transformed = df_transformed.withColumn("VendorID", col("VendorID").cast(LongType()))
    
# Ép kiểu các cột đã đọc dưới dạng double về long nếu cần
columns_to_cast_to_long = ["passenger_count", "RatecodeID"]
for col_name in columns_to_cast_to_long:
    if col_name in df_transformed.columns:
        print(f"- Casting '{col_name}' from double to long...")
        # Có thể thêm xử lý lỗi nếu giá trị không thể ép kiểu (ví dụ: giá trị không nguyên)
        df_transformed = df_transformed.withColumn(col_name, col(col_name).cast(LongType()))
    else:
        print(f"- Column '{col_name}' not found for final casting, skipping.")
        
# Derive Partition Columns
print(f"Deriving partition columns '{partition_columns}' from '{timestamp_column}'...")
if timestamp_column in df_transformed.columns:
    # Đảm bảo cột timestamp thực sự là timestamp trước khi dùng year/month
    if isinstance(df_transformed.schema[timestamp_column].dataType, TimestampType):
        df_final = df_transformed.withColumn("year", year(col(timestamp_column))) \
                                 .withColumn("month", month(col(timestamp_column)))
        print("Derived columns added.")
    else:
        print(f"ERROR: Column '{timestamp_column}' is not TimestampType after read. Actual type: {df_transformed.schema[timestamp_column].dataType}")
        #spark.stop()
        exit(1)

else:
    print(f"ERROR: Timestamp column '{timestamp_column}' not found for partitioning.")
    #spark.stop()
    exit(1)


Starting final transformations: casting specific columns and deriving partitions...
- Casting 'VendorID' to LongType...
- Casting 'passenger_count' from double to long...
- Casting 'RatecodeID' from double to long...
Deriving partition columns '['year', 'month']' from 'tpep_pickup_datetime'...
Derived columns added.


In [55]:
df_final.printSchema()
df_final.count()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



174689444

In [64]:
df_final.show(5)

Py4JJavaError: An error occurred while calling o1489.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 16.0 failed 4 times, most recent failure: Lost task 0.3 in stage 16.0 (TID 107) (71fe71010aed executor 1): org.apache.spark.sql.execution.QueryExecutionException: Parquet column cannot be converted in file hdfs://namenode:9000/data/raw/nyc_trip/yellow_tripdata_2020-01.parquet. Column: [airport_fee], Expected: double, Found: INT32
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:706)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:278)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:553)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1125)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:187)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:316)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:212)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:274)
	... 20 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:506)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:459)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3868)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3084)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:288)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:327)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.sql.execution.QueryExecutionException: Parquet column cannot be converted in file hdfs://namenode:9000/data/raw/nyc_trip/yellow_tripdata_2020-01.parquet. Column: [airport_fee], Expected: double, Found: INT32
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:706)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:278)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:553)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1125)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:187)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:316)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:212)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:274)
	... 20 more


In [58]:
# --- Write Partitioned Data to Hive Table ---
print(f"Writing data to Hive table '{output_table}' partitioned by {partition_columns}...")
print(f"(Data will be stored in HDFS under: {output_path}/*)")
try:
    df_final.write \
        .partitionBy(partition_columns) \
        .mode("overwrite") \
        .format("parquet") \
        .saveAsTable(output_table) # This registers the table in Hive Metastore
    print(f"Successfully wrote partitioned data to Hive table '{output_table}'.")
except Exception as e:
    print(f"ERROR: Failed to write data to Hive table '{output_table}'. {e}")
    # spark.stop()
    exit(1)

# --- Verification (Optional) ---
print("\n--- Verification ---")
print(f"Listing tables in database '{db_name}':")
spark.sql(f"SHOW TABLES IN {db_name}").show()

print(f"\nDescribing table '{output_table}':")
spark.sql(f"DESCRIBE FORMATTED {output_table}").show(truncate=False, n=100)

print(f"\nShowing partitions for table '{output_table}':")
spark.sql(f"SHOW PARTITIONS {output_table}").show(50, truncate=False)

print(f"\nQuerying sample data from '{output_table}':")
spark.sql(f"SELECT * FROM {output_table} WHERE year = <some_year> AND month = <some_month> LIMIT 5").show(truncate=False) # Replace <some_year> and <some_month> with actual values

# # --- Stop Spark Session ---
# print("Stopping Spark Session.")
# spark.stop()
# print("Script finished.")

Writing data to Hive table 'raw.nyc_trips_raw' partitioned by ['year', 'month']...
(Data will be stored in HDFS under: hdfs://namenode:9000/user/hive/warehouse/raw.db/nyc_trips_raw/*)
ERROR: Failed to write data to Hive table 'raw.nyc_trips_raw'. An error occurred while calling o1516.saveAsTable.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:638)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:278)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
	at org.apache.spark.sql.execution.datasources.DataSource.writeAndRead(DataSource.scala:538)
	at org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.saveDataIntoTable(createDataSourceTables.scala:228)
	at org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.run(createDataSour

AnalysisException: Table or view not found: raw.nyc_trips_raw; line 1 pos 19;
'DescribeRelation true, [col_name#3478, data_type#3479, comment#3480]
+- 'UnresolvedTableOrView [raw, nyc_trips_raw], DESCRIBE TABLE, true


In [65]:
from pyspark.sql.functions import col, min, max, count, when, lit # Import các hàm cần thiết
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, TimestampType, DataType

APP_NAME = "NYC Taxi Processing Pipeline"
DATA_PATH =  hdfs_path_str
# Tùy chọn: Giới hạn số lượng dòng hiển thị cho các ví dụ lỗi/dữ liệu
SAMPLE_ROW_LIMIT = 10

def analyze_range(df, column_name):
    """Phân tích min/max của một cột số."""
    print(f"Analyzing range for column: {column_name}")
    try:
        result = df.agg(
            min(col(column_name)).alias(f"min_{column_name}"),
            max(col(column_name)).alias(f"max_{column_name}")
        ).first() # Lấy dòng kết quả đầu tiên
        if result:
            min_val = result[f"min_{column_name}"]
            max_val = result[f"max_{column_name}"]
            print(f" -> Min: {min_val}, Max: {max_val}")
            return min_val, max_val
        else:
            print(" -> No data found for range analysis.")
            return None, None
    except Exception as e:
        print(f" -> ERROR analyzing range for {column_name}: {e}")
        return None, None

def check_cast_issues(df_raw, df_processed, column_name, original_type="double", target_type="long"):
    """Kiểm tra các vấn đề khi ép kiểu từ kiểu gốc sang kiểu đích."""
    print(f"Checking casting issues for: {column_name} ({original_type} -> {target_type})")
    raw_col = df_raw.alias("raw").col(column_name)
    processed_col = df_processed.alias("processed").col(column_name)
    
    # Tham gia df gốc và df đã xử lý để so sánh trên cùng một bản ghi (sử dụng ID duy nhất nếu có, hoặc dựa vào thứ tự nếu đảm bảo)
    # Nếu không có ID duy nhất, cách đơn giản hơn là đếm trực tiếp
    try:
        # Đếm các trường hợp giá trị gốc không null nhưng giá trị sau cast lại là null
        cast_null_count = df_processed.join(df_raw.selectExpr(f"{column_name} as raw_{column_name}", "*"), 
                                            taxi_df_raw.columns, # Join trên tất cả cột để đảm bảo khớp đúng dòng (có thể chậm)
                                            "inner") \
                                      .where(col(column_name).isNull() & col(f"raw_{column_name}").isNotNull()) \
                                      .count()

        # Hoặc cách đơn giản hơn nếu không join được: Đếm dựa trên so sánh trực tiếp (có thể không hoàn toàn chính xác nếu có null gốc)
        # cast_null_count = df_processed.filter(col(column_name).isNull())\
        #                               .join(df_raw.filter(col(column_name).isNotNull()).selectExpr(f"{column_name} as raw_{column_name}"), 
        #                                     ??? How to join without unique id ??? ) # This join is problematic
        
        # Cách tiếp cận thực tế hơn: Đếm null sau cast và so sánh với null trước cast
        null_after_cast = df_processed.where(col(column_name).isNull()).count()
        null_before_cast = df_raw.where(col(column_name).isNull()).count()
        
        # Ước tính số lượng lỗi cast:
        estimated_cast_errors = null_after_cast - null_before_cast
        
        print(f" -> Nulls before cast: {null_before_cast}")
        print(f" -> Nulls after cast: {null_after_cast}")
        if estimated_cast_errors > 0:
            print(f" -> !!! POTENTIAL CAST ERRORS DETECTED: Approximately {estimated_cast_errors} rows became null after casting.")
            # Hiển thị ví dụ các dòng gặp lỗi cast
            print(f"   -> Example rows from raw data that likely caused cast errors for '{column_name}':")
            # Tìm các giá trị không null ban đầu mà khi cast thì khác hoặc null
            potential_error_rows = df_raw.filter(
                col(column_name).isNotNull() & \
                (col(column_name).cast(target_type).isNull() | (col(column_name) != col(column_name).cast(target_type)))
            )
            potential_error_rows.select(column_name).distinct().show(SAMPLE_ROW_LIMIT, truncate=False)
            return estimated_cast_errors
        else:
            print(" -> No obvious casting errors detected (null count did not increase significantly).")
            return 0
    except Exception as e:
        print(f" -> ERROR checking cast issues for {column_name}: {e}")
        return -1 # Indicate error

In [66]:
print(f"\n--- Step 2: Read Raw Parquet Data from {DATA_PATH} ---")
try:
    taxi_df_raw = spark.read \
        .schema(unified_schema) \
        .parquet("hdfs://namenode:9000/data/raw/nyc_trip/*.parquet")

    print("Successfully read raw data.")
    raw_count = taxi_df_raw.count()
    print(f"Total records read: {raw_count}")
    print("Raw DataFrame Schema:")
    taxi_df_raw.printSchema()

except Exception as e:
    print(f"FATAL ERROR reading raw data: {e}")
    spark.stop()
    exit(1)

print("\n--- Step 3: Initial Cast (Double to Long) ---")
taxi_df_processed = taxi_df_raw \
    .withColumn("passenger_count_long", col("passenger_count").cast(LongType())) \
    .withColumn("RatecodeID_long", col("RatecodeID").cast(LongType()))

print("Initial casting applied.")
print("Schema after initial casting (new columns added):")
taxi_df_processed.printSchema()


--- Step 2: Read Raw Parquet Data from hdfs://namenode:9000/data/raw/nyc_trip/ ---
Successfully read raw data.
Total records read: 174689444
Raw DataFrame Schema:
root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: 

In [67]:
print("\n--- Step 4: Check Casting Issues ---")
passenger_cast_errors = check_cast_issues(taxi_df_raw.select("passenger_count"), 
                                           taxi_df_processed.select("passenger_count_long"), 
                                           "passenger_count", original_type="double", target_type="long")

ratecode_cast_errors = check_cast_issues(taxi_df_raw.select("RatecodeID"), 
                                          taxi_df_processed.select("RatecodeID_long"), 
                                          "RatecodeID", original_type="double", target_type="long")

# Xử lý các cột đã cast (ví dụ: giữ lại cột cast nếu không có lỗi, hoặc xử lý lỗi)
# Ở đây, chúng ta sẽ giữ lại cột đã cast và đổi tên/bỏ cột cũ
# Nếu có lỗi nghiêm trọng, bạn có thể quyết định dừng pipeline hoặc xử lý dòng lỗi
if passenger_cast_errors > 0 or ratecode_cast_errors > 0:
     print("WARNING: Casting errors detected. Review the problematic values shown above.")
     # Quyết định tiếp theo: Dừng? Ghi log lỗi? Gán giá trị mặc định?...
     # Ví dụ: Gán giá trị mặc định cho dòng lỗi cast
     # taxi_df_processed = taxi_df_processed.withColumn(
     #     "passenger_count", 
     #     when(col("passenger_count_long").isNull() & col("passenger_count").isNotNull(), lit(-1).cast(LongType())) # Gán -1 cho lỗi
     #     .otherwise(col("passenger_count_long"))
     # ) # Logic tương tự cho RatecodeID
     pass # Hiện tại chỉ cảnh báo





--- Step 4: Check Casting Issues ---
Checking casting issues for: passenger_count (double -> long)


AttributeError: 'DataFrame' object has no attribute 'col'

In [68]:
# Đổi tên cột đã cast thành công và bỏ cột double gốc
taxi_df_processed = taxi_df_processed \
                    .withColumn("passenger_count", col("passenger_count_long")) \
                    .withColumn("RatecodeID", col("RatecodeID_long")) \
                    .drop("passenger_count_long", "RatecodeID_long")
print("\nSchema after resolving initial cast columns:")
taxi_df_processed.printSchema()


# 6. Phân Tích Miền Giá Trị cho việc Tối Ưu Hóa
print("\n--- Step 5: Analyze Value Ranges for Optimization ---")
columns_to_analyze = [
    "VendorID", "passenger_count", "RatecodeID", 
    "PULocationID", "DOLocationID", "payment_type"
]
value_ranges = {}
for col_name in columns_to_analyze:
    min_val, max_val = analyze_range(taxi_df_processed, col_name)
    if min_val is not None and max_val is not None:
         value_ranges[col_name] = {"min": min_val, "max": max_val}
    else:
         print(f"Could not determine range for {col_name}, skipping optimization for this column.")


# 7. Ép Kiểu Tối Ưu Hóa
print("\n--- Step 6: Optimize Data Types ---")
taxi_df_optimized = taxi_df_processed # Bắt đầu từ df đã xử lý

# Logic ép kiểu dựa trên kết quả phân tích và kiến thức miền
# Ví dụ (Cần điều chỉnh dựa trên kết quả thực tế từ Step 5):
optimizations = {}
if "VendorID" in value_ranges:
    if value_ranges["VendorID"]["min"] >= 0 and value_ranges["VendorID"]["max"] <= 127:
         optimizations["VendorID"] = ByteType()
if "passenger_count" in value_ranges:
     # Thường là số nhỏ, dùng ShortType an toàn
    if value_ranges["passenger_count"]["min"] >= -32768 and value_ranges["passenger_count"]["max"] <= 32767:
         optimizations["passenger_count"] = ShortType()
if "RatecodeID" in value_ranges:
    # Mã rate thường rất nhỏ
    if value_ranges["RatecodeID"]["min"] >= -128 and value_ranges["RatecodeID"]["max"] <= 127:
        optimizations["RatecodeID"] = ByteType()
    elif value_ranges["RatecodeID"]["min"] >= -32768 and value_ranges["RatecodeID"]["max"] <= 32767:
         optimizations["RatecodeID"] = ShortType()
if "payment_type" in value_ranges:
     # Mã payment thường rất nhỏ
    if value_ranges["payment_type"]["min"] >= -128 and value_ranges["payment_type"]["max"] <= 127:
         optimizations["payment_type"] = ByteType()
    elif value_ranges["payment_type"]["min"] >= -32768 and value_ranges["payment_type"]["max"] <= 32767:
         optimizations["payment_type"] = ShortType()
if "PULocationID" in value_ranges and "DOLocationID" in value_ranges:
    # Location ID thường < 265
    max_loc = max(value_ranges["PULocationID"]["max"], value_ranges["DOLocationID"]["max"])
    min_loc = min(value_ranges["PULocationID"]["min"], value_ranges["DOLocationID"]["min"])
    if min_loc >= -32768 and max_loc <= 32767:
        optimizations["PULocationID"] = ShortType()
        optimizations["DOLocationID"] = ShortType()

if optimizations:
    print("Applying the following type optimizations:")
    for col_name, target_type in optimizations.items():
        print(f" - Casting '{col_name}' to {target_type}")
        taxi_df_optimized = taxi_df_optimized.withColumn(col_name, col(col_name).cast(target_type))

    print("\nSchema after Optimization:")
    taxi_df_optimized.printSchema()
else:
    print("No optimizations applied based on range analysis or defaults.")

# 8. Hiển thị Kết Quả Cuối Cùng
print("\n--- Step 7: Display Final Optimized Data ---")
print(f"Displaying first {SAMPLE_ROW_LIMIT} rows of the final optimized DataFrame:")
taxi_df_optimized.show(SAMPLE_ROW_LIMIT, truncate=False)


Schema after resolving initial cast columns:
root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)


--- Step 5: Analyze Value Ranges for Optimization ---
Analyzing range for column: VendorID
 -> 

NameError: name 'ByteType' is not defined

In [94]:
from pyspark.sql.functions import col, to_date, count
import os

df = spark.read.schema(unified_schema).option("mode", "PERMISSIVE").parquet("hdfs://namenode:9000/data/raw/nyc_trip/*.parquet")
#df = spark.read.parquet("hdfs://namenode:9000/data/raw/nyc_trip/*.parquet")


In [95]:
df.show(5)

Py4JJavaError: An error occurred while calling o4164.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 63) (34af8229d9c6 executor 1): org.apache.spark.sql.execution.QueryExecutionException: Parquet column cannot be converted in file hdfs://namenode:9000/data/raw/nyc_trip/yellow_tripdata_2020-01.parquet. Column: [passenger_count], Expected: bigint, Found: DOUBLE
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:706)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:278)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:553)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1125)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:187)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:316)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:212)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:274)
	... 20 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:506)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:459)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3868)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3084)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:288)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:327)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: org.apache.spark.sql.execution.QueryExecutionException: Parquet column cannot be converted in file hdfs://namenode:9000/data/raw/nyc_trip/yellow_tripdata_2020-01.parquet. Column: [passenger_count], Expected: bigint, Found: DOUBLE
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:706)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:278)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:553)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:364)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1125)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:187)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:316)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:212)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:274)
	... 20 more


In [87]:
df_processed = df.withColumn("date", to_date(col("tpep_pickup_datetime"))) \
                 .groupBy("date") \
                 .agg(count("*").alias("num_trips"))

print(df.sample(fraction=0.1).describe().show())

Py4JJavaError: An error occurred while calling o4093.describe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 327.0 failed 4 times, most recent failure: Lost task 4.3 in stage 327.0 (TID 1279) (34af8229d9c6 executor 1): org.apache.spark.sql.execution.QueryExecutionException: Parquet column cannot be converted in file hdfs://namenode:9000/data/raw/nyc_trip/yellow_tripdata_2024-11.parquet. Column: [DOLocationID], Expected: bigint, Found: INT32
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:706)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:278)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:553)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.aggregate.SortAggregateExec.$anonfun$doExecute$1(SortAggregateExec.scala:62)
	at org.apache.spark.sql.execution.aggregate.SortAggregateExec.$anonfun$doExecute$1$adapted(SortAggregateExec.scala:59)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:877)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:877)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1125)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:187)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:316)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:212)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:274)
	... 26 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.sql.execution.QueryExecutionException: Parquet column cannot be converted in file hdfs://namenode:9000/data/raw/nyc_trip/yellow_tripdata_2024-11.parquet. Column: [DOLocationID], Expected: bigint, Found: INT32
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:706)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:278)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:553)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at org.apache.spark.sql.execution.aggregate.SortAggregateExec.$anonfun$doExecute$1(SortAggregateExec.scala:62)
	at org.apache.spark.sql.execution.aggregate.SortAggregateExec.$anonfun$doExecute$1$adapted(SortAggregateExec.scala:59)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:877)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:877)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1125)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:187)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:316)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:212)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:274)
	... 26 more


In [39]:
df_processed.unpersist()

DataFrame[date: date, num_trips: bigint]

In [79]:
from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, Size, Completeness, Mean, CountDistinct, Histogram

os.environ["SPARK_VERSION"] = '3.3'
# Giả sử bạn có SparkSession 'spark' và DataFrame 'df'
analysisResult = AnalysisRunner(spark) \
    .onData(df) \
    .addAnalyzer(Size()) \
    .addAnalyzer(DataType("VendorID")) \
    .run()

analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show(truncate=False)

TypeError: 'JavaPackage' object is not callable

In [38]:
# ----- TỐI ƯU: Cache DataFrame -----
df_processed.cache()
# Lưu ý: Lệnh cache() cũng là lazy. Việc cache thực sự chỉ xảy ra khi một action được gọi trên df_processed.

# ----- Thực thi Action lần đầu (sẽ tính toán và cache) -----
print("Chạy lần đầu (tính toán và cache):")
start_time = time.time()
df_processed.show(10)
# Hoặc bạn có thể dùng một action khác không tốn kém để kích hoạt cache, ví dụ: df_processed.count()
# df_processed.count() # Action này sẽ buộc Spark tính toán và cache toàn bộ df_processed
print(f"Thời gian lần đầu: {time.time() - start_time:.2f} giây")

# ----- Thực thi Action lần thứ hai (sẽ đọc từ cache) -----
print("\nChạy lần thứ hai (đọc từ cache):")
start_time = time.time()
df_processed.show(10) # Lần này sẽ nhanh hơn đáng kể
print(f"Thời gian lần hai: {time.time() - start_time:.2f} giây")

Chạy lần đầu (tính toán và cache):
+----------+---------+
|      date|num_trips|
+----------+---------+
|2019-12-31|      129|
|2020-01-04|   182752|
|2020-01-16|   239657|
|2020-06-19|    22291|
|2020-07-31|    34555|
|2024-09-30|   104745|
|2024-10-08|   121402|
|2024-10-16|   134891|
|2024-11-01|   137690|
|2024-05-01|   121929|
+----------+---------+
only showing top 10 rows

Thời gian lần đầu: 9.13 giây

Chạy lần thứ hai (đọc từ cache):
+----------+---------+
|      date|num_trips|
+----------+---------+
|2019-12-31|      129|
|2020-01-04|   182752|
|2020-01-16|   239657|
|2020-06-19|    22291|
|2020-07-31|    34555|
|2024-09-30|   104745|
|2024-10-08|   121402|
|2024-10-16|   134891|
|2024-11-01|   137690|
|2024-05-01|   121929|
+----------+---------+
only showing top 10 rows

Thời gian lần hai: 0.11 giây


In [30]:
df.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2020-01-01 00:28:15|  2020-01-01 00:33:03|            1.0|          1.2|       1.0|                 N|         238|         239|           1|        6.0|  3.0|    0.5|      1.4

In [5]:
df.count()

174689444

In [6]:
from pyspark.sql.types import IntegerType, DecimalType
from pyspark.sql.functions import col, to_date, avg, sum, count

# Transform: Aggregate revenue metrics
fact_revenue = df.withColumn("date", to_date(col("tpep_pickup_datetime"))) \
    .withColumn("payment_type_id", col("Payment_type").cast(IntegerType())) \
    .groupBy("date", "payment_type_id") \
    .agg(
        sum(col("Fare_amount").cast(DecimalType(10, 2))).alias("total_fare"),
        sum(col("Tip_amount").cast(DecimalType(10, 2))).alias("total_tips"),
        avg(col("Fare_amount").cast(DecimalType(10, 2))).alias("avg_fare_per_trip"),
        count("*").alias("transaction_count")
    )

# Dimension table: Payment types (from NYC TLC data dictionary)
dim_payment_type_data = [
    (1, "Credit Card"),
    (2, "Cash"),
    (3, "No Charge"),
    (4, "Dispute"),
    (5, "Unknown"),
    (6, "Voided Trip")
]

dim_payment_type = spark.createDataFrame(dim_payment_type_data, ["payment_type_id", "payment_desc"])

In [7]:
dim_payment_type.show()

+---------------+------------+
|payment_type_id|payment_desc|
+---------------+------------+
|              1| Credit Card|
|              2|        Cash|
|              3|   No Charge|
|              4|     Dispute|
|              5|     Unknown|
|              6| Voided Trip|
+---------------+------------+



In [8]:
fact_revenue.show()

+----------+---------------+----------+----------+-----------------+-----------------+
|      date|payment_type_id|total_fare|total_tips|avg_fare_per_trip|transaction_count|
+----------+---------------+----------+----------+-----------------+-----------------+
|2024-01-13|              2| 255818.15|     16.68|        16.735454|            15286|
|2024-01-13|              3|   3868.72|     21.21|         6.363026|              608|
|2024-01-15|              4|   1213.51|     87.45|         1.072005|             1132|
|2024-01-19|              1|1341188.40| 304211.09|        17.573454|            76319|
|2024-01-23|              3|   3833.31|      0.00|         6.784619|              565|
|2024-01-15|              1|1218745.17| 272509.15|        19.892684|            61266|
|2024-01-18|              2| 266067.65|     23.78|        17.543693|            15166|
|2024-01-22|              4|   1684.57|     71.35|         1.266594|             1330|
|2024-01-22|              3|   4157.00|    

In [73]:
# Stop the Spark session
spark.stop()

In [60]:
# Time dimension
spark.sql("""
    CREATE TABLE IF NOT EXISTS curated.dim_date (
        date_id DATE,
        day INT,
        month INT,
        year INT,
        quarter INT,
        is_weekend BOOLEAN
    )
    STORED AS PARQUET
""")

# Store dimension
spark.sql("""
    CREATE TABLE IF NOT EXISTS curated.dim_store (
        store_id STRING,
        store_name STRING,
        region STRING 
    )
    STORED AS PARQUET
""")

# Product dimension
spark.sql("""
    CREATE TABLE IF NOT EXISTS curated.dim_product (
        product_id STRING,
        product_name STRING,
        category STRING
    )
    STORED AS PARQUET
""")

# Fact table
spark.sql("""
    CREATE TABLE IF NOT EXISTS curated.fact_sales (
        sales_id BIGINT,
        date_id DATE,
        store_id STRING,
        product_id STRING,
        unit_price DOUBLE,
        quantity INT,
        total_amount DOUBLE
    )
    PARTITIONED BY (year INT, month INT)
    STORED AS PARQUET
""")

# For our example, we'll insert sample data into dimension tables
# In a real scenario, these would be populated from the processed data with more logic

# Insert sample store data
spark.sql("""
    INSERT OVERWRITE TABLE curated.dim_store VALUES
    ('Store1', 'Downtown Store', 'East'),
    ('Store2', 'Mall Store', 'West'),
    ('Store3', 'Airport Store', 'North')
""")

# Insert sample product data
spark.sql("""
    INSERT OVERWRITE TABLE curated.dim_product VALUES
    ('Product1', 'Basic Widget', 'Widgets'),
    ('Product2', 'Premium Widget', 'Widgets'),
    ('Product3', 'Super Gadget', 'Gadgets')
""")

# Insert data into fact table from processed zone
spark.sql("""
    INSERT OVERWRITE TABLE curated.fact_sales
    SELECT 
        CAST(hash(concat(sale_date, store_id, product_id, rand())) AS BIGINT) as sales_id,
        sale_date as date_id,
        store_id,
        product_id,
        unit_price,
        quantity,
        total_amount,
        year,
        month
    FROM processed.sales
""")


DataFrame[]

In [62]:
spark.sql("""
    SELECT * FROM curated.dim_store
""").show()

+--------+--------------+------+
|store_id|    store_name|region|
+--------+--------------+------+
|  Store1|Downtown Store|  East|
|  Store3| Airport Store| North|
|  Store2|    Mall Store|  West|
+--------+--------------+------+



In [63]:
databases_to_clear = ["raw", "processed", "curated"]

print("Starting process to drop tables...")

for db_name in databases_to_clear:
    print(f"\n--- Processing database: {db_name} ---")

    try:
        # Kiểm tra xem database có tồn tại không bằng cách thử USE nó
        # Hoặc có thể dùng SHOW DATABASES LIKE 'db_name' nhưng USE đơn giản hơn
        spark.sql(f"USE {db_name}")
        print(f"Switched to database '{db_name}'. Listing tables...")

        # Lấy danh sách các bảng trong database hiện tại
        tables_df = spark.sql(f"SHOW TABLES") # SHOW TABLES sẽ liệt kê trong database đang USE

        # Thu thập tên bảng về Driver (an toàn vì số lượng bảng thường không quá lớn)
        # DataFrame trả về có cột 'tableName'
        table_list = [row.tableName for row in tables_df.select("tableName").collect()]

        if not table_list:
            print(f"No tables found in database '{db_name}'. Skipping.")
            continue

        print(f"Tables found in '{db_name}': {table_list}")

        # Lặp qua từng bảng và thực hiện DROP TABLE
        for table_name in table_list:
            full_table_name = f"{db_name}.{table_name}"
            drop_command = f"DROP TABLE IF EXISTS {full_table_name}"
            print(f"Executing: {drop_command}")
            try:
                spark.sql(drop_command)
                print(f"Successfully dropped table {full_table_name}")
            except Exception as drop_error:
                print(f"ERROR: Failed to drop table {full_table_name}. Error: {drop_error}")

    except AnalysisException as db_error:
        # Bắt lỗi nếu database không tồn tại
        if "database not found" in str(db_error).lower():
             print(f"Database '{db_name}' not found. Skipping.")
        else:
             print(f"An analysis error occurred while processing database '{db_name}'. Error: {db_error}")
    except Exception as e:
        # Bắt các lỗi khác có thể xảy ra
        print(f"An unexpected error occurred while processing database '{db_name}'. Error: {e}")


print("\n--- Process finished ---")

Starting process to drop tables...

--- Processing database: raw ---
Switched to database 'raw'. Listing tables...
No tables found in database 'raw'. Skipping.

--- Processing database: processed ---
Switched to database 'processed'. Listing tables...
Tables found in 'processed': ['sales']
Executing: DROP TABLE IF EXISTS processed.sales
Successfully dropped table processed.sales

--- Processing database: curated ---
Switched to database 'curated'. Listing tables...
Tables found in 'curated': ['dim_date', 'dim_product', 'dim_store', 'fact_sales']
Executing: DROP TABLE IF EXISTS curated.dim_date
Successfully dropped table curated.dim_date
Executing: DROP TABLE IF EXISTS curated.dim_product
Successfully dropped table curated.dim_product
Executing: DROP TABLE IF EXISTS curated.dim_store
Successfully dropped table curated.dim_store
Executing: DROP TABLE IF EXISTS curated.fact_sales
Successfully dropped table curated.fact_sales

--- Process finished ---
