In [1]:
import os

# Print SPARK_HOME as currently set, repoint it at /usr/local/spark,
# then print it again to confirm the override is visible to this process.
print(os.getenv('SPARK_HOME'))
os.environ.update(SPARK_HOME='/usr/local/spark')
print(os.getenv('SPARK_HOME'))

/opt/bitnami/spark
/usr/local/spark


In [45]:
!pip install great_expectations

Collecting great_expectations
  Downloading great_expectations-1.3.12-py3-none-any.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting posthog<4,>3
  Downloading posthog-3.23.0-py2.py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tzlocal>=1.2
  Downloading tzlocal-5.3.1-py3-none-any.whl (18 kB)
Collecting marshmallow<4.0.0,>=3.7.1
  Downloading marshmallow-3.26.1-py3-none-any.whl (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting altair<5.0.0,>=4.2.1
  Downloading altair-4.2.2-py3-none-any.whl (813 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m813.6/813.6 kB[0m [31m108.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydantic>=1.10.7
  Downloading pydantic-2.

In [53]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, explode, lit, array
import time
import os

# Stop a session left over from an earlier run of this cell so that the
# configuration below is applied to a fresh one. Only an *existing* session is
# stopped: nothing is created just to be torn down again.
try:
    _previous_session = SparkSession.getActiveSession()
    if _previous_session is not None:
        _previous_session.stop()
        print("Stopped existing SparkContext")
    else:
        print("No existing SparkContext to stop")
except Exception as e:
    print(f"No existing SparkContext to stop or error occurred: {e}")

# Explicit cluster configuration for the new session (YARN, client deploy mode).
# NOTE(review): spark.driver.memory is set here after the driver-side Python
# process already exists; confirm it actually takes effect in client mode.
_SPARK_SETTINGS = {
    "spark.driver.host": "jupyter",
    "spark.submit.deployMode": "client",
    "spark.driver.memory": "4g",
    "spark.yarn.am.memory": "1g",
    "spark.yarn.am.cores": "1",
    "spark.executor.memory": "4g",
    "spark.executor.cores": "2",
    "spark.default.parallelism": "10",
    "spark.sql.shuffle.partitions": "10",
    "spark.sql.files.maxPartitionBytes": "128m",
    "spark.sql.caseSensitive": "false",
    "spark.sql.warehouse.dir": "hdfs://namenode:9000/user/hive/warehouse",
    "hive.metastore.uris": "thrift://hive-metastore:9083",
}

_builder = SparkSession.builder.appName("Explicit Spark Job Test").master("yarn")
for _setting, _setting_value in _SPARK_SETTINGS.items():
    _builder = _builder.config(_setting, _setting_value)

# Hive support lets the later spark.sql(...) DDL cells talk to the metastore.
spark = _builder.enableHiveSupport().getOrCreate()

print(f"Spark version: {spark.version}")
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")


Stopped existing SparkContext
Spark version: 3.3.0
Spark UI: http://jupyter:4040


In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType, FloatType,
                               ArrayType, LongType, BooleanType, IntegerType,
                               TimestampType)
from pyspark.sql.functions import from_unixtime, year, month, col

HDFS_RAW_FILE = "hdfs:///data/raw/amazon_reviews/Kindle_Store.jsonl"
HDFS_PROCESSED_DIR = "hdfs:///data/processed/amazon_reviews/kindle_store"  # Directory holding the Parquet partitions
HIVE_DATABASE_NAME = "amazon_data"  # Hive database name (to be created if missing)
HIVE_TABLE_NAME = "kindle_reviews_processed"


# --- 1. Explicit schema for the raw review records ---
# * timestamp is a Unix epoch value, so it needs a 64-bit integer (LongType).
#   The sampled values are 13 digits long and the cleaning cell divides by
#   1000, i.e. the unit is milliseconds, not seconds.
# * rating may be integral or fractional; FloatType covers both.
# * images is read as a list of strings (URLs or ids? -- unconfirmed).
schema = StructType([
    StructField("rating", FloatType(), True),
    StructField("title", StringType(), True),
    StructField("text", StringType(), True),
    StructField("images", ArrayType(StringType()), True),  # List of strings
    StructField("asin", StringType(), True),  # Product ID
    StructField("parent_asin", StringType(), True),  # Parent Product ID
    StructField("user_id", StringType(), True),  # User ID
    StructField("timestamp", LongType(), True),  # Unix timestamp (milliseconds)
    StructField("verified_purchase", BooleanType(), True),
    StructField("helpful_vote", IntegerType(), True)
])
print("Schema Defined.")

try:
    df_raw = spark.read.schema(schema).json(HDFS_RAW_FILE)
    print(f"Successfully started reading from: {HDFS_RAW_FILE}")
    df_raw.printSchema()  # Verify the schema that was applied
    print(f"Raw data count: {df_raw.count()}")  # Full scan; can be slow on large data
except Exception as e:
    print(f"Error reading raw data from {HDFS_RAW_FILE}: {e}")
    # Re-raise so the cell stops with the original traceback. The Spark
    # session is deliberately left running: stopping it here would force the
    # whole notebook to be re-initialised just to retry the read.
    raise



Schema Defined.
Successfully started reading from: hdfs:///data/raw/amazon_reviews/Kindle_Store.jsonl
root
 |-- rating: float (nullable = true)
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- images: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- asin: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- timestamp: long (nullable = true)
 |-- verified_purchase: boolean (nullable = true)
 |-- helpful_vote: integer (nullable = true)

Raw data count: 25577616


In [33]:
from pyspark.sql.functions import (
    from_unixtime, year, month, dayofmonth, date_format, coalesce, lit, trim, when,
)

print("============= CLEANING ============\n")


def _is_filled(column_name):
    """Predicate column: the named column is non-null and non-empty once trimmed."""
    return col(column_name).isNotNull() & (trim(col(column_name)) != "")


# Stage 1: drop rows missing text/rating/asin/user_id, trim the string columns,
# default the optional ones, and derive review_time from the epoch value.
df_cleaned = (
    df_raw
    .filter(_is_filled("text"))
    .filter(col("rating").isNotNull())
    .filter(_is_filled("asin"))
    .filter(_is_filled("user_id"))
    .withColumn("title", trim(coalesce(col("title"), lit("[no title]"))))
    .withColumn("text", trim(col("text")))
    .withColumn("asin", trim(col("asin")))
    .withColumn("parent_asin", trim(col("parent_asin")))
    .withColumn("user_id", trim(col("user_id")))
    .withColumn("helpful_vote", coalesce(col("helpful_vote"), lit(0)).cast(IntegerType()))
    .withColumn("images", coalesce(col("images"), array().cast(ArrayType(StringType()))))
    # `timestamp` is divided by 1000 before being handed to from_unixtime.
    .withColumn("review_time", from_unixtime(col("timestamp") / 1000).cast(TimestampType()))
    # Drop rows whose timestamp could not be converted.
    .filter(col("review_time").isNotNull())
)

# Stage 2: calendar columns derived from review_time, keeping only the years
# 1990 through 2025 inclusive.
_review_time = col("review_time")
df_cleaned = (
    df_cleaned
    .withColumn("year", year(_review_time))
    .withColumn("month", month(_review_time))
    .withColumn("day", dayofmonth(_review_time))
    .withColumn("date_str", date_format(_review_time, "yyyy-MM-dd"))
    .filter(col("year").isNotNull() & col("year").between(1990, 2025))
)

df_cleaned.select("timestamp", "review_time", "year", "month", "date_str").show(n=20, truncate=False)


+-------------+-------------------+----+-----+----------+
|timestamp    |review_time        |year|month|date_str  |
+-------------+-------------------+----+-----+----------+
|1427541413000|2015-03-28 11:16:53|2015|3    |2015-03-28|
|1504226946142|2017-09-01 00:49:06|2017|9    |2017-09-01|
|1644883955777|2022-02-15 00:12:35|2022|2    |2022-02-15|
|1363027885000|2013-03-11 18:51:25|2013|3    |2013-03-11|
|1637557512064|2021-11-22 05:05:12|2021|11   |2021-11-22|
|1637134078567|2021-11-17 07:27:58|2021|11   |2021-11-17|
|1632291278732|2021-09-22 06:14:38|2021|9    |2021-09-22|
|1614145710980|2021-02-24 05:48:30|2021|2    |2021-02-24|
|1599452688091|2020-09-07 04:24:48|2020|9    |2020-09-07|
|1574812541555|2019-11-26 23:55:41|2019|11   |2019-11-26|
|1568214752013|2019-09-11 15:12:32|2019|9    |2019-09-11|
|1567293346345|2019-08-31 23:15:46|2019|8    |2019-08-31|
|1566774264228|2019-08-25 23:04:24|2019|8    |2019-08-25|
|1558889036351|2019-05-26 16:43:56|2019|5    |2019-05-26|
|155833333716

In [47]:
import great_expectations as gx
from great_expectations.exceptions import DataContextError
from great_expectations.profile.user_configurable_profiler import UserConfigurableProfiler
from pyspark.sql import SparkSession  # Illustration only; the session `spark` already exists


# Profiles `df_cleaned` with Great Expectations: registers the DataFrame as a
# data asset, lets UserConfigurableProfiler build an expectation suite, runs
# that suite through a checkpoint and builds Data Docs.
#
# NOTE(review): the recorded run of this cell failed at import time with
# "cannot import name 'deprecated' from 'typing_extensions'". The calls below
# (context.sources, UserConfigurableProfiler, SimpleCheckpoint) also appear to
# belong to the pre-1.0 Great Expectations API -- confirm the installed version.
#
# Error handling: every failure is reported and then re-raised so the cell
# stops with the original traceback. The Spark session is left running because
# the remaining pipeline cells depend on it.

print("--- Starting Great Expectations Profiling ---")

# --- 1. Initialize Great Expectations Data Context ---
# The Data Context manages configuration, datasources, suites, checkpoints...
# By default it creates a 'great_expectations' directory structure in the
# current working directory.
try:
    context = gx.get_context()
    print("Existing Great Expectations context loaded.")
except DataContextError:
    context = gx.DataContext.create()
    print("New Great Expectations context created.")
except Exception as e:
    print(f"Error getting or creating Great Expectations context: {e}")
    raise


# --- 2. Add Spark DataFrame as a Datasource and Data Asset ---
# The Datasource defines how GX connects to the data (Spark here).
# The Data Asset represents our specific DataFrame.
datasource_name = "my_spark_datasource"  # Arbitrary name
asset_name = "kindle_reviews_cleaned_asset"  # Arbitrary name

try:
    # Add the Spark Datasource if it does not exist yet
    datasource = context.sources.add_spark(name=datasource_name)
    print(f"Spark Datasource '{datasource_name}' added.")
except DataContextError:
    datasource = context.get_datasource(datasource_name)
    print(f"Spark Datasource '{datasource_name}' already exists, using it.")
except Exception as e:
    print(f"Error adding or getting Spark Datasource: {e}")
    raise

try:
    # Register the DataFrame as a Data Asset on the Datasource.
    # Important: this is where `df_cleaned` is handed to GX.
    data_asset = datasource.add_dataframe_asset(name=asset_name, dataframe=df_cleaned)
    print(f"DataFrame Asset '{asset_name}' added to Datasource '{datasource_name}'.")
except Exception as e:
    print(f"Error adding DataFrame Asset: {e}")
    # On a re-run the old asset may have to be deleted first:
    # context.delete_asset(asset_name=f"{datasource_name}/{asset_name}")
    raise

# --- 3. Create or Get an Expectation Suite ---
# An Expectation Suite stores the expectations about the data.
# The profiler fills this suite in automatically.
expectation_suite_name = "kindle_reviews_cleaned_profiling_suite"  # Arbitrary name

try:
    suite = context.add_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f"New Expectation Suite '{expectation_suite_name}' created.")
except DataContextError:
    suite = context.get_expectation_suite(expectation_suite_name=expectation_suite_name)
    print(f"Expectation Suite '{expectation_suite_name}' already exists, using it.")
    # You may want to clear the old expectations before profiling again:
    # suite.expectations = []
except Exception as e:
    print(f"Error adding or getting Expectation Suite: {e}")
    raise

# --- 4. Run the UserConfigurableProfiler ---
# The profiler scans the Data Asset and generates expectations automatically;
# the profiling computations themselves run on Spark.
print(f"Running UserConfigurableProfiler on asset '{asset_name}'...")
profiler = UserConfigurableProfiler(profile_dataset=data_asset)
suite = profiler.build_suite()  # This is where the profiling actually happens
print("Profiler finished building the suite based on data characteristics.")

# Persist the Expectation Suite that the profiler populated
try:
    context.save_expectation_suite(expectation_suite=suite)
    print(f"Expectation Suite '{expectation_suite_name}' saved with profiled expectations.")
except Exception as e:
    print(f"Error saving Expectation Suite: {e}")
    raise

# --- 5. Configure and Run a Checkpoint ---
# A Checkpoint defines how validation (and the accompanying profiling) is run:
# it runs the Expectation Suite against the Data Asset and produces results.
checkpoint_name = "kindle_reviews_cleaned_profiling_checkpoint"  # Arbitrary name

# Simple Checkpoint configuration
checkpoint_config = {
    "name": checkpoint_name,
    "config_version": 1.0,
    "class_name": "SimpleCheckpoint",  # Simple Checkpoint class
    "run_name_template": "%Y%m%d-%H%M%S-profile-run",  # Name template for each run
    "validations": [  # Validations to run
        {
            "batch_request": data_asset.build_batch_request(),  # Request data from the Data Asset
            "expectation_suite_name": expectation_suite_name,  # Suite just built by the profiler
        }
    ],
    # Actions that store the result and refresh Data Docs after the run
    "action_list": [
        {
            "name": "store_validation_result",
            "action": {"class_name": "StoreValidationResultAction"},
        },
        {
            "name": "update_data_docs",
            "action": {"class_name": "UpdateDataDocsAction", "site_names": []},
        },
        # { # Uncomment to open Data Docs in the browser automatically
        #     "name": "open_data_docs",
        #     "action": {"class_name": "OpenDataDocsAction"}
        # }
    ]
}

# Add or update the Checkpoint in the context
try:
    context.add_or_update_checkpoint(**checkpoint_config)
    print(f"Checkpoint '{checkpoint_name}' added or updated.")
except Exception as e:
    print(f"Error adding or updating Checkpoint: {e}")
    raise

# Run the Checkpoint
print(f"Running Checkpoint '{checkpoint_name}'...")
results = context.run_checkpoint(checkpoint_name=checkpoint_name)
print("Checkpoint run finished.")

if not results["success"]:
    print("Checkpoint run failed or had validation errors.")
    # Inspect the 'results' object or Data Docs for the detailed failures
else:
    print("Checkpoint run succeeded.")

# --- 6. Review Profiling Results in Data Docs ---
# Data Docs (HTML report) is the most convenient way to read the results.
print("\n--- Review Results ---")
print("Building Data Docs...")
try:
    context.build_data_docs()
    print("Data Docs build complete.")
    print(f"To view the profiling results, open the Data Docs HTML file, usually located at: great_expectations/uncommitted/data_docs/local_site/index.html")
    # Or, to open it automatically (requires uncommenting the action above):
    # context.open_data_docs()
except Exception as e:
    # Best effort: a Data Docs failure is reported but does not abort the cell.
    print(f"Error building or opening Data Docs: {e}")

# (Optional) Read metrics programmatically from the results object.
# This is more involved than browsing Data Docs.
try:
    validation_result_identifier = results.list_validation_result_identifiers()[0]
    validation_result = results.get_validation_result(identifier=validation_result_identifier)
    # 'validation_result.results' lists the outcome of each expectation;
    # each entry's 'observed_value' is the computed metric.
    print(f"\nExample metrics from results object (column: rating):")
    for evr in validation_result.results:
        if evr.expectation_config.kwargs.get("column") == "rating":
            metric_name = evr.expectation_config.expectation_type
            observed_value = evr.result.get("observed_value")
            print(f"- Expectation/Metric: {metric_name}, Observed Value: {observed_value}")
except Exception as e:
    # Best effort: metric extraction is optional.
    print(f"Could not extract programmatic metrics: {e}")


print("\n--- Great Expectations Profiling Finished ---")


ImportError: cannot import name 'deprecated' from 'typing_extensions' (/opt/conda/lib/python3.10/site-packages/typing_extensions.py)

In [41]:
from pyspark.sql.functions import col, current_timestamp, length, lit, when
from pyspark.sql.types import StringType


print("=========  VALIDATION ==========\n")

# Validation rules as (failure condition, error label), in priority order.
# A row is labelled with the FIRST rule it violates, so an earlier error is
# never overwritten by a later rule. Rows that violate no rule keep a null
# label and are treated as valid.
VALIDATION_RULES = [
    ((col("rating") < 1.0) | (col("rating") > 5.0), "Invalid Rating"),
    (col("helpful_vote") < 0, "Negative Helpful Vote"),
    # Example check: ASIN must be exactly 10 characters long
    (length(col("asin")) != 10, "Invalid ASIN Length"),
    # Sanity check on the timestamp: it must not lie in the future
    (col("review_time") > current_timestamp(), "Future Timestamp"),
]

# Build one nested CASE expression. Folding from the last rule to the first
# makes the first rule the outermost `when`, i.e. the one evaluated first.
_error_label = lit(None).cast(StringType())
for _failure_condition, _label in reversed(VALIDATION_RULES):
    _error_label = when(_failure_condition, _label).otherwise(_error_label)

df_validated = df_cleaned.withColumn("validation_error", _error_label)

# Split into two DataFrames: valid rows and rejected rows
df_valid = df_validated.filter(col("validation_error").isNull()).drop("validation_error")
df_invalid = df_validated.filter(col("validation_error").isNotNull())


# Persist the rejected rows for later analysis (auxiliary Silver table)
HDFS_VALIDATION_FAILURES_DIR = "hdfs:///data/processed/amazon_reviews/kindle_store_validation_failures"
print(f"Writing validation failures to: {HDFS_VALIDATION_FAILURES_DIR}")
df_invalid.write \
    .partitionBy("year", "month", "validation_error") \
    .mode("overwrite") \
    .parquet(HDFS_VALIDATION_FAILURES_DIR)

Writing validation failures to: hdfs:///data/processed/amazon_reviews/kindle_store_validation_failures


In [42]:
from pyspark.sql.functions import lower

print("============== STANRDALIZATION ===========\n")

# Add a lower-cased "<name>_processed" copy of title and text; the original
# columns stay in place so the two can be compared.
df_standardized = df_valid
for _source_column in ("title", "text"):
    df_standardized = df_standardized.withColumn(
        f"{_source_column}_processed", lower(col(_source_column))
    )

df_standardized.count()




25576202

In [43]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Keep a single review per (user_id, asin): the most recent one.
# review_time is shown with one-second resolution while the raw `timestamp`
# carries extra digits, so `timestamp` is used as a tie-breaker. Without it,
# row_number() may pick a different survivor among same-second reviews on each
# run. Rows with identical `timestamp` values can still tie.
window_spec = Window.partitionBy("user_id", "asin").orderBy(
    col("review_time").desc(),
    col("timestamp").desc(),
)

df_deduplicated = df_standardized.withColumn("rn", row_number().over(window_spec)) \
                               .filter(col("rn") == 1) \
                               .drop("rn")

# count() triggers a full job; be careful with it on large data.
print(f"Deduplication complete. Count after deduplication: {df_deduplicated.count()}")

Deduplication complete. Count after deduplication: 25300905


In [46]:
# Alias the deduplicated frame as the final processed dataset that the write cell selects from.
df_final_processed = df_deduplicated

In [50]:
# Root HDFS directory under which every processed dataset is written
HDFS_PROCESSED_BASE = "hdfs:///data/processed/amazon_reviews"

# Hive database holding the processed tables and views
HIVE_PROCESSED_DB = "processed"

# 1. Main processed reviews table
HDFS_PROCESSED_MAIN_DIR = HDFS_PROCESSED_BASE + "/kindle_store_main"
HIVE_PROCESSED_MAIN_TABLE = "kindle_reviews_main"

# 2. Validation failures table (unchanged)
HDFS_VALIDATION_FAILURES_DIR = HDFS_PROCESSED_BASE + "/kindle_store_validation_failures"
HIVE_VALIDATION_FAILURES_TABLE = "kindle_reviews_validation_failures"

# 3. Product dimension, exposed as a Hive VIEW
HIVE_PRODUCTS_DIM_VIEW = "kindle_products_dim"

# 4. User dimension, exposed as a Hive VIEW
HIVE_USERS_DIM_VIEW = "kindle_users_dim"

In [48]:
# Columns persisted for the main processed dataset. The raw `title`/`text`
# columns are left out; add "title", "text" here to keep them.
_main_output_columns = [
    "rating", "title_processed", "text_processed", "images", "asin",
    "parent_asin", "user_id", "verified_purchase", "helpful_vote",
    "review_time", "year", "month", "day", "date_str",
]
df_to_save = df_final_processed.select(*_main_output_columns)

print(f"Writing main processed data to: {HDFS_PROCESSED_MAIN_DIR}")

# Overwrite the target directory with Parquet files partitioned by year/month.
_main_writer = df_to_save.write.mode("overwrite").partitionBy("year", "month")
_main_writer.parquet(HDFS_PROCESSED_MAIN_DIR)

Writing main processed data to: hdfs:///data/processed/amazon_reviews/kindle_store_main


In [51]:
def _recreate_external_table(table_name, create_sql, label):
    """Drop and re-create an external table in HIVE_PROCESSED_DB, then register its partitions.

    table_name: unqualified table name.
    create_sql: full CREATE EXTERNAL TABLE statement for that table.
    label: human-readable name used in the log messages.

    The DROP runs outside the try block, so a failure there propagates.
    Failures in CREATE / MSCK REPAIR are printed and swallowed (best effort).
    """
    qualified_name = f"{HIVE_PROCESSED_DB}.{table_name}"
    print(f"Creating Hive TABLE: {qualified_name}")
    spark.sql(f"DROP TABLE IF EXISTS {qualified_name}")
    try:
        spark.sql(create_sql)
        print(f"Running MSCK REPAIR TABLE for {table_name}...")
        # Registers the year=/month= directories written by Spark as partitions.
        spark.sql(f"MSCK REPAIR TABLE {qualified_name}")
        print(f"Successfully created and repaired {label} table.")
    except Exception as e:
        print(f"ERROR creating/repairing {label} table: {e}")


def _recreate_view(view_name, create_sql, label):
    """Drop and re-create a view in HIVE_PROCESSED_DB.

    view_name: unqualified view name.
    create_sql: full CREATE VIEW statement.
    label: human-readable name used in the log messages.

    The DROP runs outside the try block; a CREATE failure is printed and
    swallowed (best effort).
    """
    qualified_name = f"{HIVE_PROCESSED_DB}.{view_name}"
    print(f"Creating Hive VIEW: {qualified_name}")
    spark.sql(f"DROP VIEW IF EXISTS {qualified_name}")
    try:
        spark.sql(create_sql)
        print(f"Successfully created {label} dimension VIEW.")
    except Exception as e:
        print(f"ERROR creating {label} dimension VIEW: {e}")


print("\n--- Creating/Updating Hive Tables and VIEWS ---")

spark.sql(f"CREATE DATABASE IF NOT EXISTS {HIVE_PROCESSED_DB}")
print(f"Ensured Hive database '{HIVE_PROCESSED_DB}' exists.")

# External table over the main processed Parquet data.
# `day` is listed because the write cell persists it alongside the other columns.
create_main_table_sql = f"""
CREATE EXTERNAL TABLE {HIVE_PROCESSED_DB}.{HIVE_PROCESSED_MAIN_TABLE} (
    rating FLOAT,
    title_processed STRING,
    text_processed STRING,
    images ARRAY<STRING>,
    asin STRING,
    parent_asin STRING,
    user_id STRING,
    verified_purchase BOOLEAN,
    helpful_vote INT,
    review_time TIMESTAMP,
    day INT,
    date_str STRING
)
PARTITIONED BY (year INT, month INT)
STORED AS PARQUET
LOCATION '{HDFS_PROCESSED_MAIN_DIR}'
TBLPROPERTIES ('parquet.compression'='SNAPPY')
"""
_recreate_external_table(HIVE_PROCESSED_MAIN_TABLE, create_main_table_sql, "main processed")


# External table over the rejected rows written by the validation cell.
# `images` and `day` are listed because the rejected rows carry them too;
# `timestamp` is declared BIGINT, the Hive name for a 64-bit integer.
create_failures_table_sql = f"""
CREATE EXTERNAL TABLE {HIVE_PROCESSED_DB}.{HIVE_VALIDATION_FAILURES_TABLE} (
    rating FLOAT,
    title STRING,
    text STRING,
    images ARRAY<STRING>,
    asin STRING,
    parent_asin STRING,
    user_id STRING,
    verified_purchase BOOLEAN,
    helpful_vote INT,
    timestamp BIGINT,
    review_time TIMESTAMP,
    day INT,
    date_str STRING
)
PARTITIONED BY (year INT, month INT, validation_error STRING)
STORED AS PARQUET
LOCATION '{HDFS_VALIDATION_FAILURES_DIR}'
TBLPROPERTIES ('parquet.compression'='SNAPPY')
"""
_recreate_external_table(
    HIVE_VALIDATION_FAILURES_TABLE, create_failures_table_sql, "validation failures"
)


# Product dimension: one row per (asin, parent_asin) aggregated from the main table.
create_products_view_sql = f"""
CREATE VIEW {HIVE_PROCESSED_DB}.{HIVE_PRODUCTS_DIM_VIEW} AS
SELECT
    asin,
    parent_asin,
    MIN(review_time) AS first_review_time,
    MAX(review_time) AS last_review_time,
    AVG(rating) AS avg_rating,
    COUNT(*) AS total_reviews,
    SUM(helpful_vote) AS total_helpful_votes_received
FROM {HIVE_PROCESSED_DB}.{HIVE_PROCESSED_MAIN_TABLE}
GROUP BY asin, parent_asin
"""
_recreate_view(HIVE_PRODUCTS_DIM_VIEW, create_products_view_sql, "products")


# User dimension: one row per user_id aggregated from the main table.
create_users_view_sql = f"""
CREATE VIEW {HIVE_PROCESSED_DB}.{HIVE_USERS_DIM_VIEW} AS
SELECT
    user_id,
    MIN(review_time) AS first_review_time,
    MAX(review_time) AS last_review_time,
    COUNT(*) AS total_reviews_written,
    AVG(rating) AS avg_rating_given,
    SUM(helpful_vote) AS total_helpful_votes_on_written_reviews
FROM {HIVE_PROCESSED_DB}.{HIVE_PROCESSED_MAIN_TABLE}
GROUP BY user_id
"""
_recreate_view(HIVE_USERS_DIM_VIEW, create_users_view_sql, "users")


print("\n--- Finished Simplified Storage and Hive Table/View Creation (Phase 3) ---")




--- Creating/Updating Hive Tables and VIEWS ---
Ensured Hive database 'processed' exists.
Creating Hive TABLE: processed.kindle_reviews_main
Running MSCK REPAIR TABLE for kindle_reviews_main...
Successfully created and repaired main processed table.
Creating Hive TABLE: processed.kindle_reviews_validation_failures
Running MSCK REPAIR TABLE for kindle_reviews_validation_failures...
Successfully created and repaired validation failures table.
Creating Hive VIEW: processed.kindle_products_dim
Successfully created products dimension VIEW.
Creating Hive VIEW: processed.kindle_users_dim
Successfully created users dimension VIEW.

--- Finished Simplified Storage and Hive Table/View Creation (Phase 3) ---


In [52]:
# Stop the Spark session created earlier in the notebook; no Spark work can run after this cell.
spark.stop()