In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType, BooleanType


In [0]:
# Base location under which every report in this notebook is written
path = "abfss://gold@singaporecomadls.dfs.core.windows.net/reports"

In [0]:
# Source tables for the comparison (row counts below are the author's estimates)
master_df = spark.table("gold.final.master_companies")    # 200K records
recordowl_df = spark.table("silver.clean.recordowl")      # ~4.4K records

In [0]:
# Columns whose completeness is measured and reported in the coverage tables
cols_to_check = [
    "uen",
    "company_name",
    "website",
    "hq_country",
    "no_of_locations_in_singapore",
    "linkedin",
    "facebook",
    "instagram",
    "industry",
    "number_of_employees",
    "company_size",
    "is_it_delisted",
    "stock_exchange_code",
    "revenue",
    "founding_year",
    "products_offered",
    "services_offered",
    "keywords",
]


In [0]:
def completeness(df, cols):
    """Return a one-row DataFrame with the percentage of populated values per column.

    A value counts as populated when it is non-null and, after trimming
    whitespace, is not the empty string.

    Args:
        df: Spark DataFrame to profile.
        cols: List of column names in ``df`` to measure.

    Returns:
        A single-row DataFrame with one column per entry in ``cols``. Each holds
        the populated share of rows as a percentage rounded to 2 decimals, or
        null when ``df`` has no rows.
    """
    # Count rows inside the same aggregation so the data is scanned once,
    # instead of running a separate df.count() job first.
    total_rows = F.count(F.lit(1))

    def populated_pct(name):
        is_populated = F.col(name).isNotNull() & (F.trim(F.col(name)) != "")
        # count() ignores nulls, so only rows that satisfy the condition are counted
        populated = F.count(F.when(is_populated, 1))
        pct = F.round((populated / total_rows) * 100, 2)
        # Guard the empty-frame case so the division never runs with a zero denominator
        return F.when(total_rows > 0, pct).alias(name)

    completeness_exprs = [populated_pct(c) for c in cols]
    # Selecting only aggregate expressions yields a single global-aggregate row
    return df.select(completeness_exprs).limit(1)


In [0]:
# Size of the full master table, printed for reference
master_count = master_df.count()
print(f"✅ Total Master records: {master_count}")

# Percentage of populated values per checked column across all master rows
master_coverage = completeness(df=master_df, cols=cols_to_check)
display(master_coverage)


In [0]:
# NOTE(review): repeats the display(master_coverage) call from the previous cell.
display(master_coverage)


In [0]:
# Keep only the master rows whose UEN also appears in RecordOwl.
# A left-semi join returns each master row at most once; an inner join would
# repeat a master row for every duplicate `uen_match`, inflating the subset
# count and skewing the coverage percentages computed from it.
recordowl_subset = master_df.join(
    recordowl_df.select(F.col("uen_match").alias("uen")),
    on="uen",
    how="left_semi",
)

recordowl_count = recordowl_subset.count()
print(f"✅ RecordOwl subset in Master: {recordowl_count}")


In [0]:
# Same completeness profile, restricted to the master rows matched in RecordOwl
recordowl_coverage = completeness(df=recordowl_subset, cols=cols_to_check)
display(recordowl_coverage)


In [0]:
# Recompute each coverage row and tag it with a dataset label so the two rows
# stay distinguishable once they are combined.
master_coverage = (
    completeness(df=master_df, cols=cols_to_check)
    .withColumn("dataset", F.lit("Master (All 200K Companies)"))
)
recordowl_coverage = (
    completeness(df=recordowl_subset, cols=cols_to_check)
    .withColumn("dataset", F.lit("RecordOwl (4.4K Subset)"))
)


In [0]:
# Stack the two labelled coverage rows and move the dataset label to the front
coverage_comparison = master_coverage.unionByName(recordowl_coverage).select(
    ["dataset"] + cols_to_check
)

display(coverage_comparison)


In [0]:
coverage_summary_path = f"{path}/coverage_summary"

# Overwrite the Delta data at the report location, replacing the schema as well
coverage_comparison.write.save(
    coverage_summary_path,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# Register the location as a catalog table; does nothing if the table already exists
spark.sql(f"""
CREATE TABLE IF NOT EXISTS gold.analytics.coverage_summary
USING DELTA
LOCATION '{coverage_summary_path}'
""")


In [0]:
%sql
-- Read back the coverage summary table registered in the previous cell
SELECT * FROM gold.analytics.coverage_summary

In [0]:
def compute_dq_score(df):
    """Append per-field presence flags and a weighted ``dq_score`` column to ``df``.

    Each ``_has_*`` flag is 1 when the source field is usable and 0 otherwise:

    * ``website``, ``linkedin``, ``facebook`` and ``instagram`` must match the
      URL pattern below.
    * Every other field must be non-null and non-blank after trimming. This is
      the same "populated" rule ``completeness()`` applies, so an empty string
      no longer earns credit.

    ``dq_score`` is the weighted sum of the flags, capped at 100.

    Args:
        df: DataFrame containing the columns ``uen``, ``company_name``,
            ``website``, ``linkedin``, ``facebook``, ``instagram``, ``industry``,
            ``products_offered``, ``services_offered``, ``keywords``,
            ``founding_year`` and ``revenue``.

    Returns:
        ``df`` with the twelve ``_has_*`` integer flag columns and ``dq_score``
        appended.
    """
    website_regex = r'^(https?://)?([A-Za-z0-9.-]+\.[A-Za-z]{2,})(/.*)?$'

    def is_populated(name):
        # Null or whitespace-only values are treated as missing
        return F.col(name).isNotNull() & (F.trim(F.col(name)) != "")

    def matches_url(name):
        # rlike() yields null for null input, which when() treats as "no match"
        return F.col(name).rlike(website_regex)

    # (flag column, presence check, weight), listed in the order the flags are appended
    rules = [
        ("_has_uen", is_populated("uen"), 15),
        ("_has_name", is_populated("company_name"), 15),
        ("_has_website", matches_url("website"), 15),
        ("_has_linkedin", matches_url("linkedin"), 5),
        ("_has_facebook", matches_url("facebook"), 5),
        ("_has_instagram", matches_url("instagram"), 5),
        ("_has_industry", is_populated("industry"), 10),
        ("_has_products", is_populated("products_offered"), 5),
        ("_has_services", is_populated("services_offered"), 5),
        ("_has_keywords", is_populated("keywords"), 5),
        ("_has_founding_year", is_populated("founding_year"), 5),
        ("_has_revenue", is_populated("revenue"), 5),
    ]

    for flag, check, _weight in rules:
        df = df.withColumn(flag, F.when(check, 1).otherwise(0))

    # Weighted DQ Score
    score = None
    for flag, _check, weight in rules:
        term = F.col(flag) * weight
        score = term if score is None else score + term

    # NOTE(review): the weights above total 95, so the cap at 100 never applies
    # with the current table; it only protects against future weight changes.
    return df.withColumn("dq_score", F.least(score, F.lit(100)))


In [0]:
def get_stats(df, dataset_name):
    """Summarise the ``dq_score`` column of ``df`` as a single labelled row.

    Args:
        df: DataFrame containing a ``dq_score`` column.
        dataset_name: Label written to the ``dataset`` column of the result.

    Returns:
        One-row DataFrame holding the label, the record count, and the mean,
        minimum, maximum and 25th/50th/75th percentiles of ``dq_score``
        (the score statistics are rounded to 2 decimals).
    """
    def rounded(column, alias):
        # Every score statistic is reported to two decimals
        return F.round(column, 2).alias(alias)

    aggregations = [
        F.lit(dataset_name).alias("dataset"),
        F.count("*").alias("total_records"),
        rounded(F.avg("dq_score"), "avg_dq_score"),
        rounded(F.min("dq_score"), "min_dq_score"),
        rounded(F.max("dq_score"), "max_dq_score"),
        rounded(F.expr("percentile(dq_score, 0.25)"), "p25_score"),
        rounded(F.expr("percentile(dq_score, 0.5)"), "median_score"),
        rounded(F.expr("percentile(dq_score, 0.75)"), "p75_score"),
    ]
    return df.agg(*aggregations)

# Score both datasets first: get_stats() aggregates the dq_score column that
# compute_dq_score() appends. Without these two assignments the names below are
# undefined on a fresh kernel and the cell fails with NameError.
master_scored = compute_dq_score(master_df)
recordowl_scored = compute_dq_score(recordowl_subset)

record_stats = get_stats(recordowl_scored, "RecordOwl (4.4K Subset)")
master_stats = get_stats(master_scored, "Master (All 200K Companies)")

# Union both
stats_summary = record_stats.unionByName(master_stats)

display(stats_summary)

In [0]:
stats_path = f"{path}/data_quality_stats"

# Overwrite the Delta data at the report location, replacing the schema as well
stats_summary.write.save(
    stats_path,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# Register the location as a catalog table; does nothing if the table already exists
spark.sql(f"""
CREATE TABLE IF NOT EXISTS gold.analytics.data_quality_stats
USING DELTA
LOCATION '{stats_path}'
""")

In [0]:
%sql
-- Read back the data-quality statistics table registered in the previous cell
SELECT * FROM gold.analytics.data_quality_stats

In [0]:
# Five most frequent non-null industries across the full master table
top5_industries_master = (
    master_df
    .where(F.col("industry").isNotNull())
    .groupBy("industry")
    .count()
    .sort(F.col("count").desc())
    .limit(5)
)
display(top5_industries_master)


In [0]:
t5_path = f"{path}/top5_industries_master"

# Overwrite the Delta data at the report location, replacing the schema as well
top5_industries_master.write.save(
    t5_path,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# Register the location as a catalog table; does nothing if the table already exists
spark.sql(f"""
CREATE TABLE IF NOT EXISTS gold.analytics.top5_industries_master
USING DELTA
LOCATION '{t5_path}'
""")

In [0]:
%sql
-- Read back the master top-5 industries table registered in the previous cell
SELECT * FROM gold.analytics.top5_industries_master

In [0]:
from pyspark.sql import functions as F

# Five most frequent non-null industries among the RecordOwl-matched master rows
top5_industries_recordowl = (
    recordowl_subset
    .where(F.col("industry").isNotNull())
    .groupBy("industry")
    .count()
    .sort(F.col("count").desc())
    .limit(5)
)

display(top5_industries_recordowl)


In [0]:
t5_path = f"{path}/top5_industries_recordowl"

# Overwrite the Delta data at the report location, replacing the schema as well
top5_industries_recordowl.write.save(
    t5_path,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# Register the location as a catalog table; does nothing if the table already exists
spark.sql(f"""
CREATE TABLE IF NOT EXISTS gold.analytics.top5_industries_recordowl
USING DELTA
LOCATION '{t5_path}'
""")

In [0]:
%sql
-- Read back the RecordOwl top-5 industries table registered in the previous cell
SELECT * FROM gold.analytics.top5_industries_recordowl