In [0]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (
    col, lower, trim, regexp_replace, when, coalesce, 
    lit, count, countDistinct, avg, sum as _sum, max as _max,
    concat_ws, collect_list, array_distinct, flatten,
    monotonically_increasing_id, row_number, dense_rank,
    length, levenshtein, soundex, split,
    udf, struct, first, last, greatest, upper, concat, abs,substring, expr
)
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructType, StructField,BooleanType
from functools import reduce

In [0]:
def standardize_text(df, col_name):
    """Normalise a free-text column so that equivalent names compare equal.

    The value is lower-cased and trimmed, every character that is neither a
    word character nor whitespace becomes a space, whitespace runs collapse to
    one space, and a few legal-form abbreviations are spelled out.

    Args:
        df: Spark DataFrame containing ``col_name``.
        col_name: Name of the column to rewrite.

    Returns:
        A DataFrame in which ``col_name`` holds the normalised text.
    """
    cleanup_rules = [
        (r'[^\w\s]', ' '),   # punctuation -> space
        (r'\s+', ' '),       # collapse whitespace runs
    ]
    # Applied in this order so "pte ltd" is expanded before the bare "ltd" rule.
    abbreviation_rules = [
        (r'\bpte\s*ltd\b', 'private limited'),
        (r'\bltd\b', 'limited'),
        (r'\bllp\b', 'limited liability partnership'),
    ]

    text = lower(trim(col(col_name)))
    for pattern, replacement in cleanup_rules:
        text = regexp_replace(text, pattern, replacement)
    text = trim(text)
    for pattern, expansion in abbreviation_rules:
        text = regexp_replace(text, pattern, expansion)

    return df.withColumn(col_name, text)

def standardize_url(df, col_name):
    """Normalise a URL column for matching.

    Lower-cases and trims the value, then removes a leading ``http://`` or
    ``https://``, a leading ``www.`` and a single trailing slash.

    Args:
        df: Spark DataFrame containing ``col_name``.
        col_name: Name of the URL column to rewrite.

    Returns:
        A DataFrame in which ``col_name`` holds the normalised URL.
    """
    url = lower(trim(col(col_name)))
    # Order matters: the scheme has to go before "www." can be at the start.
    for pattern in (r'^https?://', r'^www\.', r'/$'):
        url = regexp_replace(url, pattern, '')
    return df.withColumn(col_name, url)

def standardize_uen(df, col_name):
    """Normalise a UEN column: trim, upper-case, drop non-word characters.

    Args:
        df: Spark DataFrame containing ``col_name``.
        col_name: Name of the UEN column to rewrite.

    Returns:
        A DataFrame in which ``col_name`` holds the normalised UEN.
    """
    uen_upper = upper(trim(col(col_name)))
    uen_clean = regexp_replace(uen_upper, r'[^\w]', '')
    return df.withColumn(col_name, uen_clean)

def standardize_phone(df, col_name):
    """Normalise a phone column to the ``+65XXXXXXXX`` form or NULL.

    Steps, in order:
      1. Strip everything that is not a digit.
      2. Prefix 8-digit values with ``+65``.
      3. Prefix 10-digit values that start with ``65`` with ``+``.
      4. Keep only values matching ``+65`` followed by exactly 8 digits;
         anything else becomes NULL.

    Args:
        df: Spark DataFrame containing ``col_name``.
        col_name: Name of the phone column to rewrite.

    Returns:
        A DataFrame in which ``col_name`` holds the normalised number or NULL.
    """
    digits_only = regexp_replace(col(col_name), r'[^0-9]', '')

    # 8-digit SG numbers → +65XXXXXXXX
    with_country_code = (
        when(length(digits_only) == 8, concat(lit("+65"), digits_only))
        .otherwise(digits_only)
    )

    # 10-digit numbers with 65 prefix → +65XXXXXXXX
    has_bare_prefix = (length(with_country_code) == 10) & with_country_code.startswith("65")
    with_plus = (
        when(has_bare_prefix, concat(lit("+"), with_country_code))
        .otherwise(with_country_code)
    )

    # Final validation → keep only valid SG numbers
    validated = when(with_plus.rlike(r'^\+65[0-9]{8}$'), with_plus).otherwise(None)

    return df.withColumn(col_name, validated)

# Scraped Websites

In [0]:
# Bronze-container CSV that the scraped-website data is loaded from.
bronze_path_scrape_websites = "abfss://bronze@singaporecomadls.dfs.core.windows.net/scrape_websites/scraped_websites.csv"
# Silver-container target path; not referenced again in the cells visible here.
silver_path_scrape_websites = "abfss://silver@singaporecomadls.dfs.core.windows.net/unified_data/"

In [0]:
# Explicit read schema for the scraped-websites CSV: every field is a
# nullable string, declared in this order.
scrape_websites_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "uen",
        "company_name",
        "website",
        "linkedin",
        "facebook",
        "instagram",
        "contact_email",
        "contact_phone",
        "keywords",
        "scrape_status",
        "html_saved",
        "html_size",
        "error",
        "scrape_time",
    )
])

In [0]:
# Read the bronze CSV with a header row and the explicit schema, then show it.
scraped_websites_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(scrape_websites_schema)
    .load(bronze_path_scrape_websites)
)
display(scraped_websites_df)

In [0]:
# Keep only rows whose scrape_status is exactly "success".
is_successful_scrape = col("scrape_status") == "success"
scraped_websites_df = scraped_websites_df.where(is_successful_scrape)
display(scraped_websites_df)

In [0]:
# Project the scraped data onto the columns used downstream, renamed with a
# "scraped_" prefix (the UEN becomes the join key "uen_match"), and drop rows
# without a UEN.
scraped_prepared = (
    scraped_websites_df
    .where(col("scrape_status") == "success")
    .select(*[
        col(source_name).alias(target_name)
        for source_name, target_name in [
            ("uen", "uen_match"),
            ("company_name", "scraped_company_name"),
            ("linkedin", "scraped_linkedin"),
            ("facebook", "scraped_facebook"),
            ("instagram", "scraped_instagram"),
            ("contact_email", "scraped_email"),
            ("contact_phone", "scraped_phone"),
            ("keywords", "scraped_keywords"),
        ]
    ])
    .where(col("uen_match").isNotNull())
)
display(scraped_prepared)

In [0]:
# Standardise the join key, name, social URLs and phone, then keep one row
# per UEN (dropDuplicates keeps an arbitrary row for each key).
scraped_prepared = standardize_text(
    standardize_uen(scraped_prepared, "uen_match"),
    "scraped_company_name",
)
scraped_prepared = reduce(
    standardize_url,
    ["scraped_linkedin", "scraped_facebook", "scraped_instagram"],
    scraped_prepared,
)
scraped_prepared = (
    standardize_phone(scraped_prepared, "scraped_phone")
    .dropDuplicates(["uen_match"])
)
display(scraped_prepared)

# ACRA DATA

In [0]:
# Bronze-container CSV that the ACRA data is loaded from.
bronze_path_acra = "abfss://bronze@singaporecomadls.dfs.core.windows.net/acra/acra_data.csv"
# Silver-container target path; not referenced again in the cells visible here.
silver_path_acra = "abfss://silver@singaporecomadls.dfs.core.windows.net/unified_data/"

In [0]:
# Explicit read schema for the ACRA CSV: every field is a nullable string,
# declared in this order.
acra_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "uen",
        "issuance_agency_id",
        "entity_name",
        "entity_type_description",
        "business_constitution_description",
        "company_type_description",
        "paf_constitution_description",
        "entity_status_description",
        "registration_incorporation_date",
        "uen_issue_date",
        "address_type",
        "block",
        "street_name",
        "level_no",
        "unit_no",
        "building_name",
        "postal_code",
        "other_address_line1",
        "other_address_line2",
        "account_due_date",
        "annual_return_date",
        "primary_ssic_code",
        "primary_ssic_description",
        "primary_user_described_activity",
        "secondary_ssic_code",
        "secondary_ssic_description",
        "secondary_user_described_activity",
        "no_of_officers",
    )
])

In [0]:
# Read the ACRA bronze CSV with a header row and the explicit schema, then show it.
acra_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(acra_schema)
    .load(bronze_path_acra)
)
display(acra_df)

In [0]:
def safe(c):
    """Return column ``c`` with blank and "NA" placeholders mapped to NULL.

    A value is treated as a placeholder when it is NULL, or when, after
    trimming surrounding whitespace, it is empty or equals "na" in any letter
    case. The "na" comparison is done on the trimmed value so that padded
    placeholders such as " NA " are caught the same way padded blanks are.

    Args:
        c: Name of the column to clean.

    Returns:
        A Column expression that is NULL for placeholders and the original
        (untrimmed) value otherwise.
    """
    trimmed = trim(col(c))
    is_placeholder = (
        col(c).isNull()
        | (trimmed == "")
        | (lower(trimmed) == "na")
    )
    return when(is_placeholder, None).otherwise(col(c))

In [0]:
# Address is assembled from the cleaned parts; concat_ws leaves out NULL parts.
acra_address_parts = [
    safe(part_name)
    for part_name in ("block", "street_name", "building_name", "postal_code")
]

# Project ACRA onto the columns used downstream and drop rows without a UEN.
# NOTE(review): "founding_year" carries the raw registration_incorporation_date
# string; no year is extracted in this cell.
acra_prepared = (
    acra_df
    .select(
        col("uen"),
        col("entity_name").alias("company_name"),
        col("entity_type_description"),
        col("entity_status_description"),
        col("registration_incorporation_date").alias("founding_year"),
        col("primary_ssic_code").alias("industry_code"),
        col("primary_ssic_description").alias("industry_description"),
        col("secondary_ssic_code"),
        col("secondary_ssic_description"),
        col("no_of_officers"),
        concat_ws(", ", *acra_address_parts).alias("address"),
    )
    .where(col("uen").isNotNull())
)

In [0]:
# Standardise the UEN and company name, then keep one row per UEN
# (dropDuplicates keeps an arbitrary row for each key).
acra_prepared = standardize_text(
    standardize_uen(acra_prepared, "uen"),
    "company_name",
).dropDuplicates(["uen"])

In [0]:
# Report how many ACRA rows remain after de-duplication (count() triggers a
# Spark job), then show the prepared frame.
print(f"✓ ACRA records: {acra_prepared.count():,}")
display(acra_prepared)

# record_owl 

In [0]:
# Bronze-container CSV that the Record Owl data is loaded from.
bronze_path_recordowl = "abfss://bronze@singaporecomadls.dfs.core.windows.net/recordowld/recordowl.csv"
# Silver-container target path; not referenced again in the cells visible here.
silver_path_recordowl = "abfss://silver@singaporecomadls.dfs.core.windows.net/unified_data/"

In [0]:
# Explicit read schema for the Record Owl CSV: every field is a nullable
# string, declared in this order.
record_owl_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "uen",
        "company_name",
        "company_link",
        "registration_number",
        "registered_address",
        "operating_status",
        "company_age",
        "building",
        "contact_number",
        "website",
        "description",
        "primary_ssic_code",
        "primary_industry",
        "secondary_ssic_code",
        "secondary_industry",
        "company_founder",
        "facebook",
        "linkedin",
        "twitter",
        "instagram",
        "youtube",
        "tiktok",
        "pinterest",
    )
])

In [0]:
# Read the Record Owl bronze CSV with a header row and the explicit schema, then show it.
record_owl_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(record_owl_schema)
    .load(bronze_path_recordowl)
)
display(record_owl_df)

In [0]:
# Project Record Owl onto the columns used downstream (the UEN becomes the
# join key "uen_match") and drop rows without a UEN.
record_owl_prepared = (
    record_owl_df
    .select(*[
        col(source_name).alias(target_name)
        for source_name, target_name in [
            ("uen", "uen_match"),
            ("company_name", "owl_company_name"),
            ("website", "company_website"),  # PRIMARY website source!
            ("linkedin", "linkedin_url"),
            ("facebook", "facebook_url"),
            ("instagram", "instagram_url"),
            ("contact_number", "phone_number"),
            ("description", "company_description"),
            ("primary_ssic_code", "owl_ssic_code"),
            ("primary_industry", "owl_industry"),
            ("secondary_ssic_code", "owl_secondary_ssic"),
            ("secondary_industry", "owl_secondary_industry"),
        ]
    ])
    .where(col("uen_match").isNotNull())
)
display(record_owl_prepared)

In [0]:
# Standardise the join key, name, URLs and phone, then keep one row per UEN
# (dropDuplicates keeps an arbitrary row for each key).
record_owl_prepared = standardize_text(
    standardize_uen(record_owl_prepared, "uen_match"),
    "owl_company_name",
)
record_owl_prepared = reduce(
    standardize_url,
    ["company_website", "linkedin_url", "facebook_url", "instagram_url"],
    record_owl_prepared,
)
record_owl_prepared = (
    standardize_phone(record_owl_prepared, "phone_number")
    .dropDuplicates(["uen_match"])
)
display(record_owl_prepared)

# Companies_SG

In [0]:
# Bronze-container CSV that the Companies SG data is loaded from.
bronze_path_companiessg = "abfss://bronze@singaporecomadls.dfs.core.windows.net/companies_sg/companies_sg_data.csv"

In [0]:
# Explicit read schema for the Companies SG CSV: every field is a nullable
# string, declared in this order. Field names contain spaces.
companiessg = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "Entity Name",
        "UEN",
        "Registration Incorporation Date",
        "Company Type Description",
        "Entity Status Description",
        "Entity Type Description",
        "URL",
    )
])

In [0]:
# Read the Companies SG bronze CSV with a header row and the explicit schema, then show it.
companies_sg_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(companiessg)
    .load(bronze_path_companiessg)
)
display(companies_sg_df)

In [0]:
# Project Companies SG onto the columns used downstream (the URL and entity
# type columns are not carried over) and drop rows without a UEN.
companies_sg_prepared = (
    companies_sg_df
    .select(*[
        col(source_name).alias(target_name)
        for source_name, target_name in [
            ("UEN", "uen_match"),
            ("Entity Name", "sg_company_name"),
            ("Registration Incorporation Date", "sg_reg_date"),
            ("Company Type Description", "sg_company_type"),
            ("Entity Status Description", "sg_entity_status"),
        ]
    ])
    .where(col("uen_match").isNotNull())
)

# Standardise key and name, then keep one (arbitrary) row per UEN.
companies_sg_prepared = standardize_text(
    standardize_uen(companies_sg_prepared, "uen_match"),
    "sg_company_name",
).dropDuplicates(["uen_match"])
display(companies_sg_prepared)

# Stocks

In [0]:
# Bronze-container CSV that the extracted SGX stock data is loaded from.
bronze_path_stocks = "abfss://bronze@singaporecomadls.dfs.core.windows.net/stocks/sgx_stocks_extracted.csv"

In [0]:
# Explicit read schema for the stocks CSV: every field, including the numeric
# looking ones, is read as a nullable string in this order.
stocks_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "symbol",
        "company_name",
        "market_cap",
        "stock_price",
        "percent_change",
        "revenue",
    )
])

In [0]:
# Read the stocks bronze CSV with a header row and the explicit schema, then show it.
stocks_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(stocks_schema)
    .load(bronze_path_stocks)
)
display(stocks_df)

In [0]:
# Rename the symbol and name columns with a "stock_" prefix, keep the value
# columns as they are, and standardise the company name for fuzzy matching.
stocks_selected = stocks_df.select(
    col("symbol").alias("stock_symbol"),
    col("company_name").alias("stock_company_name"),
    *[col(value_name) for value_name in ("market_cap", "revenue", "stock_price", "percent_change")],
)

stocks_prepared = standardize_text(stocks_selected, "stock_company_name")
display(stocks_prepared)

In [0]:
# NOTE: each source is already de-duplicated on its UEN key in its own
# standardisation cell above, so these calls are kept disabled.
# acra_prepared = acra_prepared.dropDuplicates(["uen"])
# record_owl_prepared = record_owl_prepared.dropDuplicates(["uen_match"])
# scraped_prepared = scraped_prepared.dropDuplicates(["uen_match"])
# companies_sg_prepared = companies_sg_prepared.dropDuplicates(["uen_match"])


In [0]:
# The unified frame starts as the prepared ACRA records; later cells join
# the other sources onto it by UEN.
unified = acra_prepared

In [0]:
# Left-join the Companies SG columns onto the unified frame by UEN and drop
# the helper key afterwards. Re-running this cell joins the columns again.
sg_join_condition = unified.uen == companies_sg_prepared.uen_match
unified = (
    unified
    .join(companies_sg_prepared, on=sg_join_condition, how="left")
    .drop("uen_match")
)

print(f"After Companies SG join: {unified.count():,} records")
# print(f"Records enriched with sg_url: {unified.filter(col('sg_url').isNotNull()).count():,}")

In [0]:
# Show the unified frame after the Companies SG join.
display(unified)

In [0]:
# Left-join the Record Owl columns onto the unified frame by UEN and drop
# the helper key afterwards. Re-running this cell joins the columns again.
owl_join_condition = unified.uen == record_owl_prepared.uen_match
unified = (
    unified
    .join(record_owl_prepared, on=owl_join_condition, how="left")
    .drop("uen_match")
)

print(f"After Record Owl join: {unified.count():,} records")
print(f"Records enriched with company_website: {unified.filter(col('company_website').isNotNull()).count():,}")

In [0]:
# Show the unified frame after the Record Owl join.
display(unified)

In [0]:
# Left-join the scraped-website columns onto the unified frame by UEN and
# drop the helper key afterwards. Re-running this cell joins the columns again.
scraped_join_condition = unified.uen == scraped_prepared.uen_match
unified = (
    unified
    .join(scraped_prepared, on=scraped_join_condition, how="left")
    .drop("uen_match")
)

print(f"After Scraped Websites join: {unified.count():,} records")

In [0]:
# Show the unified frame after the scraped-websites join.
display(unified)

In [0]:
from pyspark.sql.functions import broadcast
# Candidate pairs for fuzzy matching: a stock and a company are paired when
# the first four characters of their lower-cased names are equal. The stocks
# frame is marked for broadcast.
stock_name_prefix = lower(substring(col("s.stock_company_name"), 1, 4))
company_name_prefix = lower(substring(col("c.company_name"), 1, 4))

filtered = (
    broadcast(stocks_prepared).alias("s")
    .join(
        unified.select("uen", "company_name").alias("c"),
        on=stock_name_prefix == company_name_prefix,
        how="inner",
    )
)


In [0]:
# similarity_score = 1 - levenshtein distance / length of the longer name.
# Only pairs scoring above 0.85 are kept.
name_edit_distance = levenshtein(col("stock_company_name"), col("company_name"))
longer_name_length = greatest(length(col("stock_company_name")), length(col("company_name")))

filtered = (
    filtered
    .withColumn("similarity_score", 1 - (name_edit_distance / longer_name_length))
    .where(col("similarity_score") > 0.85)
)

In [0]:
# Show the stock/company candidate pairs that passed the similarity threshold.
display(filtered)

In [0]:
# stock_matches.persist()

In [0]:
# Keep best match per stock. Ties on similarity_score are broken by uen so
# the chosen row is the same on every run.
window = Window.partitionBy("stock_symbol").orderBy(
    col("similarity_score").desc(), col("uen")
)
best_stock_matches = filtered.withColumn(
    "rank",
    row_number().over(window)
).filter(col("rank") == 1)

# Several stock symbols can pick the same company. Keep only the
# best-scoring stock per uen; otherwise the later left join on "uen" would
# emit more than one row for that company.
uen_window = Window.partitionBy("uen").orderBy(
    col("similarity_score").desc(), col("stock_symbol")
)
best_stock_matches = (
    best_stock_matches
    .withColumn("uen_rank", row_number().over(uen_window))
    .filter(col("uen_rank") == 1)
    .drop("uen_rank")
)

In [0]:
# Show the selected stock-to-company matches.
display(best_stock_matches)

In [0]:
# Attach the matched stock columns to the unified frame by UEN (left join,
# so companies without a stock match keep NULLs in these columns).
stock_join_columns = [
    "uen",
    "stock_symbol",
    "market_cap",
    "revenue",
    "stock_price",
    "percent_change",
]
stocks_to_join = best_stock_matches.select(*stock_join_columns)

unified = unified.join(stocks_to_join, on="uen", how="left")

print(f"After stock data join: {unified.count():,} records")
print(f"Records with stock data: {unified.filter(col('stock_symbol').isNotNull()).count():,}")

In [0]:
# unified.join(stocks_to_join, "uen", "inner").count()

In [0]:
# Show the unified frame after the stock join.
display(unified)

In [0]:
# Exploratory check: list the distinct percent_change values present after the
# stock join (show() prints only the first rows of the result).
unified.select('percent_change').distinct().show()

In [0]:
# Build the final unified projection. The order of the expressions below is
# the column order of the resulting DataFrame.
final_unified = unified.select(
    # ===== CORE IDENTIFIERS =====
    col("uen"),
    
    # Company name: first non-null of ACRA > Record Owl > Scraped > Companies SG.
    # NOTE(review): an earlier comment listed Companies SG ahead of Scraped;
    # the order documented here is what the code below does - confirm intent.
    coalesce(
        col("company_name"),
        col("owl_company_name"),
        col("scraped_company_name"),
        col("sg_company_name"),
    ).alias("company_name"),
    
    # ===== WEBSITE =====
    # Only the Record Owl website column is used; no scraped or Companies SG
    # URL is part of this coalesce.
    coalesce(
        col("company_website")      # From Record Owl - PRIMARY source!
    ).alias("website"),
    
    # ===== LOCATION =====
    # Constant value for every row.
    lit("Singapore").alias("hq_country"),
    
    # Number of locations (constant 1, can enhance with postal code analysis)
    lit(1).alias("no_of_locations_in_singapore"),
    
    # ===== SOCIAL MEDIA =====
    # Priority: Record Owl > Scraped
    coalesce(col("linkedin_url"), col("scraped_linkedin")).alias("linkedin"),
    coalesce(col("facebook_url"), col("scraped_facebook")).alias("facebook"),
    coalesce(col("instagram_url"), col("scraped_instagram")).alias("instagram"),
    
    # ===== INDUSTRY =====
    # Readable descriptions first, SSIC codes only as a fallback.
    coalesce(
    col("industry_description"),   # ACRA readable description
    col("owl_industry"),           # RecordOwl readable industry
    col("industry_code"),          # fallback to ACRA SSIC code
    col("owl_ssic_code")           # fallback to RecordOwl SSIC code
    ).alias("industry"),

# (optional) expose SSIC code separately for analysis
    coalesce(
        col("industry_code"),
        col("owl_ssic_code")
    ).alias("industry_code_final"),
    
    
    # ===== COMPANY SIZE INFORMATION =====
    # Passed through as read from ACRA (a string at this point).
    col("no_of_officers"),
    
    # # Company size (categorical) - Based on number of officers/employees
    # when(col("number_of_officers").isNull(), "Unknown")
    # .when(col("number_of_officers") < 50, "Small")
    # .when(col("number_of_officers") < 250, "Medium")
    # .otherwise("Large")
    # .alias("company_size"),
    
    # ===== STOCK INFORMATION =====
    # NULL when the company has no matched stock; True when percent_change is
    # NULL or "-"; False otherwise.
    when(col("stock_symbol").isNull(), None)
    .when(col("percent_change").isNull(), True)
    .when(trim(col("percent_change")) == "-", True)  # "-" percent change is treated as delisted
    .otherwise(False)
    .alias("is_it_delisted"),
    
    col("stock_symbol").alias("stock_exchange_code"),
    col("revenue"),
    col("market_cap"),
    col('stock_price'),

    # ===== OTHER CORE FIELDS =====
    # NOTE(review): this column is the ACRA registration/incorporation date
    # string renamed upstream; no year extraction is visible - confirm.
    col("founding_year"),
    
    # Contact information (email comes from the scraped data only)
    col("scraped_email").alias("contact_email"),
    coalesce(col("phone_number"), col("scraped_phone")).alias("contact_phone"),
    
    
    # Products: primary industry description (ACRA, then Record Owl).
    coalesce(
    col("industry_description"),
    col("owl_industry")
).alias("products_offered"),
    
    # Services: secondary industry description (ACRA, then Record Owl).
    coalesce(
        col("secondary_ssic_description"),
        col("owl_secondary_industry")
    ).alias("services_offered"),
    
    # Keywords
    col("scraped_keywords").alias("keywords"),
    
    # ===== METADATA & ADDITIONAL FIELDS =====
    col("entity_status_description"),
    col("address"),
    col("company_description"),
    
    # Company type: ACRA entity type first, Companies SG company type as fallback
    coalesce(
    col("entity_type_description"),   # Legal type (ACRA)
    col("sg_company_type")            # Secondary source (SG)
    ).alias("company_type"),
    
    # # Keep source fields for tracking
    # col("acra_entity_type"),
    # col("sg_company_type")
)

In [0]:
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.types import DoubleType

def safe_numeric_cast(df, columns):
    """Parse semi-structured numeric strings into DoubleType columns.

    Handles values such as '32.4M', '1.2b', '450K', '12,345' and 'USD 3.2B'.
    For each listed column that exists in ``df``:

    1. Commas and all whitespace are removed.
    2. Currency markers are removed as whole tokens ('USD', 'US$', 'S$',
       '$', '€', '₹'), case-insensitively. Arbitrary letters u/s/d are no
       longer deleted one by one.
    3. A single trailing K/M/B suffix (case-insensitive) is rewritten to the
       exponent e3/e6/e9. Only a suffix at the very end counts, so a stray
       letter in the middle of a value cannot be turned into an exponent.
    4. The result is cast to DoubleType; values Spark cannot parse are
       expected to become NULL.

    Args:
        df: Spark DataFrame to transform.
        columns: Iterable of column names; names missing from ``df`` are skipped.

    Returns:
        The DataFrame with the listed columns replaced by their double values.
    """
    for col_name in columns:
        if col_name not in df.columns:
            continue

        value = col(col_name)
        # Remove thousands separators and whitespace.
        value = regexp_replace(value, r"[,\s]", "")
        # Remove currency markers; longer tokens come first in the alternation
        # so "US$" and "S$" are removed whole rather than leaving a letter behind.
        value = regexp_replace(value, r"(?i)usd|us\$|s\$|[$€₹]", "")
        # Rewrite a trailing magnitude suffix to scientific notation.
        value = regexp_replace(value, r"(?i)k$", "e3")
        value = regexp_replace(value, r"(?i)m$", "e6")
        value = regexp_replace(value, r"(?i)b$", "e9")

        df = df.withColumn(col_name, value.cast(DoubleType()))
    return df



In [0]:
# Stock value columns that arrive as strings and are converted to doubles.
numeric_columns = ["revenue", "market_cap", "stock_price"]

# Replaces the three columns in place on final_unified.
final_unified = safe_numeric_cast(final_unified, numeric_columns)

In [0]:
# Show the final frame after the numeric cast.
display(final_unified)

In [0]:
# Print the column names and types of the final frame for a manual check.
final_unified.printSchema()


In [0]:
from pyspark.sql.functions import when, col, trim, lower

# Turn placeholder strings ("na", "n/a", "-"; case-insensitive, surrounding
# whitespace ignored) into NULL. Only string-typed columns are rewritten:
# numeric and boolean columns cannot hold these tokens, and passing them
# through untouched avoids implicit casts to string. Column order is kept.
placeholder_string_columns = {
    column_name
    for column_name, type_name in final_unified.dtypes
    if type_name == "string"
}

final_unified = final_unified.select(
    *[
        when(
            trim(lower(col(c))).isin("na", "n/a", "-"),
            None
        ).otherwise(col(c)).alias(c)
        if c in placeholder_string_columns
        else col(c)
        for c in final_unified.columns
    ]
)


In [0]:
# Show the final frame after placeholder values were replaced with NULL.
display(final_unified)

In [0]:
# Target schema for the silver table; every field is nullable (the
# StructField default).
# NOTE(review): this schema is not applied to final_unified in the cells
# visible here, and it declares no_of_officers and founding_year as integers
# while both are carried as strings upstream - confirm before relying on it.
silver_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("uen", StringType()),
        ("company_name", StringType()),
        ("website", StringType()),
        ("hq_country", StringType()),
        ("no_of_locations_in_singapore", IntegerType()),
        ("linkedin", StringType()),
        ("facebook", StringType()),
        ("instagram", StringType()),
        ("industry", StringType()),
        ("industry_code_final", StringType()),
        ("no_of_officers", IntegerType()),
        ("is_it_delisted", BooleanType()),
        ("stock_exchange_code", StringType()),
        ("revenue", DoubleType()),
        ("market_cap", DoubleType()),
        ("stock_price", DoubleType()),
        ("founding_year", IntegerType()),
        ("contact_email", StringType()),
        ("contact_phone", StringType()),
        ("products_offered", StringType()),
        ("services_offered", StringType()),
        ("keywords", StringType()),
        ("entity_status_description", StringType()),
        ("address", StringType()),
        ("company_description", StringType()),
        ("company_type", StringType()),
    ]
])

In [0]:
from pyspark.sql.functions import current_timestamp

# Stamp every row with the load time in both audit columns.
# NOTE(review): the table is written with mode "overwrite" later in this
# notebook, so created_at is reset on every run - confirm that is intended.
load_timestamp = current_timestamp()
final_unified = final_unified.withColumn("created_at", load_timestamp)
final_unified = final_unified.withColumn("updated_at", load_timestamp)


In [0]:
# Show the final frame including the created_at / updated_at columns.
display(final_unified)

In [0]:
# Write the unified frame as Delta to the silver container, replacing both
# the existing data and its schema.
silver_path_unified = "abfss://silver@singaporecomadls.dfs.core.windows.net/unified_data/"
(
    final_unified.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(silver_path_unified)
)

In [0]:
# Register the Delta location as a table if it does not exist yet.
create_table_statement = f"""
CREATE TABLE IF NOT EXISTS silver.unified.unified_companies
USING DELTA
LOCATION '{silver_path_unified}'
"""
spark.sql(create_table_statement)

In [0]:
# spark.sql("ALTER TABLE silver.unified.firmable_unified_companies RENAME TO silver.unified.unified_companies")

In [0]:
%sql
-- Inspect the registered unified companies table.
SELECT * FROM silver.unified.unified_companies

In [0]:
# data_completeness_score = percentage of these eight fields that are non-null.
# It is added after the Delta write above, so it exists only on the in-memory frame.
completeness_fields = [
    "website",
    "linkedin",
    "facebook",
    "instagram",
    "contact_email",
    "contact_phone",
    "revenue",
    "keywords",
]
presence_flags = [
    when(col(field_name).isNotNull(), 1).otherwise(0)
    for field_name in completeness_fields
]
filled_field_count = reduce(lambda left, right: left + right, presence_flags)

final_unified = final_unified.withColumn(
    "data_completeness_score",
    filled_field_count / 8.0 * 100
)

print(f"✓ Final unified records: {final_unified.count():,}")

In [0]:
# Coverage statistics
coverage = final_unified.select(
    count("*").alias("total_companies"),
    countDistinct("uen").alias("unique_uens"),
    (count("website") / count("*") * 100).alias("website_%"),
    (count("linkedin") / count("*") * 100).alias("linkedin_%"),
    (count("facebook") / count("*") * 100).alias("facebook_%"),
    (count("instagram") / count("*") * 100).alias("instagram_%"),
    (count("contact_email") / count("*") * 100).alias("email_%"),
    (count("contact_phone") / count("*") * 100).alias("phone_%"),
    (count("revenue") / count("*") * 100).alias("revenue_%"),
    (count("company_type") / count("*") * 100).alias("company_type_%"),
    avg("data_completeness_score").alias("avg_completeness")
)

In [0]:
# Show the single-row coverage summary.
display(coverage)

In [0]:
import sys
import os
# Make modules in the current working directory importable.
# os.path.join with a single argument returned that argument unchanged, so
# the directory is used directly. The membership check keeps repeated runs
# of this cell from appending the same entry to sys.path again.
project_path = os.getcwd()
if project_path not in sys.path:
    sys.path.append(project_path)

In [0]:
 %run "/Repos/24huda.zaidi@fostiima.org/DE-Singapore-Companies-DB/Pyspark_Notebooks/Cleaning/data_quality_utils"

In [0]:
# MAGIC %run "/Repos/24huda.zaidi@fostiima.org/DE-Singapore-Companies-DB/Pyspark_Notebooks/Cleaning/data_quality_utils"

# COMMAND ----------

# Reload the unified silver Delta data and run it through the data-quality
# helpers. The helper functions are not defined in this notebook; they are
# presumably provided by the data_quality_utils notebook pulled in via %run
# - TODO confirm.
quality_input = spark.read.format("delta").load(
    "abfss://silver@singaporecomadls.dfs.core.windows.net/unified_data/"
)

# Step 1: add the completeness score; Step 2: run the quality checks.
df = run_all_quality_checks(calculate_data_completeness_score(quality_input))

# Step 3: export both reports to the silver container.
export_matching_report(
    df,
    "abfss://silver@singaporecomadls.dfs.core.windows.net/export_matching_report/",
)
export_statistics(
    df,
    "abfss://silver@singaporecomadls.dfs.core.windows.net/export_statistics/",
)
