In [0]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (
    col, lower, trim, regexp_replace, when, coalesce, 
    lit, count, countDistinct, avg, sum as _sum, max as _max,
    concat_ws, collect_list, array_distinct, flatten,
    monotonically_increasing_id, row_number, dense_rank,
    length, levenshtein, soundex, split,
    udf, struct, first, last, greatest, upper, concat, abs,substring, expr,year, to_date, current_timestamp
)
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructType, StructField,BooleanType
from functools import reduce
from delta.tables import DeltaTable

In [0]:
def standardize_text(df, col_name):
    """Normalise a free-text column so equivalent names compare equal.

    The value is lower-cased and trimmed, every character that is neither a
    word character nor whitespace becomes a space, whitespace runs collapse to
    a single space, and common company-suffix abbreviations are expanded.

    Args:
        df: DataFrame that contains ``col_name``.
        col_name: Name of the string column to overwrite.

    Returns:
        DataFrame with ``col_name`` replaced by its standardized text.
    """
    text = lower(trim(col(col_name)))
    text = regexp_replace(text, r'[^\w\s]', ' ')
    text = regexp_replace(text, r'\s+', ' ')
    text = trim(text)

    # Applied in order: "pte ltd" has to be expanded before the bare "ltd" rule.
    abbreviation_rules = (
        (r'\bpte\s*ltd\b', 'private limited'),
        (r'\bltd\b', 'limited'),
        (r'\bllp\b', 'limited liability partnership'),
    )
    for pattern, expansion in abbreviation_rules:
        text = regexp_replace(text, pattern, expansion)

    return df.withColumn(col_name, text)

def standardize_url(df, col_name):
    """Normalise a URL column for matching.

    Lower-cases and trims the value, then removes a leading ``http://`` or
    ``https://`` scheme, a leading ``www.`` and a single trailing slash.

    Args:
        df: DataFrame that contains ``col_name``.
        col_name: Name of the URL column to overwrite.

    Returns:
        DataFrame with ``col_name`` replaced by the normalised URL.
    """
    url = lower(trim(col(col_name)))
    # Order matters: the scheme must be gone before "^www\." can match.
    for removable in (r'^https?://', r'^www\.', r'/$'):
        url = regexp_replace(url, removable, '')
    return df.withColumn(col_name, url)

def standardize_uen(df, col_name):
    """Normalise a UEN column: trim, upper-case and drop non-word characters.

    Args:
        df: DataFrame that contains ``col_name``.
        col_name: Name of the UEN column to overwrite.

    Returns:
        DataFrame with ``col_name`` replaced by the normalised UEN.
    """
    uen = upper(trim(col(col_name)))
    # \w keeps letters, digits and underscore; everything else is removed.
    uen = regexp_replace(uen, r'[^\w]', '')
    return df.withColumn(col_name, uen)

def standardize_phone(df, col_name):
    """Normalise a phone column to ``+65XXXXXXXX`` or NULL.

    All non-digit characters are stripped first. An 8-digit result gets the
    ``+65`` prefix; a 10-digit result that already starts with ``65`` only
    gets the ``+``. Anything that does not end up matching
    ``+65`` followed by exactly eight digits is replaced with NULL.

    Args:
        df: DataFrame that contains ``col_name``.
        col_name: Name of the phone column to overwrite.

    Returns:
        DataFrame with ``col_name`` replaced by the normalised number or NULL.
    """
    digits = regexp_replace(col(col_name), r'[^0-9]', '')

    # The two prefixing rules are mutually exclusive (length 8 vs. length 10),
    # so a single when-chain is equivalent to applying them one after another.
    prefixed = (
        when(length(digits) == 8, concat(lit("+65"), digits))
        .when((length(digits) == 10) & digits.startswith("65"),
              concat(lit("+"), digits))
        .otherwise(digits)
    )

    # Final validation: keep only well-formed Singapore numbers.
    validated = when(prefixed.rlike(r'^\+65[0-9]{8}$'), prefixed).otherwise(None)
    return df.withColumn(col_name, validated)

# Scraped Websites

In [0]:
# Bronze input CSV and silver location for the scraped-websites source.
# NOTE(review): silver_path_scrape_websites is not referenced in the visible
# cells; the merge cell hard-codes the same location without the trailing slash.
bronze_path_scrape_websites = "abfss://bronze@singaporecomadls.dfs.core.windows.net/scrape_websites/scraped_websites.csv"
silver_path_scrape_websites = "abfss://silver@singaporecomadls.dfs.core.windows.net/clean/scrape_websites/"

In [0]:
# Every scraped-websites column is read as a nullable string; field order
# defines the column order applied to the CSV.
scrape_websites_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "uen",
        "company_name",
        "website",
        "linkedin",
        "facebook",
        "instagram",
        "contact_email",
        "contact_phone",
        "keywords",
        "scrape_status",
        "html_saved",
        "html_size",
        "error",
        "scrape_time",
    )
])

In [0]:
# Load the bronze CSV with the explicit schema (header row is skipped, not inferred).
scraped_websites_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(scrape_websites_schema)
    .load(bronze_path_scrape_websites)
)
display(scraped_websites_df)

In [0]:
# Source column -> output column; dict order is the output column order.
scraped_column_aliases = {
    "uen": "uen_match",
    "website": "recordowl_website",
    "company_name": "scraped_company_name",
    "linkedin": "scraped_linkedin",
    "facebook": "scraped_facebook",
    "instagram": "scraped_instagram",
    "contact_email": "scraped_email",
    "contact_phone": "scraped_phone",
    "keywords": "scraped_keywords",
}

# Keep successful scrapes only, rename the columns, and drop rows without a UEN.
scraped_prepared = (
    scraped_websites_df
    .filter(col("scrape_status") == "success")
    .select([
        col(source_name).alias(output_name)
        for source_name, output_name in scraped_column_aliases.items()
    ])
    .filter(col("uen_match").isNotNull())
)
display(scraped_prepared)

In [0]:
# Normalise the join key, name, social URLs and phone, then keep one row per UEN.
scraped_prepared = standardize_uen(scraped_prepared, "uen_match")
scraped_prepared = standardize_text(scraped_prepared, "scraped_company_name")
for social_col in ("scraped_linkedin", "scraped_facebook", "scraped_instagram"):
    scraped_prepared = standardize_url(scraped_prepared, social_col)
scraped_prepared = standardize_phone(scraped_prepared, "scraped_phone")
scraped_prepared = scraped_prepared.dropDuplicates(["uen_match"])
display(scraped_prepared)

In [0]:
# Add a constant source_data column marking these rows as 'scraped'.
scraped_prepared = scraped_prepared.withColumn("source_data",lit('scraped'))
display(scraped_prepared)

In [0]:
target_path = "abfss://silver@singaporecomadls.dfs.core.windows.net/clean/scrape_websites"
scraped_upsert_df = scraped_prepared.withColumn("updated_at", current_timestamp())

# Columns copied straight from the source row on both update and insert.
scraped_payload_cols = [
    "recordowl_website",
    "scraped_company_name",
    "scraped_linkedin",
    "scraped_facebook",
    "scraped_instagram",
    "scraped_email",
    "scraped_phone",
    "scraped_keywords",
    "source_data",
]

# Matched rows: refresh the payload and bump updated_at.
scraped_update_set = {name: f"s.{name}" for name in scraped_payload_cols}
scraped_update_set["updated_at"] = "current_timestamp()"

# New rows: key + payload, with both audit timestamps set to now.
scraped_insert_values = {
    "uen_match": "s.uen_match",
    **{name: f"s.{name}" for name in scraped_payload_cols},
    "created_at": "current_timestamp()",
    "updated_at": "current_timestamp()",
}

# forPath requires an existing Delta table at target_path.
delta_target = DeltaTable.forPath(spark, target_path)

(
    delta_target.alias("t")
    .merge(scraped_upsert_df.alias("s"), "t.uen_match = s.uen_match")
    .whenMatchedUpdate(set=scraped_update_set)
    .whenNotMatchedInsert(values=scraped_insert_values)
    .execute()
)


## Storing Cleaned Scraped Data

In [0]:
%sql
-- Preview the cleaned scraped-websites table.
-- NOTE(review): the table name looks misspelled ("scrapped_wesbites"); confirm
-- it matches the name the table is actually registered under before changing it.
SELECT * from silver.clean.scrapped_wesbites

# ACRA Data

In [0]:
# Bronze input CSV and silver location for the ACRA source.
# NOTE(review): silver_path_acra is not referenced in the visible cells; the
# ACRA merge cell hard-codes the same path as target_path.
bronze_path_acra = "abfss://bronze@singaporecomadls.dfs.core.windows.net/acra/acra_data.csv"
silver_path_acra = "abfss://silver@singaporecomadls.dfs.core.windows.net/clean/acra"

In [0]:
# Every ACRA column is read as a nullable string; field order defines the
# column order applied to the CSV.
acra_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "uen",
        "issuance_agency_id",
        "entity_name",
        "entity_type_description",
        "business_constitution_description",
        "company_type_description",
        "paf_constitution_description",
        "entity_status_description",
        "registration_incorporation_date",
        "uen_issue_date",
        "address_type",
        "block",
        "street_name",
        "level_no",
        "unit_no",
        "building_name",
        "postal_code",
        "other_address_line1",
        "other_address_line2",
        "account_due_date",
        "annual_return_date",
        "primary_ssic_code",
        "primary_ssic_description",
        "primary_user_described_activity",
        "secondary_ssic_code",
        "secondary_ssic_description",
        "secondary_user_described_activity",
        "no_of_officers",
    )
])

In [0]:
# Load the bronze ACRA CSV with the explicit all-string schema.
acra_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(acra_schema)
    .load(bronze_path_acra)
)
display(acra_df)

In [0]:
# Placeholder values (compared after trim + lower-casing) that stand for
# "missing" and are turned into real NULLs in every column.
# when/col/trim/lower come from the import cell at the top of the notebook.
null_placeholders = ["na", "n/a", "-"]

acra_df = acra_df.select([
    when(trim(lower(col(c))).isin(null_placeholders), None)
    .otherwise(col(c))
    .alias(c)
    for c in acra_df.columns
])

In [0]:
# Inspect the frame after the placeholder-to-NULL replacement in the previous cell.
display(acra_df)

In [0]:
def safe(col_name):
    """Return the trimmed column, or NULL when the value is blank.

    ``concat_ws`` skips NULL inputs, so mapping blank strings to NULL keeps
    empty address parts from leaving dangling ", " separators.

    NOTE(review): ``safe`` was called here but is not defined in any visible
    cell (NameError on a fresh kernel). This definition is a presumed
    equivalent — confirm against the original helper if it still exists.
    """
    trimmed = trim(col(col_name))
    # when() without otherwise() yields NULL for blank and NULL inputs alike.
    return when(trimmed != "", trimmed)


# Project the ACRA columns used downstream, build a single-line address from
# its non-empty parts, and drop rows that have no UEN.
acra_prepared = acra_df.select(
    col("uen"),
    col("entity_name").alias("company_name"),
    col("entity_type_description"),
    col("entity_status_description"),
    # Kept as the raw string; founding_year is derived from it in a later cell.
    col("registration_incorporation_date"),
    col("primary_ssic_code").alias("industry_code"),
    col("primary_ssic_description").alias("industry_description"),
    col("secondary_ssic_code"),
    col("secondary_ssic_description"),
    col("no_of_officers"),
    concat_ws(", ",
        safe("block"),
        safe("street_name"),
        safe("building_name"),
        safe("postal_code")
    ).alias("address")
).filter(col("uen").isNotNull())

In [0]:
# Date layouts tried in order when parsing registration_incorporation_date.
# The original code listed "yyyy-MM-dd" twice, which made the second coalesce
# branch dead; add further layouts here if the source uses more than one.
# NOTE(review): a second, different format was presumably intended — confirm
# against the bronze data.
registration_date_formats = ["yyyy-MM-dd"]

# founding_year is the calendar year of the first layout that parses,
# or NULL when none does.
acra_prepared = acra_prepared.withColumn(
    "founding_year",
    year(
        coalesce(*[
            to_date(col("registration_incorporation_date"), date_format)
            for date_format in registration_date_formats
        ])
    ),
)

In [0]:
# Normalise the join key and company name, then keep a single row per UEN.
acra_prepared = standardize_text(
    standardize_uen(acra_prepared, "uen"),
    "company_name",
).dropDuplicates(["uen"])

In [0]:
# Add a constant source_data column marking these rows as "acra".
acra_prepared = acra_prepared.withColumn("source_data",lit("acra"))
display(acra_prepared)

In [0]:


# Upsert the prepared ACRA rows into the silver Delta table, keyed on uen.
target_path = "abfss://silver@singaporecomadls.dfs.core.windows.net/clean/acra"

# Stamp the incoming rows; the merge clauses below set the stored timestamps
# with current_timestamp() themselves.
acra_upsert_df = (
    acra_prepared
    .withColumn("updated_at", current_timestamp())
    .withColumn("created_at", current_timestamp())
)

# Mutable columns copied straight from the source row on update and insert.
acra_payload_cols = [
    "company_name",
    "entity_type_description",
    "entity_status_description",
    "registration_incorporation_date",
    "industry_code",
    "industry_description",
    "secondary_ssic_code",
    "secondary_ssic_description",
    "no_of_officers",
    "address",
    "founding_year",
    "source_data",
]

# Matched rows: refresh the payload and bump updated_at (created_at is kept).
acra_update_set = {name: f"s.{name}" for name in acra_payload_cols}
acra_update_set["updated_at"] = "current_timestamp()"

# New rows: key + payload, with both audit timestamps set to now.
acra_insert_values = {
    "uen": "s.uen",
    **{name: f"s.{name}" for name in acra_payload_cols},
    "created_at": "current_timestamp()",
    "updated_at": "current_timestamp()",
}

# forPath requires an existing Delta table at target_path.
delta_target = DeltaTable.forPath(spark, target_path)

(
    delta_target.alias("t")
    .merge(acra_upsert_df.alias("s"), "t.uen = s.uen")
    .whenMatchedUpdate(set=acra_update_set)
    .whenNotMatchedInsert(values=acra_insert_values)
    .execute()
)


In [0]:
%sql
-- Preview the ACRA table (presumably registered over the Delta path merged above — confirm).
SELECT * from silver.clean.acra

# record_owl 

In [0]:
# Bronze input CSV and silver location for the RecordOwl source.
# NOTE(review): the bronze folder is spelled "recordowld" — confirm this matches
# the actual container layout. silver_path_recordowl is not referenced in the
# visible cells; the merge cell hard-codes the same path.
bronze_path_recordowl = "abfss://bronze@singaporecomadls.dfs.core.windows.net/recordowld/recordowl.csv"
silver_path_recordowl = "abfss://silver@singaporecomadls.dfs.core.windows.net/clean/recordowl"

In [0]:
# Every RecordOwl column is read as a nullable string; field order defines the
# column order applied to the CSV.
record_owl_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "uen",
        "company_name",
        "company_link",
        "registration_number",
        "registered_address",
        "operating_status",
        "company_age",
        "building",
        "contact_number",
        "website",
        "description",
        "primary_ssic_code",
        "primary_industry",
        "secondary_ssic_code",
        "secondary_industry",
        "company_founder",
        "facebook",
        "linkedin",
        "twitter",
        "instagram",
        "youtube",
        "tiktok",
        "pinterest",
    )
])

In [0]:
# Load the bronze RecordOwl CSV with the explicit all-string schema.
record_owl_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(record_owl_schema)
    .load(bronze_path_recordowl)
)
display(record_owl_df)

In [0]:
# UEN as it will look after standardize_uen (trim, upper-case, non-word chars
# removed). The length check runs on this form so that padding or punctuation
# in the raw value neither rejects a valid UEN nor lets a too-short one pass.
normalized_uen_expr = regexp_replace(upper(trim(col("uen_match"))), r'[^\w]', '')

record_owl_prepared = (
    record_owl_df.select(
        col("uen").alias("uen_match"),
        col("company_name").alias("owl_company_name"),
        col("company_link").alias("recordowl_website"),
        col("website").alias("company_website"),  # PRIMARY website source!
        col("linkedin").alias("linkedin_url"),
        col("facebook").alias("facebook_url"),
        col("instagram").alias("instagram_url"),
        col("contact_number").alias("phone_number"),
        col("description").alias("company_description"),
        col("primary_ssic_code").alias("owl_ssic_code"),
        col("primary_industry").alias("owl_industry"),
        col("secondary_ssic_code").alias("owl_secondary_ssic"),
        col("secondary_industry").alias("owl_secondary_industry")
    )
    # Only keep rows with valid UENs: non-null and 9–10 characters once normalised.
    .filter(
        col("uen_match").isNotNull() &
        (length(normalized_uen_expr) >= 9) &
        (length(normalized_uen_expr) <= 10)
    )
)
display(record_owl_prepared)

In [0]:
# Normalise the join key, name, all URL columns and phone, then keep one row per UEN.
record_owl_prepared = standardize_uen(record_owl_prepared, "uen_match")
record_owl_prepared = standardize_text(record_owl_prepared, "owl_company_name")
for url_col in (
    "recordowl_website",
    "company_website",
    "linkedin_url",
    "facebook_url",
    "instagram_url",
):
    record_owl_prepared = standardize_url(record_owl_prepared, url_col)
record_owl_prepared = standardize_phone(record_owl_prepared, "phone_number")
record_owl_prepared = record_owl_prepared.dropDuplicates(["uen_match"])
display(record_owl_prepared)

In [0]:
# Add a constant source_data column marking these rows as 'recordowl'.
record_owl_prepared = record_owl_prepared.withColumn("source_data",lit('recordowl'))
display(record_owl_prepared)

In [0]:
# Upsert the prepared RecordOwl rows into the silver Delta table, keyed on uen_match.
# forPath requires an existing Delta table at this location.
delta_target = DeltaTable.forPath(spark, "abfss://silver@singaporecomadls.dfs.core.windows.net/clean/recordowl")

# Columns copied straight from the source row on both update and insert.
record_owl_payload_cols = [
    "owl_company_name",
    "recordowl_website",
    "company_website",
    "linkedin_url",
    "facebook_url",
    "instagram_url",
    "phone_number",
    "company_description",
    "owl_ssic_code",
    "owl_industry",
    "owl_secondary_ssic",
    "owl_secondary_industry",
    "source_data",
]

# Matched rows: refresh the payload and bump updated_at.
record_owl_update_set = {name: f"s.{name}" for name in record_owl_payload_cols}
record_owl_update_set["updated_at"] = "current_timestamp()"

# New rows: key + payload, with both audit timestamps set to now.
record_owl_insert_values = {
    "uen_match": "s.uen_match",
    **{name: f"s.{name}" for name in record_owl_payload_cols},
    "created_at": "current_timestamp()",
    "updated_at": "current_timestamp()",
}

(
    delta_target.alias("t")
    .merge(record_owl_prepared.alias("s"), "t.uen_match = s.uen_match")
    .whenMatchedUpdate(set=record_owl_update_set)
    .whenNotMatchedInsert(values=record_owl_insert_values)
    .execute()
)


In [0]:
%sql
-- Preview the RecordOwl table (presumably registered over the Delta path merged above — confirm).
SELECT * from silver.clean.recordowl

# Companies_SG

In [0]:
# Bronze input CSV and silver Delta location for the Companies SG source.
# silver_path_companies_sg is used by the overwrite and CREATE TABLE cells below.
bronze_path_companiessg = "abfss://bronze@singaporecomadls.dfs.core.windows.net/companies_sg/companies_sg_data.csv"
silver_path_companies_sg = "abfss://silver@singaporecomadls.dfs.core.windows.net/clean/companies_sg"

In [0]:
# Every Companies SG column is read as a nullable string; names keep the
# spacing and capitalisation of the CSV header.
companiessg = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "Entity Name",
        "UEN",
        "Registration Incorporation Date",
        "Company Type Description",
        "Entity Status Description",
        "Entity Type Description",
        "companies_sg_website",
    )
])

In [0]:
# Load the bronze Companies SG CSV with the explicit all-string schema.
companies_sg_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(companiessg)
    .load(bronze_path_companiessg)
)
display(companies_sg_df)

In [0]:
# Project and rename the Companies SG columns; rows without a UEN are dropped.
# The website is exposed twice: under its source name and as sg_website.
companies_sg_prepared = companies_sg_df.select(
    col("UEN").alias("uen_match"),
    col("Entity Name").alias("sg_company_name"),
    col("companies_sg_website"),
    col("Registration Incorporation Date").alias("sg_reg_date"),
    col("Company Type Description").alias("sg_company_type"),
    col("Entity Status Description").alias("sg_entity_status"),
    col("companies_sg_website").alias("sg_website")
).filter(col("uen_match").isNotNull())

companies_sg_prepared = standardize_uen(companies_sg_prepared, "uen_match")
companies_sg_prepared = standardize_text(companies_sg_prepared, "sg_company_name")
companies_sg_prepared = standardize_url(companies_sg_prepared, "companies_sg_website")
# Previously standardize_text was applied to sg_company_name a second time
# (a no-op) while sg_website was left raw; normalise it like its twin column.
companies_sg_prepared = standardize_url(companies_sg_prepared, "sg_website")
companies_sg_prepared = companies_sg_prepared.dropDuplicates(["uen_match"])
display(companies_sg_prepared)

In [0]:
# Add a constant source_data column marking these rows as "companies_sg".
companies_sg_prepared = companies_sg_prepared.withColumn("source_data",lit("companies_sg"))

In [0]:
# Full refresh: replace both the data and the schema of the silver Delta table.
(
    companies_sg_prepared.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(silver_path_companies_sg)
)

In [0]:
# Register the Delta files at silver_path_companies_sg as the table
# silver.clean.companies_sg, unless a table of that name already exists.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS silver.clean.companies_sg
USING DELTA
LOCATION '{silver_path_companies_sg}'
""")

In [0]:
%sql
-- Preview the Companies SG table registered in the previous cell.
SELECT * from silver.clean.companies_sg

# Stocks

In [0]:
# Bronze input CSV and silver location for the SGX stocks source.
# NOTE(review): silver_path_stocks is not referenced in the visible cells; the
# stocks merge cell hard-codes the same path as target_path.
bronze_path_stocks = "abfss://bronze@singaporecomadls.dfs.core.windows.net/stocks/sgx_stocks_extracted.csv"
silver_path_stocks = "abfss://silver@singaporecomadls.dfs.core.windows.net/clean/stocks"

In [0]:
# Every stocks column is read as a nullable string; the numeric ones are
# cleaned and cast later by safe_numeric_cast.
stocks_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "symbol",
        "company_name",
        "market_cap",
        "stock_price",
        "percent_change",
        "revenue",
    )
])

In [0]:
# Load the bronze SGX stocks CSV with the explicit all-string schema.
stocks_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(stocks_schema)
    .load(bronze_path_stocks)
)
display(stocks_df)

In [0]:
# Project and rename the stock columns used downstream.
stocks_prepared = (
    stocks_df.select(
        col("symbol").alias("stock_symbol"),
        col("company_name").alias("stock_company_name"),
        col("market_cap"),
        col("revenue"),
        col("stock_price"),
        col("percent_change")
    )
    # stock_symbol is the MERGE key: a NULL key never matches an existing row,
    # so such rows would be inserted again on every run.
    .filter(col("stock_symbol").isNotNull())
)

stocks_prepared = standardize_text(stocks_prepared, "stock_company_name")
# Keep one row per key, as the other sources do: a Delta MERGE fails when
# several source rows match the same target row.
stocks_prepared = stocks_prepared.dropDuplicates(["stock_symbol"])
display(stocks_prepared)

In [0]:
# Add a constant source_data column marking these rows as 'sgx_stocks'.
stocks_prepared = stocks_prepared.withColumn("source_data",lit('sgx_stocks'))
display(stocks_prepared)

In [0]:
from pyspark.sql.functions import regexp_replace, col, when, trim
from pyspark.sql.types import DoubleType

def safe_numeric_cast(df, columns):
    """Clean numeric-like string columns and cast them to DoubleType.

    Handles values such as '32.4M', '1.2B', '10,240', '-1.57%' and '-':

    - thousands separators (``,``), percent signs and whitespace are removed;
    - currency markers are removed as whole tokens (``USD``, ``SGD``, ``INR``,
      ``$``, ``S$``, ``US$``, ``€``, ``₹``), case-insensitively;
    - a single trailing magnitude suffix is turned into an exponent
      (K → e3, M → e6, B → e9, T → e12), case-insensitively;
    - an empty string or a lone ``-`` becomes NULL.

    Args:
        df: Input DataFrame.
        columns: Iterable of column names to clean; names that are not present
            in ``df`` are skipped silently.

    Returns:
        DataFrame in which each listed, existing column has been replaced by
        its DoubleType value.
    """
    # Magnitude suffixes are matched only at the very end of the value, so a
    # stray letter inside the text (e.g. '1m2') is no longer turned into an
    # exponent.
    magnitude_suffixes = (("k", "e3"), ("m", "e6"), ("b", "e9"), ("t", "e12"))

    for col_name in columns:
        if col_name not in df.columns:
            continue

        value = trim(col(col_name))
        value = regexp_replace(value, ",", "")
        value = regexp_replace(value, "%", "")
        # Alternation of whole tokens. The earlier character class
        # [$€₹usd\s] deleted every individual u/s/d letter and missed "INR".
        value = regexp_replace(
            value, r"(?i)(usd|sgd|inr|(us|s)?\$|[€₹]|\s)", ""
        )
        for suffix, exponent in magnitude_suffixes:
            value = regexp_replace(value, rf"(?i){suffix}$", exponent)

        # '-' and '' are placeholders for "no value".
        value = when((value == "-") | (value == ""), None).otherwise(value)

        df = df.withColumn(col_name, value.cast(DoubleType()))
    return df


In [0]:

# Columns of stocks_prepared that hold numeric text and are cast to double.
numeric_cols = ["market_cap", "revenue", "stock_price", "percent_change"]
stocks_prepared = safe_numeric_cast(stocks_prepared, numeric_cols)

In [0]:
# Inspect the stocks frame after the numeric cast in the previous cell.
display(stocks_prepared)

In [0]:
# Upsert the prepared stock rows into the silver Delta table, keyed on stock_symbol.
target_path = "abfss://silver@singaporecomadls.dfs.core.windows.net/clean/stocks"
stocks_upsert_df = stocks_prepared.withColumn("updated_at", current_timestamp())

# Columns copied straight from the source row on both update and insert.
stocks_payload_cols = [
    "stock_company_name",
    "market_cap",
    "revenue",
    "stock_price",
    "percent_change",
    "source_data",
]

# Matched rows: refresh the payload and bump updated_at.
stocks_update_set = {name: f"s.{name}" for name in stocks_payload_cols}
stocks_update_set["updated_at"] = "current_timestamp()"

# New rows: key + payload, with both audit timestamps set to now.
stocks_insert_values = {
    "stock_symbol": "s.stock_symbol",
    **{name: f"s.{name}" for name in stocks_payload_cols},
    "created_at": "current_timestamp()",
    "updated_at": "current_timestamp()",
}

# forPath requires an existing Delta table at target_path.
delta_target = DeltaTable.forPath(spark, target_path)

(
    delta_target.alias("t")
    .merge(stocks_upsert_df.alias("s"), "t.stock_symbol = s.stock_symbol")
    .whenMatchedUpdate(set=stocks_update_set)
    .whenNotMatchedInsert(values=stocks_insert_values)
    .execute()
)


In [0]:
%sql
-- Preview the stocks table (presumably registered over the Delta path merged above — confirm).
SELECT * from silver.clean.stocks

In [0]:
# NOTE(review): final_unified is not created in any visible cell — confirm it
# is defined (or loaded) before this cell when running on a fresh kernel.

# Fields whose presence counts towards the completeness score.
completeness_fields = [
    "website",
    "linkedin",
    "facebook",
    "instagram",
    "contact_email",
    "contact_phone",
    "revenue",
    "keywords",
]

# One 0/1 flag per field: 1 when the value is present, 0 when it is NULL.
presence_flags = [
    when(col(field).isNotNull(), 1).otherwise(0)
    for field in completeness_fields
]

# Score = share of the eight fields that are populated, as a percentage.
final_unified = final_unified.withColumn(
    "data_completeness_score",
    reduce(lambda total, flag: total + flag, presence_flags) / 8.0 * 100,
)

print(f"✓ Final unified records: {final_unified.count():,}")

In [0]:
# Coverage statistics: for each field, the percentage of rows with a non-NULL
# value (count(column) ignores NULLs, count("*") counts every row).
coverage_fields = [
    ("website", "website_%"),
    ("linkedin", "linkedin_%"),
    ("facebook", "facebook_%"),
    ("instagram", "instagram_%"),
    ("contact_email", "email_%"),
    ("contact_phone", "phone_%"),
    ("revenue", "revenue_%"),
    ("company_type", "company_type_%"),
]

coverage = final_unified.select(
    count("*").alias("total_companies"),
    countDistinct("uen").alias("unique_uens"),
    *[
        (count(field) / count("*") * 100).alias(label)
        for field, label in coverage_fields
    ],
    avg("data_completeness_score").alias("avg_completeness"),
)

In [0]:
# Show the single-row coverage summary computed in the previous cell.
display(coverage)

In [0]:
import os
import sys

# Make modules in the notebook's working directory importable.
project_path = os.getcwd()
sys.path.append(project_path)

In [0]:
# NOTE(review): the "# MAGIC %run" and "# COMMAND ----------" lines below look
# like Databricks source-export markers pasted into a single cell. As written
# the %run line is only a comment, so calculate_data_completeness_score,
# run_all_quality_checks, export_matching_report and export_statistics must
# already be defined some other way — confirm, or move the %run into its own cell.
# The %run target is also a user-specific Repos path.
# MAGIC %run "/Repos/24huda.zaidi@fostiima.org/DE-Singapore-Companies-DB/Pyspark_Notebooks/Cleaning/data_quality_utils"

# COMMAND ----------

# Load the unified dataset from its silver Delta location.
df = spark.read.format("delta").load(
    "abfss://silver@singaporecomadls.dfs.core.windows.net/unified_data/"
)

# ✅ Step 1: Add completeness score
df = calculate_data_completeness_score(df)

# ✅ Step 2: Run data quality checks
df = run_all_quality_checks(df)

# ✅ Step 3: Export reports
export_matching_report(
    df, "abfss://silver@singaporecomadls.dfs.core.windows.net/export_matching_report/"
)
export_statistics(
    df, "abfss://silver@singaporecomadls.dfs.core.windows.net/export_statistics/"
)
