In [6]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    collect_set,
    desc,
    explode,
    explode_outer,
    expr,
    regexp_replace,
    size,
    udf,
    when,
)

# Start (or reuse) the Spark session backing this notebook
spark = (
    SparkSession.builder
    .appName("JSON Analysis")
    .getOrCreate()
)

# Input files are looked up under ./data relative to the current working directory
workspace_dir = os.getcwd()
json_path = os.path.join(workspace_dir, "data", "*.json")

# Load every matching JSON file with the "multiline" reader option enabled and
# mark the result for caching, since several later cells derive from it
df = (
    spark.read
    .option("multiline", "true")
    .json(json_path)
    .cache()
)

# Run one action over the DataFrame and report how many records were loaded
df_count = df.count()
print(f"Total Records: {df_count}")

# Extract certificate details: one output row per (certificate, SAN entry).
#
# Columns produced: domain (the certificate's commonName), alternate_domain
# (one SAN entry), address, organization.
#
# explode_outer is used for the SAN list so that a certificate whose `san` array
# is null or empty still yields one row (with alternate_domain = NULL). A plain
# explode would silently drop such certificates, and with them their domain,
# organization and address, from every downstream aggregation.
out = (
    df.select(explode(col("certificates")).alias("certificate"))
    .select(
        col("certificate.commonName").alias("domain"),
        explode_outer(col("certificate.san")).alias("alternate_domain"),
        col("certificate.address").alias("address"),
        col("certificate.organization").alias("organization"),
    )
)

25/02/07 08:51:21 WARN CacheManager: Asked to cache already cached data.


Total Records: 2


In [2]:
# Aggregate: classify every SAN entry relative to its certificate's common name,
# then roll the results up per domain.
#
# Result columns: domain, subdomains, alternate_domains, organizations,
# addresses, organizations_count, alternate_domains_count, sorted by
# alternate_domains_count descending.
classified = (
    # Drop a leading "*." on BOTH sides. Stripping it only from the common name
    # would leave a SAN such as "*.example.com" unequal to the stripped domain
    # "example.com", so the certificate's own wildcard name would be reported as
    # one of its subdomains.
    out.withColumn("domain", regexp_replace(col("domain"), r"^\*\.", ""))
    .withColumn("alternate_domain", regexp_replace(col("alternate_domain"), r"^\*\.", ""))
    # SAN entry that simply repeats the common name
    .withColumn("is_self", col("domain") == col("alternate_domain"))
    # Literal suffix test ("<anything>.<domain>"). Unlike SQL LIKE, endswith does
    # not interpret "_" or "%" inside the domain as wildcard characters.
    .withColumn(
        "is_subdomain",
        col("alternate_domain").endswith(expr("concat('.', domain)")) & ~col("is_self"),
    )
    .groupBy("domain")
    .agg(
        # when(...) without otherwise() yields NULL for non-matching rows, and
        # collect_set skips NULLs, so each set holds only the matching entries.
        collect_set(when(col("is_subdomain"), col("alternate_domain"))).alias("subdomains"),
        collect_set(
            when(~col("is_subdomain") & ~col("is_self"), col("alternate_domain"))
        ).alias("alternate_domains"),
        collect_set("organization").alias("organizations"),
        collect_set("address").alias("addresses"),
    )
    .withColumn("organizations_count", size(col("organizations")))
    .withColumn("alternate_domains_count", size(col("alternate_domains")))
    .orderBy(desc("alternate_domains_count"))
)

classified.printSchema()

root
 |-- domain: string (nullable = true)
 |-- subdomains: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- alternate_domains: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- organizations: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- addresses: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- organizations_count: integer (nullable = false)
 |-- alternate_domains_count: integer (nullable = false)



25/02/07 08:47:35 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [19]:
# Print the per-domain aggregation with full (untruncated) cell contents
classified.show(truncate=False)

+---------------------------------+-------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+-------------------+-----------------------+
|domain                           |subdomains 

In [16]:
from pyspark.sql.types import StringType
import re
# UDF to normalize organization names
def normalize_org_name(org):
    """Return a canonical spelling of an organization name.

    Steps, in order:
      1. remove commas and collapse/trim whitespace;
      2. title-case every whitespace-separated word (first character upper,
         the rest lower);
      3. rewrite the legal-form tokens LLC / INC / LTD (any case, with or
         without a trailing period) to "LLC." / "Inc." / "Ltd.", and restore
         the acronym "SKG" to upper case.

    Title-casing runs BEFORE the token rewrite on purpose: doing it afterwards
    would turn "LLC." back into "Llc." and "SKG" into "Skg".

    Args:
        org: Raw organization string, or None.

    Returns:
        The normalized string, or None when ``org`` is None. An empty or
        whitespace-only input yields an empty string.
    """
    if org is None:
        return None

    # Commas out, then split()/join to trim and collapse whitespace while
    # title-casing each word.
    org = " ".join(word.capitalize() for word in org.replace(",", "").split())

    # The optional "\.?" consumes an existing trailing period so that input such
    # as "Inc." does not end up as "Inc..".
    org = re.sub(r"\bLLC\b\.?", "LLC.", org, flags=re.IGNORECASE)  # Standardize LLC
    org = re.sub(r"\bINC\b\.?", "Inc.", org, flags=re.IGNORECASE)  # Standardize Inc
    org = re.sub(r"\bLTD\b\.?", "Ltd.", org, flags=re.IGNORECASE)  # Standardize Ltd
    org = re.sub(r"\bSKG\b", "SKG", org, flags=re.IGNORECASE)      # Keep SKG upper-case
    return org
# Wrap the Python normalizer so it can be applied to a string column
normalize_org_udf = udf(normalize_org_name, returnType=StringType())

# Per-organization roll-up: rows without an organization are discarded, the
# organization name is normalized, and a leading "*." is removed from both the
# domain and the alternate domain before the distinct values are collected.
organizations = (
    out.where(col("organization").isNotNull())
    .select(
        normalize_org_udf(col("organization")).alias("organization"),
        regexp_replace(col("domain"), r"^\*\.", "").alias("domain"),
        regexp_replace(col("alternate_domain"), r"^\*\.", "").alias("alternate_domain"),
        col("address"),
    )
    .groupBy("organization")
    .agg(
        collect_set(col("domain")).alias("domains"),
        collect_set(col("alternate_domain")).alias("alternate_domains"),
        collect_set(col("address")).alias("addresses"),
    )
)

In [17]:

# Print the per-organization aggregation with full (untruncated) cell contents
organizations.show(truncate=False)

+--------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------