In [1]:
# Import required modules
from pyspark.sql import SparkSession

# Create a new SparkSession, or reuse the one already active in this process
spark = SparkSession.builder.getOrCreate()

# Get the SparkContext behind the session; it is used below to create RDDs
sc = spark.sparkContext

In [2]:
# Read the domains file into an RDD of raw text lines
# (despite the .csv extension, the fields are tab-separated, as the sample below shows)
common_crawl_domain_counts = sc.textFile('./crawl/cc-main-limited-domains.csv')

# Peek at the first five raw lines
common_crawl_domain_counts.take(5)

['367855\t172-in-addr\tarpa\t1',
 '367856\taddr\tarpa\t1',
 '367857\tamphic\tarpa\t1',
 '367858\tbeta\tarpa\t1',
 '367859\tcallic\tarpa\t1']

In [3]:
# Parse one raw tab-separated line into a typed tuple
def fmt_domain_graph_entry(entry):
    """Convert a raw domain-graph line into ``(site_id, domain, tld, num_subdomains)``.

    ``entry`` must hold exactly four tab-separated fields. The first and last
    fields are converted to ``int``; the two middle fields stay as strings.
    A line with a different number of fields, or with non-integer text in a
    numeric column, raises ``ValueError``.
    """
    fields = entry.split('\t')
    # Strict unpacking: anything other than four columns fails loudly
    raw_id, name, suffix, raw_count = fields
    return int(raw_id), name, suffix, int(raw_count)

In [4]:
# Parse every raw line into a typed tuple; the parser is handed to `map` directly
formatted_host_counts = common_crawl_domain_counts.map(fmt_domain_graph_entry)

# Show the first five parsed records
formatted_host_counts.take(5)

[(367855, '172-in-addr', 'arpa', 1),
 (367856, 'addr', 'arpa', 1),
 (367857, 'amphic', 'arpa', 1),
 (367858, 'beta', 'arpa', 1),
 (367859, 'callic', 'arpa', 1)]

In [5]:
# Pull just the subdomain count out of one raw line
def extract_subdomain_counts(entry):
    """Return the fourth tab-separated field of ``entry`` as an ``int``.

    The line must contain exactly four tab-separated fields, otherwise the
    unpacking raises ``ValueError``. The first three fields are discarded.
    """
    # Unpack all four columns so malformed lines still fail loudly
    _site_id, _domain, _tld, raw_count = entry.split('\t')
    return int(raw_count)

In [6]:
# Map each raw line to its subdomain count; the extractor is handed to `map` directly
host_counts = common_crawl_domain_counts.map(extract_subdomain_counts)

In [7]:
# Display the first ten subdomain counts
host_counts.take(10)

[1, 1, 1, 1, 1, 1, 1, 7, 1, 1]

In [9]:
# Collapse the RDD of counts into one total by pairwise addition with `reduce`
total_host_counts = host_counts.reduce(lambda left, right: left + right)

# Show the overall number of subdomains
total_host_counts

595466

In [10]:
# Stop the SparkSession (this also stops its underlying SparkContext)
spark.stop()