In [1]:
from operator import add
import re
from collections import OrderedDict
from operator import itemgetter 
import itertools
from pyspark.sql import SparkSession

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.251:7077") \
        .appName("Lecture1_Example5_common_crawl")\
        .config("spark.executor.cores",4)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9999)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()


# RDD API 
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/09 16:53:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:

# (*/*) - out of memory›
# ~6.4mins for 39496 files. (...00000/)  (takes 1 minute with 40 partitions)
# ~5 secs for 10 files (...00000/0*) 
# ~20 secs for 11110 files (...00000/1*) 


rdd = spark_context.newAPIHadoopFile(
    "hdfs://192.168.2.251:9000/data/crawl/CC-MAIN-20230921073711-20230921103711-00010.warc.wet",
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf={"textinputformat.record.delimiter": "WARC/1.0"}
)\
.cache() # Keep this RDD in memory!

rdd.count()
# Only one job (previous .cache() did not trigger a job)

                                                                                

37759

In [3]:
rdd.take(3)
# [(line_number, partition)]

[(0, ''),
 (8,
  '\r\nWARC-Type: warcinfo\r\nWARC-Date: 2023-10-05T06:13:02Z\r\nWARC-Filename: CC-MAIN-20230921073711-20230921103711-00010.warc.wet.gz\r\nWARC-Record-ID: <urn:uuid:413ca23a-a8f2-4996-a251-2450d1aa2d50>\r\nContent-Type: application/warc-fields\r\nContent-Length: 382\r\n\r\nSoftware-Info: ia-web-commons.1.1.10-SNAPSHOT-20230912101454\r\nExtracted-Date: Thu, 05 Oct 2023 06:13:02 GMT\r\nrobots: checked via crawler-commons 1.5-SNAPSHOT (https://github.com/crawler-commons/crawler-commons)\r\nisPartOf: CC-MAIN-2023-40\r\noperator: Common Crawl Admin (info@commoncrawl.org)\r\ndescription: Wide crawl of the web for September/October 2023\r\npublisher: Common Crawl\r\n\r\n\r\n\r\n'),
 (657,
  '\r\nWARC-Type: conversion\r\nWARC-Target-URI: http://0-50.ru/news/tag/%EC%E0%F1%F1%EE%E2%E0%FF+%E4%F0%E0%EA%E0\r\nWARC-Date: 2023-09-21T09:25:16Z\r\nWARC-Record-ID: <urn:uuid:110df249-8518-4c26-88b7-69b82f2a425b>\r\nWARC-Refers-To: <urn:uuid:e68e19b8-b09c-46c7-8858-3fb3041b4ded>\r\nWARC-Blo

In [4]:
rdd.getNumPartitions()

3

In [5]:
print(spark_context.uiWebUrl)

http://de1-spark-host-193:4040


In [6]:
## Example #1 - Filter by Top_level Domain and compute most common words ##

# Try .ac.uk, .ru, .se, .com
p = re.compile("WARC-Target-URI: \S+\.ac.uk", re.IGNORECASE)


# Note: .partition(..) returns a 3-tuple: the string before the separator (index 0), 
# the separotor (index 1), and the part of the string afterwards (index 2) -- which is the part we want.
all_words = rdd\
    .filter(lambda doc: bool(p.search(doc[1])))\
    .map(lambda web_text: web_text[1].partition("\r\n\r\n")[2])\
    .flatMap(lambda t: t.split(" "))\
    .flatMap(lambda w: w.split("\n"))\



all_words_and_count = all_words.map(lambda w: w.strip())\
    .map(lambda w: (w,1))


word_counts = all_words_and_count.reduceByKey(add)

print(word_counts.takeOrdered(60, key=lambda x: -x[1]))





[('the', 2711), ('of', 1911), ('to', 1807), ('and', 1803), ('a', 1352), ('in', 1259), ('', 766), ('is', 750), ('for', 732), ('-', 660), ('on', 468), ('that', 433), ('The', 418), ('topic', 416), ('with', 409), ('be', 401), ('are', 397), ('you', 381), ('can', 370), ('TWiki', 369), ('by', 367), ('as', 323), ('this', 303), ('or', 299), ('I', 283), ('not', 257), ('an', 244), ('from', 243), ('|', 233), ('your', 217), ('TWiki:Main.PeterThoeny', 212), ('at', 212), ('use', 208), ('new', 202), ('Research', 198), ('if', 195), ('will', 188), ('all', 185), ('web', 185), ('it', 178), ('have', 177), ('--', 172), ('used', 171), ('Details', 171), ('You', 160), ('This', 160), ('name', 154), ('text', 146), ('University', 145), ('search', 140), ('user', 139), ('&', 137), ('was', 134), ('when', 133), ('topics', 129), ('form', 127), ('has', 127), ('access', 127), ('+4411732', 124), ('cookies', 119)]


                                                                                

In [7]:
## Example #2 - Group by TLD and compute most common words for each ##

ex = "WARC-Type: conversion\
WARC-Target-URI: http://news.bbc.co.uk/2/hi/africa/3414345.stm\
WARC-Date: 2014-08-02T09:52:13Z"

p = re.compile("WARC-Target-URI: \S+\.([a-zA-Z]{2,3})/", re.IGNORECASE)
# print(p.search(ex).group(1))
# uk

def get_tld(content):
    match = p.search(content)
    if match is not None:
        return match.group(1)
    else:
        return None

# discard the line number
# partition() -- python function -- split on the first occurance, returns (before,split,after)
# filter out those with no TLD

    
words_by_tld_rdd = rdd\
    .map(lambda filename_content: filename_content[1])\
    .map(lambda content: (get_tld(content), content.partition("\r\n\r\n")[2]))\
    .filter(lambda tld_content: tld_content[0] is not None)\
    .flatMapValues(lambda words: words.split(" "))\
    .flatMapValues(lambda words: words.split("\n"))\
    .mapValues(lambda word: word.strip())
    #.take(10)

# print(words_by_tld_rdd.take(10))

tlds = words_by_tld_rdd.countByKey()
#print(tlds)

tlds = OrderedDict(sorted(tlds.items(), key = itemgetter(1), reverse = True))
# print(tlds)  

top_tlds = dict(itertools.islice(tlds.items(), 10))

# print(top_tlds)

print("Results:")

for tld in top_tlds:
    print(tld)
    top_words_for_tld = words_by_tld_rdd\
        .filter(lambda tld_word: tld_word[0] == tld)\
        .values()\
        .map(lambda w: (w,1))\
        .reduceByKey(add)\
        .takeOrdered(20, key=lambda x: -x[1])
    print(top_words_for_tld)

                                                                                

Results:
com


                                                                                

[('the', 243397), ('to', 197694), ('and', 189410), ('of', 150865), ('a', 147587), ('in', 116589), ('de', 113455), ('-', 94412), ('for', 87235), ('&', 73689), ('is', 72456), ('', 69987), ('on', 54889), ('with', 54683), ('you', 54452), ('|', 52258), ('your', 50644), ('►', 49979), ('The', 49594), ('that', 43193)]
org


                                                                                

[('the', 42541), ('', 38009), ('of', 30338), ('and', 28112), ('to', 26676), ('a', 18925), ('in', 17242), ('for', 12331), ('-', 10524), ('is', 10450), ('de', 10083), ('The', 8016), ('on', 7445), ('by', 7313), ('that', 7157), ('with', 6360), ('&', 6115), ('are', 5100), ('as', 4890), ('you', 4645)]
ru


                                                                                

[('и', 37068), ('в', 31114), ('для', 18509), ('на', 17930), ('с', 12828), ('-', 10394), ('не', 8474), ('по', 7973), ('В', 6459), ('', 5918), ('от', 5081), ('1', 4787), ('из', 4769), ('что', 4648), ('к', 4514), ('—', 4483), ('или', 3719), ('0', 3618), ('–', 3471), ('о', 3412)]
de


                                                                                

[('und', 24385), ('der', 16504), ('die', 14528), ('/', 13808), ('in', 12383), ('für', 10697), ('-', 10661), ('von', 9587), ('&', 8679), ('zu', 7883), ('Sie', 7051), ('Airport', 6836), ('mit', 6765), ('den', 6290), ('', 6209), ('€', 5420), ('auf', 5342), ('a', 4922), ('im', 4886), ('ist', 4465)]
net


                                                                                

[('the', 12256), ('to', 8951), ('and', 8544), ('de', 7765), ('of', 7642), ('a', 7599), ('-', 6983), ('in', 6048), ('', 5238), ('(1)', 4133), ('►', 4120), ('for', 3712), ('is', 3636), ('on', 2819), ('The', 2752), ('|', 2699), ('Tax', 2695), ('&', 2605), ('la', 2523), ('you', 2475)]
it


                                                                                

[('di', 22102), ('e', 13472), ('per', 6739), ('in', 6625), ('a', 5709), ('il', 5698), ('la', 5518), ('-', 5296), ('del', 4828), ('che', 4262), ('duplicato', 3995), ('della', 3387), ('un', 3355), ('con', 3053), ('da', 3040), ('i', 2990), ('al', 2922), ('è', 2901), ('le', 2617), ('', 2410)]
pl


                                                                                

[('------', 10407), ('i', 9897), ('w', 9006), ('do', 7490), ('na', 6248), ('z', 5830), ('-', 4119), ('--------', 3692), ('się', 3179), ('to', 2840), ('/', 2738), ('0', 2416), ('–', 2103), ('zł', 2079), ('', 2000), ('o', 1949), ('dla', 1915), ('nie', 1865), ('�', 1769), ('----', 1739)]
fr


                                                                                

[('de', 25034), ('et', 10318), ('la', 9160), ('à', 8632), ('des', 7091), ('les', 6426), ('-', 6333), (':', 5891), ('le', 5839), ('en', 5564), ('du', 5007), ('pour', 4405), ('un', 3512), ('sur', 3185), ('une', 2459), ('dans', 2450), ('au', 2394), ('par', 2392), ('vous', 2391), ('/', 2283)]
nl


                                                                                

[('de', 9868), ('en', 8741), ('van', 6361), ('een', 5546), ('in', 5360), ('het', 4508), ('je', 4221), ('met', 3970), ('voor', 3945), ('op', 3665), ('-', 3629), ('te', 3380), ('&', 3272), ('', 3257), ('is', 3186), ('of', 2838), ('De', 1848), ('om', 1734), ('aan', 1595), ('2021', 1556)]
edu




[('and', 9372), ('of', 9045), ('the', 8818), ('to', 5854), ('in', 4636), ('for', 3604), ('a', 3455), ('-', 2413), ('&', 2214), ('by', 1587), ('The', 1537), ('on', 1488), ('', 1480), ('is', 1456), ('with', 1333), ('Search', 1180), ('Research', 1168), ('at', 1140), ('University', 1090), ('that', 1072)]


                                                                                

In [8]:
spark_session.stop()