In [1]:
// 1. Installation

// import the ArchiveSpark libraries
import org.archive.archivespark._
import org.archive.archivespark.functions._
import org.archive.archivespark.specific.warc._

// Input locations: generic container paths, as mounted with `docker run -v`.
val cdxPath: String = "/data/arc_cdx/*.cdx" // glob matching the CDX files
val warcPath: String = "/data/warc"         // WARC path handed to the loader

import org.apache.spark.sql.{Row, SparkSession}

// Work in a new session derived from the pre-existing `spark` session.
val session = spark.newSession()

// Describe the collection (CDX files + WARC path) and load its records.
val r = {
  val spec = WarcSpec.fromFiles(cdxPath, warcPath)
  ArchiveSpark.load(spec)
}

In [2]:
// 3. Build the basic DataFrame: one row per record holding its URL, digest,
//    status, MIME type and timestamp.
val m = r.map { rec =>
  (rec.originalUrl, rec.digest, rec.status, rec.mime, rec.timestamp)
}
val df = {
  val columns = Seq("originalUrl", "digest", "status", "mime", "timestamp")
  // Cached because every later aggregation starts from this frame.
  session.createDataFrame(m).toDF(columns: _*).cache()
}

In [3]:
// Preview the first 20 rows, with long cell values truncated.
df.show(20, truncate = true)

+--------------------+--------------------+------+----+--------------+
|         originalUrl|              digest|status|mime|     timestamp|
+--------------------+--------------------+------+----+--------------+
|http://www.stopkl...|PMUYSQN6O7BISA2A4...|   200|  im|19991008021609|
|http://www.stopkl...|A7INTAJ2IYZI2YCV7...|   200|  im|19991008030711|
|http://www.stopkl...|A7INTAJ2IYZI2YCV7...|   200|  im|19991116132425|
|http://www.stopkl...|A7INTAJ2IYZI2YCV7...|   200|  im|20000309171347|
|http://www.stopkl...|A7INTAJ2IYZI2YCV7...|   200|  im|20000611132140|
|http://www.stopkl...|7H3SNEXZ5GTSJXCBL...|   200|  im|19991008040548|
|http://www.stopkl...|VHX3XTMT47JUSZA3I...|   200|  im|19991008055238|
|http://www.stopkl...|VHX3XTMT47JUSZA3I...|   200|  im|19991116152608|
|http://www.stopkl...|73XLK7LCKIRK24HK7...|   200|  im|19991008064836|
|http://www.stopkl...|73XLK7LCKIRK24HK7...|   200|  im|19991116165401|
|http://www.stopkl...|3R63VZAJO22ILAFRV...|   200|  im|19991008092215|
|http:

In [4]:
// 5. mean and median of the number of versions per single URL
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._

// Per-URL count of rows with a non-null digest, most frequently captured URLs first.
val mDu = {
  val perUrl = df.groupBy("originalUrl").agg(count("digest").as("digests"))
  perUrl.orderBy(col("digests").desc).cache()
}

In [5]:
// Show the top 20 URLs without truncating the URL column.
mDu.show(20, truncate = false)

+-------------------------------------------------------------------+-------+
|originalUrl                                                        |digests|
+-------------------------------------------------------------------+-------+
|http://www.rzeczpospolita.pl:80/gifs/rek1.gif                      |164076 |
|http://www.rzeczpospolita.pl:80/gifs/subheadm.gif                  |109050 |
|http://www.rzeczpospolita.pl:80/gifs/tlo.gif                       |69606  |
|http://www.rzeczpospolita.pl:80/gifs/tlopastyll.gif                |63126  |
|http://www.rzeczpospolita.pl:80/gifs/head.gif                      |61952  |
|http://img.wp.pl:80/pixel.gif                                      |49326  |
|http://www.wp.pl:80/robots.txt                                     |35122  |
|http://www.rzeczpospolita.pl:80/gifs/rek2.gif                      |30536  |
|http://of.pl:80/robots.txt                                         |27586  |
|http://www.hg.pl:80/robots.txt                                 

In [13]:
// Same per-URL digest counts as `mDu`, ordered ascending instead of descending.
// Re-sorting the already aggregated (and cached) `mDu` avoids repeating the
// identical groupBy/agg over `df`; only the sort direction differs.
val mDu2 = mDu.orderBy(asc("digests")).cache()

In [15]:
// Show the 20 least-captured URLs without truncating the URL column.
mDu2.show(20, truncate = false)

+----------------------------------------------------------------------------------+-------+
|originalUrl                                                                       |digests|
+----------------------------------------------------------------------------------+-------+
|http://mail.ids.pl:80/szkoly/docs/mias_osiek_�u�yck_(woj._jeleniogorsie)i.html    |1      |
|http://www.ped.uni.torun.pl:80/Studenci/Strony!!!!!!!/Anna%20lawska/anka4.2.jpg  |1      |
|http://www.poznan.tpnet.pl:80/ela/News/Bitmap/pożar.jpg                           |1      |
|http://www.pztkd.lublin.pl:80/EMocisz.html                                       |1      |
|http://main.wsm.szczecin.pl:80/pl/uczelnia/struktura/nawigacyjny/inm/zach�d.jpg   |1      |
|http://www.meil.pw.edu.pl:80/~st1016/sp5keh/Grafika/Mr�wka98.gif                  |1      |
|http://mac_bbs.oeiizk.waw.pl:80/kadra/TB/wycieczki/WiedeÄ1.JPG                    |1      |
|http://jacek.fuw.edu.pl:80/serwis/scisle_lot.html?ś                  

In [13]:
// Number of captures (rows) per URL, least-captured URLs first.
val mDu3 = {
  val capturesPerUrl = df.groupBy("originalUrl").count()
  capturesPerUrl.orderBy(col("count").asc).cache()
}

In [16]:
// Print the first 1000 rows, with long cell values truncated.
mDu3.show(1000, truncate = true)

+--------------------+-----+
|         originalUrl|count|
+--------------------+-----+
|http://mail.ids.p...|    1|
|http://www.ped.un...|    1|
|http://www.poznan...|    1|
|http://www.pztkd....|    1|
|http://main.wsm.s...|    1|
|http://www.meil.p...|    1|
|http://mac_bbs.oe...|    1|
|http://jacek.fuw....|    1|
|http://thfs1.fuw....|    1|
|http://www.bielsk...|    1|
|http://www.bielsk...|    1|
|http://baza.dialc...|    1|
|http://lech.wokis...|    1|
|http://www.ids.pl...|    1|
|http://www.ids.ed...|    1|
|http://www.ids.ed...|    1|
|http://www.waw.id...|    1|
|http://idsserv.wa...|    1|
|http://www.ipe.pw...|    1|
|http://www.ids.pl...|    1|
|http://dns1.inwar...|    1|
|http://www.icm.ed...|    1|
|http://www.ids.ed...|    1|
|http://www.ids.ed...|    1|
|http://www.webmed...|    1|
|http://www.bielsk...|    1|
|http://www.bielsk...|    1|
|http://www.bielsk...|    1|
|http://www.pztkd....|    1|
|http://www.pztkd....|    1|
|http://plwww.fuw....|    1|
|http://www.pz

In [20]:
// Mean number of captures per URL.
mDu3.agg(avg(col("count"))).show()

                                                                                +-----------------+
|       avg(count)|
+-----------------+
|3.787504512485132|
+-----------------+



In [21]:
// 5.2 Median number of captures per URL (exact 0.5 percentile of `count`).
locally {
  val medianOfCaptures = expr("percentile(count, 0.5)").as("median")
  mDu3.agg(medianOfCaptures).cache().show()
}

                                                                                +------+
|median|
+------+
|   2.0|
+------+



In [12]:

// Number of distinct digests seen for each URL.
val vBu = {
  val byUrl = df.groupBy("originalUrl")
  byUrl.agg(countDistinct(col("digest")).alias("distDigests")).cache()
}


In [14]:
// Mean number of distinct digests per URL.
locally {
  val meanDistinct = avg(col("distDigests")).alias("avgDigests")
  vBu.agg(meanDistinct).show()
}


+------------------+
|        avgDigests|
+------------------+
|1.2547012033555849|
+------------------+



In [15]:
// Median number of distinct digests per URL (exact 0.5 percentile of `distDigests`).
locally {
  val medianDistinct = expr("percentile(distDigests, 0.5)").as("medDigests")
  vBu.agg(medianDistinct).cache().show()
}

+----------+
|medDigests|
+----------+
|       1.0|
+----------+

