In [1]:
// 1. Installation

// import libraries
import org.archive.archivespark._
import org.archive.archivespark.functions._
import org.archive.archivespark.specific.warc._

// Input locations - generic paths as mounted via `docker run -v`
val cdxPath: String = "/data/arc_cdx/*.cdx" // glob pattern for the CDX files
val warcPath: String = "/data/warc"         // location passed as the WARC path

import org.apache.spark.sql.{Row, SparkSession}

// Separate Spark session, used further down to build DataFrames
val session = spark.newSession()

// Load all records described by the CDX/WARC paths above
// (this only sets up the dataset; counting and filtering happen in later cells)
val r = {
  val warcSpec = WarcSpec.fromFiles(cdxPath, warcPath)
  ArchiveSpark.load(warcSpec)
}

In [2]:
// Year under study; matched below as a prefix of each capture timestamp
val researchYear: String = "1998"

In [3]:
// 2. Count ALL captures from the research year
// 2.1 Keep every capture whose timestamp starts with researchYear.
//     No MIME-type or HTTP-status filtering is applied at this stage.
val t1 = r.filter(_.timestamp.startsWith(researchYear))


In [4]:
// 2.2 count all captures selected for researchYear (t1 is filtered by timestamp only)
t1.count()

2250522

In [4]:
// 3. Build the base DataFrame: one row per capture in t1

// Project each capture onto the four fields analysed below
val m = t1.map { capture =>
  (capture.originalUrl, capture.digest, capture.status, capture.mime)
}
// Name the columns and cache the frame, since every later cell queries it
val df = {
  val unnamed = session.createDataFrame(m)
  unnamed.toDF("originalUrl", "digest", "status", "mime").cache()
}

In [6]:
// Preview the first rows of the base DataFrame
df.show()

+--------------------+--------------------+------+---------+
|         originalUrl|              digest|status|     mime|
+--------------------+--------------------+------+---------+
|http://www.stopkl...|G7SQ6F3IMHLTBAZ5Z...|   302|text/html|
|http://www.stopkl...|5MX53IFSYWBYOZVBN...|   200|text/html|
|http://www.stopkl...|Y4UAHXJNEK7NPP2T6...|   200|text/html|
|http://www.stopkl...|BHPSWSK5XFB4VP2S6...|   200|       im|
|http://www.stopkl...|R2TANLIXFELSNSEMX...|   200|       im|
|http://www.stopkl...|Q4A5MNMPVNGTPVZSD...|   200|       im|
|http://www.stopkl...|KGNNCNPZCNB3CAI62...|   200|       im|
|http://www.stopkl...|HMYVBW5MJ2B4J2JM3...|   200|       im|
|http://www.stopkl...|H52KVNASDQSWKJCSA...|   200|       im|
|http://www.stopkl...|5WDIUDCJ65L5ZDWLT...|   200|       im|
|http://www.stopkl...|7L34GZ3RAUKPWKOUD...|   200|text/html|
|http://www.stopkl...|3QM4MVL6QMDQYJG5T...|   200|text/html|
|http://www.stopkl...|EML734Z5AU2RTK3CQ...|   200|text/html|
|http://www.stopkl...|PJ

In [7]:
// 4. Number of distinct originalUrl values among the captures in df
val dOu = df.select("originalUrl").distinct().count()

In [8]:
// Display the distinct-URL count held in dOu
dOu

875773

In [9]:
// 5. mean and median number of versions per single URL
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._

// Per-URL count of digest values (repeated digests are counted each time),
// sorted so the most frequently captured URLs come first; cached for the cells below.
val mDu = {
  val digestsPerUrl = df.groupBy("originalUrl").agg(count("digest").as("digests"))
  digestsPerUrl.orderBy(desc("digests")).cache()
}

In [10]:
// Show the top rows without truncating the URLs
mDu.show(truncate = false)

                                                                                +---------------------------------------------------------------+-------+
|originalUrl                                                    |digests|
+---------------------------------------------------------------+-------+
|http://waclaw.fema.krakow.pl:80/ifem98/ifem98.html             |28     |
|http://kos.man.koszalin.pl:80/doc/UNIXhelp1.3/index.html       |26     |
|http://waclaw.fema.krakow.pl:80/fema98/menu.html               |26     |
|http://waclaw.fema.krakow.pl:80/biblio.htm                     |26     |
|http://omega.uni.man.szczecin.pl:80/firma/struktur.htm         |24     |
|http://waclaw.fema.krakow.pl:80/fema/nieruch.htm               |22     |
|http://omega.uni.man.szczecin.pl:80/firma/chrakter.htm         |22     |
|http://www.usaemb.pl:80/default.htm                            |22     |
|http://kos.man.koszalin.pl:80/doc/UNIXhelp1.3/index/index.html |22     |
|http://www.ichem.com.pl:80/Ob

In [11]:
// 5.1 mean of the per-URL "digests" counts
mDu.agg(avg(col("digests"))).show()

                                                                                +-----------------+
|     avg(digests)|
+-----------------+
|2.569754947914585|
+-----------------+



In [12]:
// 5.2 median of the per-URL "digests" counts (Spark SQL `percentile` at 0.5).
// The one-row result is displayed once and never reused, so it is not cached:
// caching it would pin storage with no reference left to unpersist it.
mDu.agg(expr("percentile(digests, 0.5)").as("median")).show()

+------+
|median|
+------+
|   2.0|
+------+



In [13]:
// tab 2. A - captures with a status other than 200
df.filter("status != '200'").count()

33776

In [14]:
// tab 2. A - text/html captures with a status other than 200
df.filter("mime == 'text/html'").filter("status != '200'").count()

33740

In [15]:
// Sanity check: list up to 150 captures whose status is not 200
df.filter("status != '200'").show(numRows = 150)

+--------------------+--------------------+------+----------+
|         originalUrl|              digest|status|      mime|
+--------------------+--------------------+------+----------+
|http://www.stopkl...|G7SQ6F3IMHLTBAZ5Z...|   302| text/html|
|http://www.stopkl...|DAF3VWTQBW5MK6RTJ...|   302| text/html|
|http://www.stopkl...|VS3LPF4JWKMKNWBXN...|   302| text/html|
|http://www.storm....|T3I4MPQKV3GXCROU6...|   301| text/html|
|http://www.storm....|3HBQ2HGGI7XR7RMRG...|   301| text/html|
|http://www.stpbis...|K4NQKMQW3GW7P7R2B...|   302| text/html|
|http://studio-net...|WSFLSREBXGWJGUEXB...|   301| text/html|
|http://www.studio...|INYVOMXANN57HQRH5...|   302| text/html|
|http://www.studio...|NRHYITSX7WLSR7CQA...|   302| text/html|
|http://www.studio...|7A5NFFVJVFGHAPUAA...|   302| text/html|
|http://www.studio...|YYE5YDOLGIPVFGWX5...|   302| text/html|
|http://www.studio...|24LQY7EILDLMGLJ2Y...|   302| text/html|
|http://www.studio...|MCS4OEONBXOJ65RXZ...|   302| text/html|
|http://

In [16]:
// tab basic2.2
// Number of distinct MIME types among captures with status 200
df.filter("status == '200'").agg(countDistinct("mime")).show()

+--------------------+
|count(DISTINCT mime)|
+--------------------+
|                 102|
+--------------------+



In [17]:
// Captures per MIME type for status 200, most frequent first (up to 120 rows shown).
// The result is displayed once and has no reference for later reuse, so it is not cached.
df.where("status == '200'").groupBy("mime").count().orderBy(desc("count")).show(120)

+--------------------+-------+
|                mime|  count|
+--------------------+-------+
|           text/html|1534252|
|           image/gif| 364786|
|          image/jpeg| 205568|
|          text/plain|  49442|
|             text/pl|  20120|
|     application/zip|   8688|
|                  im|   5944|
|                 unk|   4006|
|application/octet...|   3666|
|application/posts...|   2970|
|  application/x-gzip|   2636|
|     application/pdf|   2592|
|   application/x-tar|   2194|
|         audio/x-wav|   1394|
|audio/x-pn-realaudio|   1048|
|   application/x-dvi|    956|
|  application/msword|    800|
|     image/x-xbitmap|    790|
|          audio/midi|    714|
|         audio/mpeg3|    438|
|   application/x-tex|    256|
|     image/x-xpixmap|    212|
|     video/quicktime|    212|
|         audio/basic|    184|
|            text/css|    162|
|   application/x-tcl|    158|
|     audio/x-mpegurl|    152|
|         audio/x-mp3|    152|
|        audio/x-midi|    132|
|   audi

In [5]:
// for tests - number of versions per originalUrl
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._

// Distinct digest values per URL; cached because the next cells aggregate it twice
val vBu = {
  val distinctDigestsPerUrl = df.groupBy("originalUrl").agg(countDistinct("digest").as("distDigests"))
  distinctDigestsPerUrl.cache()
}


In [6]:
// Mean number of distinct digests per URL
vBu.agg(avg("distDigests").as("avgDigests")).show()


+------------------+
|        avgDigests|
+------------------+
|1.0536360449568554|
+------------------+



In [7]:
// Median number of distinct digests per URL (Spark SQL `percentile` at 0.5).
// The one-row result is shown once and never reused, so it is not cached.
vBu.agg(expr("percentile(distDigests, 0.5)").as("medDigests")).show()


+----------+
|medDigests|
+----------+
|       1.0|
+----------+

