In [1]:
// 1. Installation

// import libraries
import org.archive.archivespark._
import org.archive.archivespark.functions._
import org.archive.archivespark.specific.warc._

// Input locations, using the generic paths mounted via `docker run -v`:
// a glob over the CDX index files and the location of the WARC files.
val cdxPath: String = "/data/arc_cdx/*.cdx"
val warcPath: String = "/data/warc"

import org.apache.spark.sql.{Row, SparkSession}

// Derive a new session from the notebook-provided `spark` session.
val session = spark.newSession()

// Load all records described by the CDX index and WARC locations.
val r = {
  val spec = WarcSpec.fromFiles(cdxPath, warcPath)
  ArchiveSpark.load(spec)
}

In [2]:
// Year under study, as the four-character string that capture timestamps start with.
val researchYear: String = "1996"

In [3]:
// 2. Count ALL objects from the research year
// 2.1 Keep every capture whose timestamp starts with `researchYear`.
//     No MIME-type or HTTP-status filter is applied at this stage.
val t1 = {
  // Copy the year into a local so the filter closure captures only this string,
  // and name the lambda parameter `rec` so it no longer shadows the outer `r`.
  val yearPrefix = researchYear
  r.filter(rec => rec.timestamp.startsWith(yearPrefix))
}


In [4]:
// 2.2 Count all captures selected into t1 (every record kept by the year filter).
t1.count()

161318

In [4]:
// 3. Build the basic DataFrame: one row per capture in t1 with its
//    original URL, digest, status and MIME type.

val m = t1.map { rec =>
  (rec.originalUrl, rec.digest, rec.status, rec.mime)
}
val df = {
  val columns = Seq("originalUrl", "digest", "status", "mime")
  // Cached because the cells below query this DataFrame repeatedly.
  session.createDataFrame(m).toDF(columns: _*).cache()
}

In [8]:
// Preview the first 20 rows (the default row count of show()).
df.show(20)

+--------------------+--------------------+------+----------+
|         originalUrl|              digest|status|      mime|
+--------------------+--------------------+------+----------+
|http://studio-net...|SEIFFLPZ24OLR4RJP...|   200| text/html|
|http://studio-net...|J7RVGRQDPWWKUZIHO...|   200| image/gif|
|http://studio-net...|CHZGLOERIPVY7PJKF...|   200| image/gif|
|http://studio-net...|4IEAZP7DIEV4BFZA6...|   200| image/gif|
|http://studio-net...|LRWGP7FT2X5LK5MGP...|   200| text/html|
|http://peclet.sun...|JXPE6KEXM43S6O5UO...|   200| text/html|
|http://peclet.sun...|JXPE6KEXM43S6O5UO...|   200| text/html|
|http://peclet.sun...|JXPE6KEXM43S6O5UO...|   200| text/html|
|http://peclet.sun...|FMWX5T2OZBC3ZRCTL...|   200| text/html|
|http://peclet.sun...|QMOAWPTVNA5QOU3DG...|   200|image/jpeg|
|http://peclet.sun...|DJLKGQHMMBHFKHPTZ...|   200|image/jpeg|
|http://peclet.sun...|DJLKGQHMMBHFKHPTZ...|   200|image/jpeg|
|http://peclet.sun...|QMOAWPTVNA5QOU3DG...|   200|image/jpeg|
|http://

In [26]:
// Sanity check: the DataFrame row count should equal t1.count() (161318).
df.count()

161318

In [9]:
// 4. Number of distinct original URLs among the captures
val dOu = df.select("originalUrl").distinct().count()

In [13]:
// Display the number of distinct original URLs held in dOu.
dOu

72229

In [20]:
// 5. Mean and median number of versions per single URL.
//    NOTE(review): count("digest") counts the rows with a non-null digest for each
//    URL, i.e. captures; repeated identical digests are counted each time.
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._

val mDu = {
  val perUrl = df.groupBy("originalUrl").agg(count("digest").as("digests"))
  // Most frequently captured URLs first; cached because several cells reuse it.
  perUrl.orderBy(desc("digests")).cache()
}

In [22]:
// Show the top 20 URLs by count without truncating the URL column.
mDu.show(20, truncate = false)

+-----------------------------------------------------------------------------+-------+
|originalUrl                                                                  |digests|
+-----------------------------------------------------------------------------+-------+
|http://www.apple.com.pl:80/HTML/DT/Technews/20AppleQTConf.html               |12     |
|http://www.apple.com.pl:80/HTML/ApplewInternecie/Wellconnected/wcm.gif       |12     |
|http://www.apple.com.pl:80/HTML/DT/InfoAlley/96.09.04.html                   |12     |
|http://www.apple.com.pl:80/HTML/ApplewPolsce/CIT/szkol.gif                   |12     |
|http://www.apple.com.pl:80/HTML/ApplewPolsce/CIT/opi.gif                     |12     |
|http://www.apple.com.pl:80/HTML/ApplewPolsce/CIT/pi.gif                      |12     |
|http://www.apple.com.pl:80/HTML/DT/InfoAlley/96.06.2520.html                 |12     |
|http://www.apple.com.pl:80/HTML/ApplewInternecie/EverythingMac/welcomeold.gif|12     |
|http://www.apple.com.pl:80/HTML

In [23]:
// 5.1 Mean of the per-URL counts
mDu.agg(avg(col("digests"))).show()

+-----------------+
|     avg(digests)|
+-----------------+
|2.233424247878276|
+-----------------+



In [25]:
// 5.2 Median of the per-URL counts (the 0.5 percentile of `digests`).
// The one-row result is shown once and never reused, so it is not cached.
mDu.agg(expr("percentile(digests, 0.5)").as("median")).show()

+------+
|median|
+------+
|   2.0|
+------+



In [28]:
// Table 2.A - number of captures whose status is not 200
df.filter("status != '200'").count()

126

In [31]:
// Table 2.A - number of text/html captures whose status is not 200
df.filter("mime == 'text/html'").filter("status != '200'").count()

126

In [37]:
// Inspect the captures whose status is not 200 (up to 150 rows).
df.filter("status != '200'").show(150)

+--------------------+--------------------+------+---------+
|         originalUrl|              digest|status|     mime|
+--------------------+--------------------+------+---------+
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://zeus.polsl...|CEVC55YON762RHVFH...|   500|text/html|
|http://www.loiv.t...|A6MYLAQFTIGXAHGA3...|   302|text/html|
|http://www.loiv.t...|4R

In [38]:
// Table basic 2.2
// Number of distinct MIME types among captures with status 200
df.filter("status == '200'").agg(countDistinct(col("mime"))).show()

+--------------------+
|count(DISTINCT mime)|
+--------------------+
|                  42|
+--------------------+



In [42]:
// Capture count per MIME type for status-200 captures, most frequent first
// (up to 100 rows). The result is displayed once and never reused, so it is not cached.
df.where("status == '200'").groupBy("mime").count().orderBy(desc("count")).show(100)

+--------------------+-----+
|                mime|count|
+--------------------+-----+
|           text/html|85006|
|           image/gif|47762|
|          text/plain|16064|
|          image/jpeg|10034|
|application/posts...|  420|
|     image/x-xbitmap|  338|
|                 unk|  268|
|         audio/x-wav|  232|
|     application/pdf|  180|
|     application/zip|  170|
|application/octet...|  152|
|   application/x-dvi|  134|
| application/x-troff|  110|
|   application/x-tex|   72|
|   application/x-tar|   44|
|application/x-msd...|   30|
|audio/x-pn-realaudio|   28|
|          video/mpeg|   28|
| application/x-excel|   16|
|     application/rtf|   16|
|          image/tiff|   14|
|         audio/basic|    8|
|     image/x-xpixmap|    6|
|             unknown|    6|
|     video/quicktime|    6|
|application/mac-b...|    6|
|application/x-tro...|    6|
|    application/x-sh|    4|
|  application/msword|    4|
|     multipart/x-zip|    4|
|        audio/x-aiff|    2|
|            g

In [6]:
// For tests - number of distinct versions (distinct digests) per originalUrl
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._

val vBu = {
  val distinctPerUrl = countDistinct("digest").as("distDigests")
  // Cached because the mean and median cells below both read it.
  df.groupBy("originalUrl").agg(distinctPerUrl).cache()
}

In [7]:
// Mean number of distinct digests per URL
vBu.agg(avg("distDigests").as("avgDigests")).show()

+------------------+
|        avgDigests|
+------------------+
|1.0083346024450013|
+------------------+



In [8]:
// Median number of distinct digests per URL (the 0.5 percentile of `distDigests`).
// The one-row result is shown once and never reused, so it is not cached.
vBu.agg(expr("percentile(distDigests, 0.5)").as("medDigests")).show()

+----------+
|medDigests|
+----------+
|       1.0|
+----------+

