In [1]:
// 1. Installation

// import the ArchiveSpark libraries (nothing is installed here)
import org.archive.archivespark._
import org.archive.archivespark.functions._
import org.archive.archivespark.specific.warc._

// data files - generic paths, as mounted into the container with `docker run -v`
// cdxPath is a glob matching every *.cdx file under /data/arc_cdx
val cdxPath = "/data/arc_cdx/*.cdx"
// NOTE(review): presumably the directory holding the (W)ARC files the CDX entries point to - confirm
val warcPath = "/data/warc"

import org.apache.spark.sql.{Row, SparkSession}

// Separate Spark session used to build the DataFrames in this notebook.
// NOTE(review): `spark` is not defined in this notebook - presumably supplied by the kernel.
val session = spark.newSession

// Load the records described by the CDX files together with the WARC files.
// NOTE(review): presumably this only defines the dataset and nothing is read
// until an action such as count() runs - confirm against ArchiveSpark.
val r = ArchiveSpark.load(
  WarcSpec.fromFiles(cdxPath, warcPath)
)

In [2]:
// Year under study; used below as a prefix match on each record's timestamp.
val researchYear = "1997"

In [3]:
// 2. Count ALL objects from the research year
// 2.1 keep only the records whose timestamp starts with researchYear
//     (no MIME-type or HTTP-status filter is applied in this step)
val t1 = r.filter(_.timestamp.startsWith(researchYear))


In [4]:
// 2.2 count all captures that passed the research-year filter (t1)
t1.count()

2781718

In [4]:
// 3. Generate the basic DataFrame: one row per capture, holding the four
//    fields that the queries below work on.

val m = t1.map(rec => (rec.originalUrl, rec.digest, rec.status, rec.mime))
val columns = Seq("originalUrl", "digest", "status", "mime")
// Cached because the same DataFrame is queried repeatedly by the following cells.
val df = session.createDataFrame(m).toDF(columns: _*).cache()

In [6]:
// Preview the DataFrame; show() without arguments prints the first 20 rows and truncates long values.
df.show()

+--------------------+--------------------+------+---------+
|         originalUrl|              digest|status|     mime|
+--------------------+--------------------+------+---------+
|http://www.studio...|5X7EV33KXBGQUF7XW...|   200|text/html|
|http://studio-net...|TVN2SQJDU3XFTES5E...|   200|text/html|
|http://www.studio...|L2M6VE6737CGB65YW...|   200|text/html|
|http://www.studio...|VLPJ6Q7ZMVDEVXXF3...|   200|text/html|
|http://www.studio...|J4R7IMDXCMXRNTPXJ...|   200|text/html|
|http://www.studio...|MBYKBKK36SHB7TWQO...|   200|text/html|
|http://www.studio...|CVZV3EDBNJLXNY5S7...|   200|text/html|
|http://www.studio...|WI6PMP3M5L2VQORHT...|   200|text/html|
|http://www.studio...|NRGJMHR7XLW2IOL5Q...|   200|text/html|
|http://www.studio...|7S6HCTHOWNN52U5LP...|   200|text/html|
|http://www.studio...|ZZX3XDRVLL2UQLSEK...|   200|text/html|
|http://www.studio...|KGG5MPKE6AG5IG3IY...|   200|text/html|
|http://www.studio...|NYC6TZ3X7U5MEUUNA...|   200|text/html|
|http://www.studio...|YY

In [7]:
// Sanity check: the DataFrame row count should equal t1.count() (2781718 - OK)
df.count()

2781718

In [8]:
// 4. Count the distinct originalUrl values
val dOu = df.select("originalUrl").distinct().count()

In [9]:
// Display the number of distinct originalUrl values.
dOu

1087617

In [10]:
// 5. Mean and median of the number of versions per single URL
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._

// Per-URL number of rows with a non-null digest, largest first.
// count("digest") counts every capture, so repeated captures with an identical
// digest are included in "digests".
val mDu = df.groupBy("originalUrl").agg(
  count("digest").as("digests")
).sort(col("digests").desc).cache()

In [11]:
// Print the top per-URL counts without truncating the column values (truncate = false).
mDu.show(false)

+----------------------------------------------------+-------+
|originalUrl                                         |digests|
+----------------------------------------------------+-------+
|http://www.mat.uni.torun.pl:80/res/mpi/up.xbm       |42     |
|http://www.chem.uni.wroc.pl:80/back.gif             |42     |
|http://www.mat.uni.torun.pl:80/res/mpi/previous.xbm |42     |
|http://www.mat.uni.torun.pl:80/res/mpi/next.xbm     |42     |
|http://sol.put.poznan.pl:8000/dz96/owa/Dz96.MainMenu|36     |
|http://www.chem.uni.wroc.pl:80/inforeng.htm         |32     |
|http://www.chem.uni.wroc.pl:80/info.gif             |32     |
|http://www.chem.uni.wroc.pl:80/science.gif          |32     |
|http://www.chem.uni.wroc.pl:80/indexpol.htm         |30     |
|http://www.chem.uni.wroc.pl:80/index.htm            |30     |
|http://www.chem.uni.wroc.pl:80/sobczyk.htm          |28     |
|http://www.chem.uni.wroc.pl:80/ziolkow.htm          |28     |
|http://www.chem.uni.wroc.pl:80/rudolf.htm           |2

In [12]:
// 5.1 mean of the per-URL "digests" counts
mDu.select(avg("digests")).show()

+------------------+
|      avg(digests)|
+------------------+
|2.5576264438676484|
+------------------+



In [14]:
// 5.2 median of the per-URL "digests" counts (50th percentile)
// The one-row result is only printed once and never bound to a name, so it is
// not cached: a cached anonymous DataFrame could not be unpersisted afterwards.
mDu.agg(expr("percentile(digests, 0.5)").as("median")).show()

+------+
|median|
+------+
|   2.0|
+------+



In [15]:
// tab 2. A - number of captures whose status is other than 200
df.filter("status != '200'").count()

0

In [16]:
// tab 2. A - number of text/html captures whose status is other than 200
df.filter("mime == 'text/html'").filter("status != '200'").count()

0

In [17]:
// Test: list up to 150 rows whose status is other than 200
df.filter("status != '200'").show(150)

+-----------+------+------+----+
|originalUrl|digest|status|mime|
+-----------+------+------+----+
+-----------+------+------+----+



In [18]:
// tab basic2.2
// number of distinct MIME types among the status-200 captures
df.filter("status == '200'").select(countDistinct("mime")).show()

+--------------------+
|count(DISTINCT mime)|
+--------------------+
|                 111|
+--------------------+



In [20]:
// Number of status-200 captures per MIME type, most frequent first (up to 120 rows printed).
// The result is printed once and never bound to a name, so it is not cached:
// a cached anonymous DataFrame could not be unpersisted afterwards.
df.where("status == '200'").groupBy("mime").count().orderBy(desc("count")).show(120)

+--------------------+-------+
|                mime|  count|
+--------------------+-------+
|           text/html|1697478|
|           image/gif| 629700|
|          image/jpeg| 291938|
|          text/plain|  65666|
|     application/zip|  20426|
|     application/pdf|  17040|
|application/posts...|  11130|
|application/octet...|  10068|
|        audio/x-midi|   6820|
|         audio/x-wav|   4704|
|   application/x-tar|   3904|
|         audio/x-mid|   3574|
|     image/x-xbitmap|   2790|
|  application/x-gzip|   2008|
|          audio/midi|   1838|
|   application/x-dvi|   1532|
|audio/x-pn-realaudio|   1162|
|   application/x-tex|   1094|
|     image/x-xpixmap|    988|
|application/mac-b...|    954|
|         audio/basic|    842|
|         audio/x-mod|    676|
|  application/msword|    584|
| application/x-troff|    558|
|application/x-tar...|    450|
|     multipart/x-zip|    344|
|     video/quicktime|    330|
|          video/mpeg|    318|
|         audio/x-mp3|    262|
|       

In [5]:
// For tests - number of distinct versions (distinct digests) per originalUrl
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._

// Cached because both the mean and the median are computed from it.
val vBu = df.groupBy("originalUrl").agg(
  countDistinct("digest").as("distDigests")
).cache()


In [6]:
// mean number of distinct digests per originalUrl
vBu.agg(avg("distDigests").as("avgDigests")).show()


+------------------+
|        avgDigests|
+------------------+
|1.0393419742427712|
+------------------+



In [7]:
// Median number of distinct digests per originalUrl (50th percentile).
// The one-row result is only printed once and never bound to a name, so it is
// not cached: a cached anonymous DataFrame could not be unpersisted afterwards.
vBu.agg(expr("percentile(distDigests, 0.5)").as("medDigests")).show()

                                                                                +----------+
|medDigests|
+----------+
|       1.0|
+----------+

