In [1]:
// 1. Setup

// import the ArchiveSpark libraries
import org.archive.archivespark._
import org.archive.archivespark.functions._
import org.archive.archivespark.specific.warc._

// Input locations: generic paths as mounted into the container with `docker run -v`.
val cdxPath = "/data/arc_cdx/*.cdx"
val warcPath = "/data/warc"

import org.apache.spark.sql.{Row, SparkSession}

// Separate session derived from the notebook-provided `spark`.
val session = spark.newSession()

// Load all records described by the CDX files and the WARC path.
val r = {
  val warcSpec = WarcSpec.fromFiles(cdxPath, warcPath)
  ArchiveSpark.load(warcSpec)
}

In [2]:
// Year under study; used below as a prefix match on each record's timestamp.
val researchYear: String = "1999"

In [3]:
// 2. Count ALL objects from the research year
// 2.1 keep every capture whose timestamp starts with the research year
//     (no MIME-type or HTTP-status filtering is applied at this stage)
val t1 = r.filter(record => record.timestamp.startsWith(researchYear))


In [4]:
// 2.2 count all captures selected into t1 (i.e. those from the research year)
t1.count()

4578548

In [4]:
// 3. Build the base DataFrame: one row per capture holding its original URL,
//    digest, status and MIME type. The result is cached.
val m = t1.map(capture => (capture.originalUrl, capture.digest, capture.status, capture.mime))
val df = {
  val columnNames = Seq("originalUrl", "digest", "status", "mime")
  session.createDataFrame(m).toDF(columnNames: _*).cache()
}

In [6]:
// preview the first rows of the base DataFrame
df.show()

+--------------------+--------------------+------+----+
|         originalUrl|              digest|status|mime|
+--------------------+--------------------+------+----+
|http://www.stopkl...|PMUYSQN6O7BISA2A4...|   200|  im|
|http://www.stopkl...|A7INTAJ2IYZI2YCV7...|   200|  im|
|http://www.stopkl...|A7INTAJ2IYZI2YCV7...|   200|  im|
|http://www.stopkl...|7H3SNEXZ5GTSJXCBL...|   200|  im|
|http://www.stopkl...|VHX3XTMT47JUSZA3I...|   200|  im|
|http://www.stopkl...|VHX3XTMT47JUSZA3I...|   200|  im|
|http://www.stopkl...|73XLK7LCKIRK24HK7...|   200|  im|
|http://www.stopkl...|73XLK7LCKIRK24HK7...|   200|  im|
|http://www.stopkl...|3R63VZAJO22ILAFRV...|   200|  im|
|http://www.stopkl...|W2X7RI5SYM7MFKUO7...|   200|  im|
|http://www.stopkl...|EICU253YO7TPFAGTQ...|   200|  im|
|http://www.stopkl...|INNKIN2JHJ3YYQPJP...|   200|  im|
|http://www.stopkl...|INNKIN2JHJ3YYQPJP...|   200|  im|
|http://www.stopkl...|HYB3QW4IX2T7IHVMI...|   200|  im|
|http://www.stopkl...|HYB3QW4IX2T7IHVMI...|   20

In [7]:
// 4. Number of distinct original URLs among the captures
val dOu = df.select("originalUrl").distinct().count()

In [8]:
// display the distinct-URL count
dOu

1454305

In [9]:
// 5. Mean and median number of versions per single URL
//    Note: count("digest") counts every non-null digest of a URL, so a digest
//    that occurs several times for the same URL is counted each time.
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._

val mDu = {
  val digestsPerUrl = df.groupBy("originalUrl").agg(count("digest").as("digests"))
  digestsPerUrl.orderBy(desc("digests")).cache()
}

In [10]:
// print the per-URL counts without truncating the cell contents
mDu.show(truncate = false)

                                                                                +----------------------------------------------------------------------+-------+
|originalUrl                                                           |digests|
+----------------------------------------------------------------------+-------+
|http://www.astro.amu.edu.pl:80/Library/TeX/latex2e.html               |348    |
|http://rabarbar.se.com.pl:80/netware/02workst.html                    |336    |
|http://tichy.ch.uj.edu.pl:80/lists/kola-l/date.html                   |312    |
|http://tichy.ch.uj.edu.pl:80/lists/kola-l/index.html                  |266    |
|http://geoinfo.amu.edu.pl:80/wpk/pro/D1B.HTM                          |262    |
|http://tichy.ch.uj.edu.pl:80/lists/kola-l/author.html                 |238    |
|http://geoinfo.amu.edu.pl:80/wpk/pro/D4.HTM                           |196    |
|http://rabarbar.se.com.pl:80/netware/03serv.html                      |190    |
|http://www.biblos.pk.edu.p

In [11]:
// 5.1 mean of the per-URL digest counts
mDu.agg(avg(col("digests"))).show()

                                                                                +------------------+
|      avg(digests)|
+------------------+
|3.1482721987478555|
+------------------+



In [12]:
// 5.2 median of the per-URL digest counts (50th percentile via Spark SQL `percentile`)
// The one-row result is only printed and is not bound to a name, and mDu is
// already cached, so the aggregate itself is not cached.
mDu.agg(expr("percentile(digests, 0.5)").as("median")).show()

+------+
|median|
+------+
|   2.0|
+------+



In [13]:
// tab 2. A - captures with a status other than 200
df.filter("status != '200'").count()

882080

In [14]:
// tab 2. A - text/html captures with a status other than 200
df.filter("mime == 'text/html'").filter("status != '200'").count()

877678

In [15]:
// sanity check: list up to 150 captures with a status other than 200
df.filter("status != '200'").show(numRows = 150)

+--------------------+--------------------+------+---------+
|         originalUrl|              digest|status|     mime|
+--------------------+--------------------+------+---------+
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GVN3W4BRKV5HTNYTR...|   404|text/html|
|http://www.stopkl...|GV

In [16]:
// tab basic2.2
// number of distinct MIME types among captures with status 200
df.filter("status == '200'").agg(countDistinct("mime")).show()

+--------------------+
|count(DISTINCT mime)|
+--------------------+
|                  28|
+--------------------+



In [17]:
// number of status-200 captures per MIME type, most frequent first (up to 120 rows)
// The grouped result is only printed and is not bound to a name, and df is
// already cached, so the result itself is not cached.
df.where("status == '200'").groupBy("mime").count().orderBy(desc("count")).show(120)

+---------------+-------+
|           mime|  count|
+---------------+-------+
|      text/html|3199434|
|        text/pl| 207710|
|             im| 196910|
|            unk|  88550|
|     text/plain|   2678|
|         text/x|    344|
|       text/css|    332|
|              x|    216|
|       text/rtf|     48|
|          text/|     46|
|         multip|     42|
|         text/c|     34|
|              m|     22|
|     model/vrml|     16|
|     video/mpeg|     16|
|        video/x|     14|
|        unknown|     12|
|        test/pl|     10|
|              -|      6|
|     httpd/send|      4|
|video/quicktime|      4|
|           mess|      4|
|            tpl|      4|
|    www/unknown|      4|
|       text/xml|      2|
|        text/pt|      2|
|         text/j|      2|
|         text/t|      2|
+---------------+-------+



In [5]:
// for tests - number of distinct versions (digests) per originalUrl
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._
val vBu = {
  val byUrl = df.groupBy("originalUrl")
  byUrl.agg(countDistinct("digest").as("distDigests")).cache()
}

In [6]:
// mean number of distinct digests per URL
vBu.agg(avg("distDigests").as("avgDigests")).show()


+------------------+
|        avgDigests|
+------------------+
|1.1653408329064399|
+------------------+



In [7]:
// median number of distinct digests per URL (50th percentile via Spark SQL `percentile`)
// The one-row result is only printed and is not bound to a name, and vBu is
// already cached, so the aggregate itself is not cached.
vBu.agg(expr("percentile(distDigests, 0.5)").as("medDigests")).show()


                                                                                +----------+
|medDigests|
+----------+
|       1.0|
+----------+

