In [1]:
// 1. Setup

// import the ArchiveSpark libraries
import org.archive.archivespark._
import org.archive.archivespark.functions._
import org.archive.archivespark.specific.warc._

// Input locations - generic paths mounted into the container via `docker run -v`
val cdxPath: String = "/data/arc_cdx/*.cdx" // glob matching the CDX index files
val warcPath: String = "/data/warc"         // second argument to WarcSpec.fromFiles

import org.apache.spark.sql.{Row, SparkSession}

// Dedicated Spark session used for all DataFrame work in this notebook
val session = spark.newSession()

// Load every record described by the CDX index / WARC file pair.
// Nothing is brought to the driver here; `r` is the dataset later cells filter and map.
val r = {
  val spec = WarcSpec.fromFiles(cdxPath, warcPath)
  ArchiveSpark.load(spec)
}

In [2]:
// Year under study; matched against the leading characters of each capture timestamp
val researchYear: String = "2000"

In [3]:
// 2. Count ALL captures from the research year
// 2.1 Keep every record whose timestamp starts with the research year
//     (no MIME-type or HTTP-status filtering is applied at this stage)
val t1 = r.filter(_.timestamp.startsWith(researchYear))


In [4]:
// 2.2 Count all captures retained for the research year
t1.count()

15300386

In [5]:
// 3. Build the base DataFrame: one row per capture carrying its original URL,
//    digest, HTTP status and MIME type.

val m = t1.map { rec =>
  (rec.originalUrl, rec.digest, rec.status, rec.mime)
}

// Cached because every later cell queries this DataFrame.
val df = {
  val captures = session.createDataFrame(m)
  captures.toDF("originalUrl", "digest", "status", "mime").cache()
}

In [8]:
// Preview the first rows of the base DataFrame
df.show()

+--------------------+--------------------+------+---------+
|         originalUrl|              digest|status|     mime|
+--------------------+--------------------+------+---------+
|http://www.stopkl...|A7INTAJ2IYZI2YCV7...|   200|       im|
|http://www.stopkl...|A7INTAJ2IYZI2YCV7...|   200|       im|
|http://www.stopkl...|INNKIN2JHJ3YYQPJP...|   200|       im|
|http://www.stopkl...|HYB3QW4IX2T7IHVMI...|   200|       im|
|http://www.stopkl...|HYB3QW4IX2T7IHVMI...|   200|       im|
|http://www.stopkl...|HQOFHLUGKK5EQNGTV...|   200|       im|
|http://www.stopkl...|HQOFHLUGKK5EQNGTV...|   200|       im|
|http://www.stopkl...|Q7QEUAUIS4OU3TKLF...|   200|       im|
|http://www.stopkl...|Q7QEUAUIS4OU3TKLF...|   200|       im|
|http://www.stopkl...|VN3W2CGZUFPX3UJI3...|   200|       im|
|http://www.stopkl...|VN3W2CGZUFPX3UJI3...|   200|       im|
|http://www.stopkl...|G4FTDNNZNJO6GUH7R...|   301|text/html|
|http://www.stopkl...|7PNSIOSDKXETXXE5G...|   200|       im|
|http://www.stopkl...|IF

In [9]:
// 4. Number of distinct original URLs among the captures
val dOu = df.select("originalUrl").distinct().count()

In [10]:
// Display the number of distinct original URLs
dOu

3565614

In [11]:
// 5. Mean and median number of versions per single URL
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._

// One row per URL with the number of digests recorded for it, most-captured URLs first.
// count("digest") counts capture rows, so a digest repeated across captures is counted
// each time (this is not a distinct count). Cached because several cells reuse it.
val mDu = {
  val perUrl = df.groupBy("originalUrl").agg(count("digest").as("digests"))
  perUrl.orderBy(desc("digests")).cache()
}

In [10]:
// Show the URLs with the most captures, without truncating column values
mDu.show(false)

                                                                                +----------------------------------------------------------------------+-------+
|originalUrl                                                           |digests|
+----------------------------------------------------------------------+-------+
|http://www.astro.amu.edu.pl:80/Library/TeX/latex2e.html               |348    |
|http://rabarbar.se.com.pl:80/netware/02workst.html                    |336    |
|http://tichy.ch.uj.edu.pl:80/lists/kola-l/date.html                   |312    |
|http://tichy.ch.uj.edu.pl:80/lists/kola-l/index.html                  |266    |
|http://geoinfo.amu.edu.pl:80/wpk/pro/D1B.HTM                          |262    |
|http://tichy.ch.uj.edu.pl:80/lists/kola-l/author.html                 |238    |
|http://geoinfo.amu.edu.pl:80/wpk/pro/D4.HTM                           |196    |
|http://rabarbar.se.com.pl:80/netware/03serv.html                      |190    |
|http://www.biblos.pk.edu.p

In [12]:
// 5.1 Mean number of digests per URL
mDu.agg(mean("digests")).show()

+-----------------+
|     avg(digests)|
+-----------------+
|4.291094324848399|
+-----------------+



In [13]:
// 5.2 Median number of digests per URL (exact percentile at 0.5).
// The one-row result is displayed once and never referenced again, so it is not cached.
mDu.agg(expr("percentile(digests, 0.5)").as("median")).show()

+------+
|median|
+------+
|   2.0|
+------+



In [14]:
// Table 2.A - captures whose HTTP status is other than 200
df.filter("status != '200'").count()

2538930

In [15]:
// Table 2.A - text/html captures whose HTTP status is other than 200
df.filter("mime == 'text/html'").filter("status != '200'").count()

2428970

In [16]:
// Sanity check: inspect the first 150 captures with a status other than 200
df.filter("status != '200'").show(150)

+--------------------+--------------------+------+---------+
|         originalUrl|              digest|status|     mime|
+--------------------+--------------------+------+---------+
|http://www.stopkl...|G4FTDNNZNJO6GUH7R...|   301|text/html|
|http://www.stopkl...|5VUKB7Z6GOILJARHX...|   301|text/html|
|http://www.stopkl...|KU3WCWBV5ACYZDZZL...|   301|text/html|
|http://www.stopkl...|CNHJDUTLTAJ5VGVHD...|   301|text/html|
|http://www.stopkl...|BG7BJ3N2FCKJXFLRA...|   301|text/html|
|http://www.stopkl...|MEMZFUOSNAOWUB74X...|   301|text/html|
|http://www.stopkl...|RKDCERG3DRYP547FD...|   301|text/html|
|http://www.stopkl...|BCUMF5QN2AWU3HV3O...|   301|text/html|
|http://www.stopkl...|AGFISCLBAHFIIX2GK...|   301|text/html|
|http://www.stopkl...|JV5PDUMMJSZYSTACK...|   301|text/html|
|http://www.stopkl...|TCUJUC2WNR3KY44VZ...|   301|text/html|
|http://www.stopkl...|I2HNYR7UKCVSI2JDD...|   301|text/html|
|http://www.stopkl...|PB4DVYXVJG2KI7374...|   301|text/html|
|http://www.stopkl...|PB

In [17]:
// Table basic 2.2
// Number of distinct MIME types among HTTP 200 captures
df.filter("status == '200'").agg(countDistinct("mime")).show()

+--------------------+
|count(DISTINCT mime)|
+--------------------+
|                 108|
+--------------------+



In [18]:
// Capture count per MIME type among HTTP 200 captures, most frequent first (up to 120 rows).
// The result is displayed once and never referenced again, so it is not cached.
df.where("status == '200'").groupBy("mime").count().orderBy(desc("count")).show(120)

+--------------------+-------+
|                mime|  count|
+--------------------+-------+
|           text/html|8201580|
|                  im|3611894|
|           alexa/dat| 324962|
|           image/gif| 178262|
|          image/jpeg| 157048|
|             text/pl| 115800|
|                 unk|  90994|
|          text/plain|  65152|
|     application/zip|   2060|
|            text/css|   1466|
|audio/x-pn-realaudio|   1406|
|application/posts...|   1212|
|application/octet...|   1048|
|application/x-jav...|    936|
|             unknown|    802|
|     application/pdf|    734|
|           image/png|    694|
|application/x-zip...|    674|
|          audio/midi|    644|
|          audio/mpeg|    566|
|              text/x|    390|
|  application/msword|    300|
|         audio/basic|    290|
|            text/rtf|    216|
|          video/mpeg|    174|
|     video/quicktime|    160|
|         audio/x-wav|    154|
|   application/x-tar|    136|
|             video/x|    130|
|applica

In [6]:
// For tests - number of distinct versions (digests) per originalUrl
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.functions._

// One row per URL with the number of distinct digests seen for it.
// Cached because the following cells aggregate it more than once.
val vBu = {
  val perUrl = df.groupBy("originalUrl")
  perUrl.agg(countDistinct("digest").as("distDigests")).cache()
}


In [7]:
// Mean number of distinct digests per URL
vBu.agg(mean("distDigests").as("avgDigests")).show()


+------------------+
|        avgDigests|
+------------------+
|1.3603255989010588|
+------------------+



In [8]:
// Median number of distinct digests per URL (exact percentile at 0.5).
// The one-row result is displayed once and never referenced again, so it is not cached.
vBu.agg(expr("percentile(distDigests, 0.5)").as("medDigests")).show()

                                                                                +----------+
|medDigests|
+----------+
|       1.0|
+----------+

