In [1]:
#r "nuget:Microsoft.Spark,0.12.1"

Installed package Microsoft.Spark version 0.12.1

##  Import Spark packages

In [2]:
open Microsoft.Spark
open Microsoft.Spark.Sql

## Define data path

In [3]:
let DATA_DIR = "/datadrive/Data/Kaggle/arxiv-metadata-oai-snapshot-2020-08-14.json"

## Initialize spark session

In [4]:
let sparkSession = 
    SparkSession
        .Builder()
        .AppName("arxiv-analytics")
        .GetOrCreate()

[2020-10-05T14:19:18.0495643Z] [fsharp-analytics-vm] [Info] [ConfigurationService] 'DOTNETBACKEND_PORT' environment variable is not set.
[2020-10-05T14:19:18.0530161Z] [fsharp-analytics-vm] [Info] [ConfigurationService] Using port 5567 for connection.
[2020-10-05T14:19:18.0536339Z] [fsharp-analytics-vm] [Info] [JvmBridge] JvMBridge port is 5567


## Load data

In [7]:
let arxivData = 
    sparkSession
        .Read()
        .Option("inferSchema",true)
        .Json([|DATA_DIR|])

##  Display column names

In [8]:
arxivData.Columns()

index,value
0,abstract
1,authors
2,authors_parsed
3,categories
4,comments
5,doi
6,id
7,journal-ref
8,license
9,report-no


## Display DataFrame schema

In [9]:
arxivData.PrintSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- version: string (nullable = true)



## Count number of articles

In [16]:
let articleCount = 
    arxivData
        .Select(
            Functions.Count(Functions.Col("id")).Alias("articles"),
            Functions.CountDistinct(Functions.Col("id")).Alias("distinct_articles"))

In [17]:
articleCount.Show()

+--------+-----------------+
|articles|distinct_articles|
+--------+-----------------+
| 1747307|          1747307|
+--------+-----------------+



## Get the category count in descending order

In [41]:
let categories = 
    arxivData
        .Select(Functions.Col("id"),Functions.Col("categories").Alias("categories"))
        .GroupBy(Functions.Col("id"), Functions.Col("categories"))
        .Count()
        .OrderBy(Functions.Col("count").Desc())

In [42]:
categories.Show()

+---------+--------------------+-----+
|       id|          categories|count|
+---------+--------------------+-----+
|0704.0042|physics.gen-ph qu...|    1|
|0704.0140|              hep-th|    1|
|0704.0176|cond-mat.other co...|    1|
|0704.0560|physics.atom-ph p...|    1|
|0704.0585|cond-mat.mtrl-sci...|    1|
|0704.0599|cond-mat.supr-con...|    1|
|0704.0742|cond-mat.dis-nn c...|    1|
|0704.0933|     cond-mat.str-el|    1|
|0704.1127|            astro-ph|    1|
|0704.1567|               gr-qc|    1|
|0704.1811|   q-bio.QM q-bio.PE|    1|
|0704.1930|               gr-qc|    1|
|0704.2173|physics.optics ph...|    1|
|0704.2230|              hep-th|    1|
|0704.2324|      physics.soc-ph|    1|
|0704.2348|cond-mat.other co...|    1|
|0704.2383|       cs.IT math.IT|    1|
|0704.2481|     math.HO math.CO|    1|
|0704.2870|             math.AG|    1|
|0704.3444|       cond-mat.soft|    1|
+---------+--------------------+-----+
only showing top 20 rows



### Total number of distinct categories

In [43]:
let totalDistinctCategories = 
    categories
        .Select(Functions.Col("categories"))
        .Distinct()
        .Count()

totalDistinctCategories

### Categories more than or equal to 1k articles

In [44]:
let categoriesGtEq1k = 
    categories
        .Filter(Functions.Col("count").Gt(999))

In [None]:
categoriesGtEq1k.Show(10)

In [45]:
let totalCategoriesGtEq1k = 
    categoriesGtEq1k
        .Agg(Functions.CountDistinct(Functions.Col("categories")))

In [None]:
totalCategoriesGtEq1k.Show()

### Categories with less than 1k articles

In [46]:
let categoriesLessThan1k = 
    categories
        .Filter(Functions.Col("count").Lt(1000))

In [37]:
categoriesLessThan1k.Show(10)

+--------------------+-----+
|          categories|count|
+--------------------+-----+
|hep-ph hep-ex nuc...|  997|
|      physics.bio-ph|  993|
|     math.PR math.CO|  990|
|     math.AC math.AG|  982|
|physics.app-ph co...|  974|
|   astro-ph.HE gr-qc|  960|
|hep-th astro-ph.C...|  950|
|            q-bio.QM|  950|
|     math.AP math.DG|  950|
|      hep-lat hep-th|  945|
+--------------------+-----+
only showing top 10 rows



### Total distinct categories with less than 1k articles

In [47]:
let totalCategoriesLessThan1k = 
    categoriesLessThan1k
        .Agg(Functions.CountDistinct(Functions.Col("categories")))

In [None]:
totalCategoriesLessThan1k.Show()

In [51]:
let flattenedCategories = 
    categories
        .Select(Functions.Col("id"), Functions.Split(Functions.Col("categories")," ").Alias("multiplecategories"))
        .Select(Functions.Col("id"), Functions.Explode(Functions.Col("multiplecategories")).Alias("category"))
        .GroupBy(Functions.Col("id"),Functions.Col("category"))
        .Count()
        .OrderBy(Functions.Col("count").Desc())

In [52]:
flattenedCategories.Show(10)

+---------+---------------+-----+
|       id|       category|count|
+---------+---------------+-----+
|0704.0140|         hep-th|    1|
|0704.0933|cond-mat.str-el|    1|
|0704.1127|       astro-ph|    1|
|0704.1567|          gr-qc|    1|
|0704.1930|          gr-qc|    1|
|0704.2230|         hep-th|    1|
|0704.2324| physics.soc-ph|    1|
|0704.2870|        math.AG|    1|
|0704.3444|  cond-mat.soft|    1|
|0704.3968|         hep-ph|    1|
+---------+---------------+-----+
only showing top 10 rows



In [53]:
flattenedCategories
    .Select(Functions.Col("category"))
    .Distinct()
    .Count()

In [55]:
let flattenedCategoryCounts =
    flattenedCategories
        .Select(Functions.Col("category"))
        .GroupBy(Functions.Col("category"))
        .Count()
        .OrderBy(Functions.Col("count").Desc())

In [56]:
flattenedCategoryCounts.Show()

+------------------+------+
|          category| count|
+------------------+------+
|            hep-ph|151526|
|            hep-th|138496|
|          quant-ph|105728|
|          astro-ph|105380|
|             gr-qc| 83270|
| cond-mat.mes-hall| 69440|
| cond-mat.mtrl-sci| 65658|
|             cs.LG| 62754|
|           math.MP| 62417|
|           math-ph| 62417|
|cond-mat.stat-mech| 59239|
|   cond-mat.str-el| 57498|
|       astro-ph.CO| 51733|
|           nucl-th| 47746|
|           math.CO| 46221|
|             cs.CV| 44813|
|       astro-ph.SR| 43806|
|           stat.ML| 43711|
|       astro-ph.GA| 42638|
|           math.AP| 42124|
+------------------+------+
only showing top 20 rows



In [60]:
flattenedCategories
    .Agg(Functions.CountDistinct(Functions.Col("id")))
    .Show()

+------------------+
|count(DISTINCT id)|
+------------------+
|           1747307|
+------------------+



In [64]:
let flattenedDistinctCategories = 
    flattenedCategories
        .DropDuplicates("id")

In [65]:
flattenedDistinctCategories.Agg(Functions.Count(Functions.Col("id"))).Show()

+---------+
|count(id)|
+---------+
|  1747307|
+---------+



### Convert categories DataFrame to IEnumerable

In [66]:
let categoriesIEnumerable = flattenedDistinctCategories.Collect()

In [67]:
open System

### Get category name and count columns

In [68]:
let categoryName = 
    flattenedDistinctCategories
        .Select(Functions.Col("category"))
        .Collect()
        |> Seq.map(fun row -> row.[0].ToString())

In [69]:
let categoryCount = 
    flattenedDistinctCategories
        .Select(Functions.Col("count"))
        .Collect()
        |> Seq.map(fun row -> row.[0].ToString() |> Int32.Parse)

### Plot categories

In [70]:
#r "nuget:XPlot.Plotly"

Installed package XPlot.Plotly version 3.0.1

In [71]:
open XPlot
open XPlot.Plotly

In [72]:
let trace =
    Bar(
        x = categoryName,
        y = categoryCount,
        name= "ArXiv Categories"            
    )

In [73]:
let layout = Layout(title = "Basic Bar Chart")

In [None]:
[trace]
|> Chart.Plot
|> Chart.WithLayout layout
|> Chart.WithHeight 500
|> Chart.WithWidth 700

##  Get the total number of  unique categories

In [12]:
let totalCategories = 
    categories
        .Select(Functions.Col("categories"))
        .Distinct()
        .Count()

In [14]:
totalCategories

## Published categories by year

In [42]:
let categoriesByYear = 
    arxivData
        .Select(Functions.Col("categories"),Functions.Year(Functions.Col("update_date")).Alias("year"))
        .GroupBy(Functions.Col("categories"),Functions.Col("year"))
        .Count()
        .OrderBy(Functions.Col("year").Desc(),Functions.Col("categories").Asc())

In [43]:
categoriesByYear.Show(10)

+--------------------+----+-----+
|          categories|year|count|
+--------------------+----+-----+
|            astro-ph|2020|   21|
|astro-ph gr-qc he...|2020|    1|
|         astro-ph.CO|2020|  871|
|astro-ph.CO astro...|2020|    1|
|astro-ph.CO astro...|2020|    1|
|astro-ph.CO astro...|2020|    1|
|astro-ph.CO astro...|2020|  225|
|astro-ph.CO astro...|2020|    4|
|astro-ph.CO astro...|2020|    3|
|astro-ph.CO astro...|2020|   17|
+--------------------+----+-----+
only showing top 10 rows



In [46]:
let totalArticlesPerYear = 
    categoriesByYear
        .GroupBy(Functions.Col("year"))
        .Count()
        .OrderBy(Functions.Col("year").Desc())

In [47]:
totalArticlesPerYear.Show(10)

+----+-----+
|year|count|
+----+-----+
|2020|16932|
|2019|16097|
|2018|12951|
|2017|11501|
|2016|11651|
|2015|13665|
|2014| 8873|
|2013| 8220|
|2012| 7361|
|2011| 6966|
+----+-----+
only showing top 10 rows



In [48]:
totalArticlesPerYear
    .Select(Functions.Col("year"))
    .Distinct()
    .Count()