In [1]:
#r "nuget:Microsoft.Spark,0.12.1"

Installed package Microsoft.Spark version 0.12.1

##  Import Spark packages

In [2]:
open Microsoft.Spark
open Microsoft.Spark.Sql

## Define data path

In [3]:
let DATA_DIR = "/datadrive/Data/Kaggle/arxiv-metadata-oai-snapshot-2020-08-14.json"

## Initialize spark session

In [4]:
let sparkSession = 
    SparkSession
        .Builder()
        .AppName("arxiv-analytics")
        .GetOrCreate()

[2020-09-21T14:23:34.3904805Z] [fsharp-analytics-vm] [Info] [ConfigurationService] 'DOTNETBACKEND_PORT' environment variable is not set.
[2020-09-21T14:23:34.3944297Z] [fsharp-analytics-vm] [Info] [ConfigurationService] Using port 5567 for connection.
[2020-09-21T14:23:34.3952651Z] [fsharp-analytics-vm] [Info] [JvmBridge] JvMBridge port is 5567


## Load data

In [5]:
let arxivData = 
    sparkSession
        .Read()
        .Json([|DATA_DIR|])

##  Display column names

In [6]:
arxivData.Columns()

index,value
0,abstract
1,authors
2,authors_parsed
3,categories
4,comments
5,doi
6,id
7,journal-ref
8,license
9,report-no


## Get the category count in descending order

In [7]:
let categories = 
    arxivData
        .Select(Functions.Col("categories").Alias("categories"))
        .GroupBy("categories")
        .Count()
        .OrderBy(Functions.Col("count").Desc())

In [8]:
let categoriesIEnumerable = categories.Collect()

In [24]:
open System

In [65]:
let categoryName = 
    categories
        .Select(Functions.Col("categories"))
        .Collect()
        |> Seq.map(fun row -> row.[0].ToString())

In [66]:
let categoryCount = 
    categories
        .Select(Functions.Col("count"))
        .Collect()
        |> Seq.map(fun row -> row.[0].ToString() |> Int32.Parse)

In [12]:
#r "nuget:XPlot.Plotly"

Installed package XPlot.Plotly version 3.0.1

In [13]:
open XPlot
open XPlot.Plotly

In [70]:
let trace =
    Bar(
        x = categoryName,
        y = categoryCount,
        name= "ArXiv Categories"            
    )

In [71]:
let layout = Layout(title = "Basic Bar Chart")

In [73]:
[trace]
|> Chart.Plot
|> Chart.WithLayout layout
|> Chart.WithHeight 500
|> Chart.WithWidth 700

In [9]:
categories.Show(10)

+-----------------+-----+
|       categories|count|
+-----------------+-----+
|         astro-ph|86914|
|           hep-ph|73082|
|         quant-ph|52956|
|           hep-th|52888|
|cond-mat.mes-hall|29312|
|cond-mat.mtrl-sci|29278|
|            gr-qc|25221|
|            cs.CV|23986|
|          math.AP|23484|
|      astro-ph.SR|22578|
+-----------------+-----+
only showing top 10 rows



##  Get the total number of  unique categories

In [12]:
let totalCategories = 
    categories
        .Select(Functions.Col("categories"))
        .Distinct()
        .Count()

In [14]:
totalCategories

## Published categories by year

In [42]:
let categoriesByYear = 
    arxivData
        .Select(Functions.Col("categories"),Functions.Year(Functions.Col("update_date")).Alias("year"))
        .GroupBy(Functions.Col("categories"),Functions.Col("year"))
        .Count()
        .OrderBy(Functions.Col("year").Desc(),Functions.Col("categories").Asc())

In [43]:
categoriesByYear.Show(10)

+--------------------+----+-----+
|          categories|year|count|
+--------------------+----+-----+
|            astro-ph|2020|   21|
|astro-ph gr-qc he...|2020|    1|
|         astro-ph.CO|2020|  871|
|astro-ph.CO astro...|2020|    1|
|astro-ph.CO astro...|2020|    1|
|astro-ph.CO astro...|2020|    1|
|astro-ph.CO astro...|2020|  225|
|astro-ph.CO astro...|2020|    4|
|astro-ph.CO astro...|2020|    3|
|astro-ph.CO astro...|2020|   17|
+--------------------+----+-----+
only showing top 10 rows



In [46]:
let totalArticlesPerYear = 
    categoriesByYear
        .GroupBy(Functions.Col("year"))
        .Count()
        .OrderBy(Functions.Col("year").Desc())

In [47]:
totalArticlesPerYear.Show(10)

+----+-----+
|year|count|
+----+-----+
|2020|16932|
|2019|16097|
|2018|12951|
|2017|11501|
|2016|11651|
|2015|13665|
|2014| 8873|
|2013| 8220|
|2012| 7361|
|2011| 6966|
+----+-----+
only showing top 10 rows



In [48]:
totalArticlesPerYear
    .Select(Functions.Col("year"))
    .Distinct()
    .Count()