## Install Microsoft.Spark NuGet package

In [1]:
#r "nuget:Microsoft.Spark,0.12.1"

Installed package Microsoft.Spark version 0.12.1

## Import packages

In [2]:
open Microsoft.Spark
open Microsoft.Spark.Sql

## Define data directory

In [3]:
// Path of the arXiv metadata snapshot (JSON) that the notebook loads below.
let DATA_DIR : string =
    "/datadrive/Data/Kaggle/arxiv-metadata-oai-snapshot-2020-08-14.json"

## Initialize SparkSession

In [4]:
// Spark session used by every cell below, built with the application name
// "arxiv-analytics" and obtained through GetOrCreate().
let sparkSession =
    let builder = SparkSession.Builder().AppName("arxiv-analytics")
    builder.GetOrCreate()

[2020-10-19T14:23:16.5259043Z] [fsharp-analytics-vm] [Info] [ConfigurationService] 'DOTNETBACKEND_PORT' environment variable is not set.
[2020-10-19T14:23:16.5302790Z] [fsharp-analytics-vm] [Info] [ConfigurationService] Using port 5567 for connection.
[2020-10-19T14:23:16.5309461Z] [fsharp-analytics-vm] [Info] [JvmBridge] JvMBridge port is 5567


## Load data into DataFrame

In [5]:
// Reads the JSON file at DATA_DIR into a DataFrame.
// NOTE(review): the "inferSchema" option is passed through unchanged; it may
// have no effect for the JSON source — confirm against the Spark reader docs.
let arxivData =
    let reader = sparkSession.Read().Option("inferSchema", true)
    reader.Json([| DATA_DIR |])

## Get article categories in dataset

In [6]:
// One row per distinct (id, categories) pair together with how often that
// pair occurs, sorted so the largest counts come first.
let categories =
    let idCol = Functions.Col("id")
    let categoriesCol = Functions.Col("categories")
    let countDescending = Functions.Col("count").Desc()
    arxivData
        .Select(idCol, categoriesCol.Alias("categories"))
        .GroupBy(idCol, categoriesCol)
        .Count()
        .OrderBy(countDescending)

## Flatten articles containing multiple categories

In [25]:
// Splits the "categories" string on single spaces and explodes the result so
// each (id, category) pair gets its own row; the pairs are then counted and
// sorted by descending count.
let flattenedCategories =
    let idCol = Functions.Col("id")
    let categoryList =
        Functions.Split(Functions.Col("categories"), " ").Alias("multiplecategories")
    let singleCategory =
        Functions.Explode(Functions.Col("multiplecategories")).Alias("category")
    categories
        .Select(idCol, categoryList)
        .Select(idCol, singleCategory)
        .GroupBy(idCol, Functions.Col("category"))
        .Count()
        .OrderBy(Functions.Col("count").Desc())

## Drop duplicate articles

In [26]:
// Reduces the flattened frame to a single row per article id.
// NOTE(review): for an article listed under several categories, which of its
// category rows is retained is presumably decided by Spark — confirm that
// this is the intended attribution before relying on the per-category totals.
let flattenedDistinctCategories = flattenedCategories.DropDuplicates("id")

## Aggregate distinct article categories

In [27]:
// Number of rows per category in the de-duplicated frame, largest first.
let flattenedCategoryCounts =
    let perArticle =
        flattenedDistinctCategories.Select(Functions.Col("id"), Functions.Col("category"))
    perArticle
        .GroupBy(Functions.Col("category"))
        .Count()
        .OrderBy(Functions.Col("count").Desc())

In [29]:
// Print the first five rows of the per-category counts (the frame was built
// with a descending sort on "count").
flattenedCategoryCounts.Show(5)

+--------+------+
|category| count|
+--------+------+
|  hep-ph|105482|
|astro-ph| 94315|
|  hep-th| 85276|
|quant-ph| 73608|
|   gr-qc| 48521|
+--------+------+
only showing top 5 rows



In [30]:
// Print how many distinct values the "category" column holds.
do
    let distinctCategories = Functions.CountDistinct(Functions.Col("category"))
    flattenedCategoryCounts.Agg(distinctCategories).Show()

+------------------------+
|count(DISTINCT category)|
+------------------------+
|                     176|
+------------------------+



In [31]:
// Print the sum of the "count" column across all categories.
do
    let totalCount = Functions.Sum(Functions.Col("count"))
    flattenedCategoryCounts.Agg(totalCount).Show()

+----------+
|sum(count)|
+----------+
|   1747307|
+----------+



In [33]:
// Print the summary table produced by Describe() for the per-category counts.
flattenedCategoryCounts.Describe().Show()

+-------+--------+------------------+
|summary|category|             count|
+-------+--------+------------------+
|  count|     176|               176|
|   mean|    null| 9927.880681818182|
| stddev|    null|15881.645753239685|
|    min|acc-phys|                 7|
|    max|supr-con|            105482|
+-------+--------+------------------+



In [35]:
// First 20 rows of flattenedCategoryCounts, which is sorted by descending count.
// NOTE(review): none of the later cells visible in this notebook reference
// this value; the chart is built from the full category list.
let top20Categories = flattenedCategoryCounts.Limit(20)

In [41]:
// Rows of flattenedCategoryCounts as returned by Collect().
let categoriesIEnumerable =
    flattenedCategoryCounts
        .Collect()

In [42]:
// Category labels, in the same descending-count order as
// flattenedCategoryCounts, as plain strings for the chart's x axis.
//
// The result is materialised into an array. Seq.map alone is lazy, so the
// previous definition re-evaluated the collected pipeline every time the
// sequence was enumerated (for example on each chart render).
// NOTE(review): if Collect() is itself lazily evaluated, each of those
// enumerations would also have re-run the Spark job — confirm against the
// Microsoft.Spark implementation.
// An array is still usable wherever a seq<string> is expected.
let categoryName =
    flattenedCategoryCounts
        .Select(Functions.Col("category"))
        .Collect()
    |> Seq.map (fun row -> row.[0].ToString())
    |> Seq.toArray

In [49]:
// Per-category counts as ints, in the same descending-count order as
// flattenedCategoryCounts, for the chart's y axis.
//
// Changes from the previous definition:
//  * The value is converted with System.Convert.ToInt32 instead of being
//    formatted to a string and parsed back. The call is fully qualified so
//    the cell does not depend on `System` having been opened elsewhere.
//  * The result is materialised into an array. Seq.map alone is lazy, so the
//    collected pipeline was re-evaluated on every enumeration.
//    NOTE(review): if Collect() is itself lazy, that also meant a fresh Spark
//    job per enumeration, run separately from the one behind categoryName —
//    confirm against the Microsoft.Spark implementation.
// An array is still usable wherever a seq<int> is expected.
let categoryCount =
    flattenedCategoryCounts
        .Select(Functions.Col("count"))
        .Collect()
    |> Seq.map (fun row -> System.Convert.ToInt32(row.[0]))
    |> Seq.toArray

## Plot categories

In [50]:
#r "nuget:XPlot.Plotly"

In [51]:
open XPlot
open XPlot.Plotly

In [52]:
// Bar trace with category names on x and their counts on y.
let trace =
    Bar(x = categoryName, y = categoryCount, name = "ArXiv Categories")

In [53]:
// Layout object that carries the chart title.
let layout =
    Layout(title = "Basic Bar Chart")

In [54]:
// Build the chart from the single bar trace, apply the layout defined above,
// and size it to 500 (height) by 700 (width). The resulting chart is the
// value of this cell.
[trace]
|> Chart.Plot
|> Chart.WithLayout layout
|> Chart.WithHeight 500
|> Chart.WithWidth 700