## Install Microsoft.Spark NuGet package

In [1]:
#r "nuget:Microsoft.Spark,0.12.1"

Installed package Microsoft.Spark version 0.12.1

## Import packages

In [2]:
open Microsoft.Spark
open Microsoft.Spark.Sql

## Define data directory

In [3]:
// Location of the arXiv metadata snapshot that is handed to the JSON reader.
// NOTE(review): despite the `_DIR` suffix the value ends in `.json`, so it presumably names a single file — confirm.
let DATA_DIR : string = "/datadrive/Data/Kaggle/arxiv-metadata-oai-snapshot-2020-08-14.json"

## Initialize SparkSession

In [4]:
// Spark session used by the queries in this notebook. The builder is given the
// application name "arxiv-analytics" and GetOrCreate() produces the session.
let sparkSession =
    let builder = SparkSession.Builder().AppName("arxiv-analytics")
    builder.GetOrCreate()

[2020-10-21T14:10:58.7793153Z] [fsharp-analytics-vm] [Info] [ConfigurationService] 'DOTNETBACKEND_PORT' environment variable is not set.
[2020-10-21T14:10:58.7837183Z] [fsharp-analytics-vm] [Info] [ConfigurationService] Using port 5567 for connection.
[2020-10-21T14:10:58.7842855Z] [fsharp-analytics-vm] [Info] [JvmBridge] JvMBridge port is 5567


## Load data into DataFrame

In [5]:
// Raw arXiv metadata read as JSON from DATA_DIR into a DataFrame.
// NOTE(review): `inferSchema` is documented as a CSV reader option; confirm it has any
// effect on the JSON reader before relying on it.
let arxivData =
    let reader = sparkSession.Read().Option("inferSchema", true)
    reader.Json([| DATA_DIR |])

## Get article categories in dataset

In [6]:
// One row per distinct (id, categories) pair together with the number of times that
// pair occurs in arxivData, ordered by that count, largest first.
let categories =
    let idCol = Functions.Col("id")
    let categoriesCol = Functions.Col("categories")
    arxivData
        .Select(idCol, categoriesCol.Alias("categories"))
        .GroupBy(idCol, categoriesCol)
        .Count()
        .OrderBy(Functions.Col("count").Desc())

## Flatten articles containing multiple categories

In [7]:
// Turns the space-separated `categories` string into one row per (id, category):
// split on " " into an array column, explode that array into rows, then count each
// (id, category) pair and order by the count, largest first.
let flattenedCategories =
    let idCol = Functions.Col("id")
    let splitCategories =
        Functions.Split(Functions.Col("categories"), " ").Alias("multiplecategories")
    let singleCategory =
        Functions.Explode(Functions.Col("multiplecategories")).Alias("category")
    categories
        .Select(idCol, splitCategories)
        .Select(idCol, singleCategory)
        .GroupBy(idCol, Functions.Col("category"))
        .Count()
        .OrderBy(Functions.Col("count").Desc())

## Drop duplicate articles

In [8]:
// Keeps a single row for each `id` value of flattenedCategories.
// NOTE(review): flattenedCategories holds one row per (id, category), so de-duplicating
// on `id` alone leaves only one category per article, and which one survives is
// presumably unspecified — confirm this is intended rather than e.g. keeping the
// first-listed category.
let flattenedDistinctCategories =
    flattenedCategories.DropDuplicates("id")

## Aggregate distinct article categories

In [9]:
// Number of rows of flattenedDistinctCategories per `category`, ordered by that
// count, largest first.
let flattenedCategoryCounts =
    let categoryCol = Functions.Col("category")
    flattenedDistinctCategories
        .Select(Functions.Col("id"), categoryCol)
        .GroupBy(categoryCol)
        .Count()
        .OrderBy(Functions.Col("count").Desc())

In [15]:
// Print the first 10 rows; the frame is already ordered by count descending,
// so these are the ten largest categories.
flattenedCategoryCounts.Show(10)

+------------------+------+
|          category| count|
+------------------+------+
|            hep-ph|105482|
|          astro-ph| 94315|
|            hep-th| 85276|
|          quant-ph| 73608|
|             gr-qc| 48521|
| cond-mat.mes-hall| 47404|
| cond-mat.mtrl-sci| 45916|
|   cond-mat.str-el| 38075|
|cond-mat.stat-mech| 34181|
|             cs.CV| 32269|
+------------------+------+
only showing top 10 rows



In [30]:
// Print how many distinct `category` values appear in flattenedCategoryCounts.
flattenedCategoryCounts.Agg(Functions.CountDistinct(Functions.Col("category"))).Show()

+------------------------+
|count(DISTINCT category)|
+------------------------+
|                     176|
+------------------------+



In [None]:
// Print the total of the `count` column across all categories.
flattenedCategoryCounts.Agg(Functions.Sum(Functions.Col("count"))).Show()

## Aggregate distinct categories based on top level category

In [39]:
// Rolls the per-category counts up to the top-level category, ordered by the summed
// count, largest first. Sum() is called without column names; the recorded output
// shows a single aggregate column, `sum(count)`.
let topLevelCategoryCounts =
    // Text before the first "." of the category (e.g. `cond-mat` for `cond-mat.mes-hall`);
    // the split pattern is a regular expression, hence the escaped dot.
    let topLevel =
        Functions.Split(Functions.Col("category"), "\.").GetItem(0).As("toplevelcategory")
    flattenedCategoryCounts
        .Select(topLevel, Functions.Col("count"))
        .GroupBy(Functions.Col("toplevelcategory"))
        .Sum()
        .OrderBy(Functions.Col("sum(count)").Desc())

In [40]:
// Print the first 10 rows; the frame is already ordered by sum(count) descending.
topLevelCategoryCounts.Show(10)

+----------------+----------+
|toplevelcategory|sum(count)|
+----------------+----------+
|            math|    382066|
|        cond-mat|    247515|
|        astro-ph|    241432|
|              cs|    229964|
|         physics|    123758|
|          hep-ph|    105482|
|          hep-th|     85276|
|        quant-ph|     73608|
|           gr-qc|     48521|
|            stat|     41863|
+----------------+----------+
only showing top 10 rows



In [41]:
// Print how many distinct `toplevelcategory` values exist.
topLevelCategoryCounts.Agg(Functions.CountDistinct(Functions.Col("toplevelcategory"))).Show()

+--------------------------------+
|count(DISTINCT toplevelcategory)|
+--------------------------------+
|                              38|
+--------------------------------+



In [42]:
// Print the grand total of the per-top-level-category sums.
topLevelCategoryCounts.Agg(Functions.Sum(Functions.Col("sum(count)"))).Show()

+---------------+
|sum(sum(count))|
+---------------+
|        1747307|
+---------------+



In [33]:
// Print summary statistics (count, mean, stddev, min, max) for the columns of
// flattenedCategoryCounts.
flattenedCategoryCounts.Describe().Show()

+-------+--------+------------------+
|summary|category|             count|
+-------+--------+------------------+
|  count|     176|               176|
|   mean|    null| 9927.880681818182|
| stddev|    null|15881.645753239685|
|    min|acc-phys|                 7|
|    max|supr-con|            105482|
+-------+--------+------------------+



In [44]:
// Top-level category names as strings, in the row order returned by Collect().
// Collect() brings the rows back to the driver; Seq.map then converts them lazily.
let categoryName =
    let rows =
        topLevelCategoryCounts
            .Select(Functions.Col("toplevelcategory"))
            .Collect()
    rows |> Seq.map (fun r -> r.[0].ToString())

In [45]:
// Summed counts per top-level category as 32-bit ints, in the row order returned by
// this Collect() call. Each cell is rendered to text and parsed back, so a value
// outside the Int32 range raises instead of being truncated.
// `Int32` is fully qualified because nothing visible in this notebook opens `System`;
// the unqualified name only resolves when the host opens that namespace implicitly.
// NOTE(review): `categoryName` is produced by a separate Collect(); presumably both
// return rows in the same order — confirm, or collect both columns in a single pass.
let categoryCount =
    topLevelCategoryCounts
        .Select(Functions.Col("sum(count)"))
        .Collect()
        |> Seq.map (fun row -> row.[0].ToString() |> System.Int32.Parse)

## Plot categories

In [46]:
#r "nuget:XPlot.Plotly"

Installed package XPlot.Plotly version 3.0.1

In [47]:
open XPlot
open XPlot.Plotly

In [48]:
// Bar trace for the chart: category names on x, their summed counts on y.
let trace =
    Bar(x = categoryName, y = categoryCount, name = "ArXiv Categories")

In [49]:
// Chart layout; only the title is set.
let layout = Layout(title = "Basic Bar Chart")

In [50]:
// Build the chart from the single bar trace, attach the layout, and set the
// size to 500 (height) by 700 (width). The resulting chart is the cell's value.
[trace]
|> Chart.Plot
|> Chart.WithLayout layout
|> Chart.WithHeight 500
|> Chart.WithWidth 700