In [1]:
#r "nuget:Microsoft.Spark,0.12.1"

Installed package Microsoft.Spark version 0.12.1

##  Import Spark packages

In [2]:
open Microsoft.Spark
open Microsoft.Spark.Sql

## Define data path

In [3]:
let DATA_DIR = "/datadrive/Data/Kaggle/arxiv-metadata-oai-snapshot-2020-08-14.json"

## Initialize spark session

In [4]:
let sparkSession = 
    SparkSession
        .Builder()
        .AppName("arxiv-analytics")
        .GetOrCreate()

[2020-09-23T14:08:57.4017988Z] [fsharp-analytics-vm] [Info] [ConfigurationService] 'DOTNETBACKEND_PORT' environment variable is not set.
[2020-09-23T14:08:57.4064359Z] [fsharp-analytics-vm] [Info] [ConfigurationService] Using port 5567 for connection.
[2020-09-23T14:08:57.4087576Z] [fsharp-analytics-vm] [Info] [JvmBridge] JvMBridge port is 5567


## Load data

In [5]:
let arxivData = 
    sparkSession
        .Read()
        .Json([|DATA_DIR|])

##  Display column names

In [6]:
arxivData.Columns()

index,value
0,abstract
1,authors
2,authors_parsed
3,categories
4,comments
5,doi
6,id
7,journal-ref
8,license
9,report-no


## Get the category count in descending order

In [6]:
let categories = 
    arxivData
        .Select(Functions.Col("categories").Alias("categories"))
        .GroupBy("categories")
        .Count()
        .OrderBy(Functions.Col("count").Desc())

### Total number of distinct categories

In [13]:
let totalDistinctCategories = 
    categories
        .Select(Functions.Col("categories"))
        .Distinct()
        .Count()

totalDistinctCategories

### Total number of articles

In [71]:
categories
    .Select(Functions.Sum(Functions.Col("count")))
    .Show()

+----------+
|sum(count)|
+----------+
|   1747307|
+----------+



### Categories more than or equal to 1k articles

In [32]:
let categoriesGtEq1k = 
    categories
        .Filter(Functions.Col("count").Gt(999))

categoriesGtEq1k.Show(10)

+-----------------+-----+
|       categories|count|
+-----------------+-----+
|         astro-ph|86914|
|           hep-ph|73082|
|         quant-ph|52956|
|           hep-th|52888|
|cond-mat.mes-hall|29312|
|cond-mat.mtrl-sci|29278|
|            gr-qc|25221|
|            cs.CV|23986|
|          math.AP|23484|
|      astro-ph.SR|22578|
+-----------------+-----+
only showing top 10 rows



In [33]:
let totalCategoriesGtEq1k = 
    categoriesGtEq1k
        .Select(Functions.Col("categories"))
        .Distinct()
        .Count()

totalCategoriesGtEq1k

### Categories with less than 1k articles

In [14]:
let categoriesLessThan1k = 
    categories
        .Filter(Functions.Col("count").Lt(1000))

In [15]:
categoriesLessThan1k.Show(10)

+--------------------+-----+
|          categories|count|
+--------------------+-----+
|hep-ph hep-ex nuc...|  997|
|      physics.bio-ph|  993|
|     math.PR math.CO|  990|
|     math.AC math.AG|  982|
|physics.app-ph co...|  974|
|   astro-ph.HE gr-qc|  960|
|hep-th astro-ph.C...|  950|
|            q-bio.QM|  950|
|     math.AP math.DG|  950|
|      hep-lat hep-th|  945|
+--------------------+-----+
only showing top 10 rows



### Total distinct categories with less than 1k articles

In [12]:
let totalCategoriesLessThan1k = 
    categoriesLessThan1k
        .Select(Functions.Col("categories"))
        .Distinct()
        .Count()

totalCategoriesLessThan1k

In [31]:
let splitCategories = 
    categoriesLessThan1k
        .Select(Functions.Split(Functions.Col("categories")," ").Alias("multiplecategories"))
        .Select(Functions.Explode(Functions.Col("multiplecategories")).Alias("category"))
        .GroupBy("category")
        .Count()
        .OrderBy(Functions.Col("count").Desc())

splitCategories.Show(10)

+------------------+-----+
|          category|count|
+------------------+-----+
|           math.MP| 6457|
|           math-ph| 6457|
|             cs.LG| 6174|
|          quant-ph| 5243|
|cond-mat.stat-mech| 5137|
|            hep-th| 4833|
|           stat.ML| 4483|
|   physics.comp-ph| 3192|
|           math.OC| 3161|
|           math.PR| 3105|
+------------------+-----+
only showing top 10 rows



In [35]:
let flattenedCategories = 
    categories
        .Select(Functions.Split(Functions.Col("categories")," ").Alias("multiplecategories"))
        .Select(Functions.Explode(Functions.Col("multiplecategories")).Alias("category"))
        .GroupBy("category")
        .Count()
        .OrderBy(Functions.Col("count").Desc())

flattenedCategories.Show(10)

+------------------+-----+
|          category|count|
+------------------+-----+
|           math.MP| 6464|
|           math-ph| 6464|
|             cs.LG| 6181|
|          quant-ph| 5252|
|cond-mat.stat-mech| 5144|
|            hep-th| 4844|
|           stat.ML| 4488|
|   physics.comp-ph| 3193|
|           math.OC| 3162|
|           math.PR| 3107|
+------------------+-----+
only showing top 10 rows



In [36]:
flattenedCategories
    .Select(Functions.Col("category"))
    .Distinct()
    .Count()

In [69]:
flattenedCategories
    .Filter(Functions.Col("category").Contains("astro-ph"))
    .Select(Functions.Sum(Functions.Col("count")))
    .Show()

+----------+
|sum(count)|
+----------+
|      7960|
+----------+



In [70]:
flattenedCategories
    .Select(Functions.Sum(Functions.Col("count")))
    .Show()

+----------+
|sum(count)|
+----------+
|    218210|
+----------+



### Convert categories DataFrame to IEnumerable

In [40]:
let categoriesIEnumerable = flattenedCategories.Collect()

In [38]:
open System

### Get category name and count columns

In [51]:
let categoryName = 
    flattenedCategories
        .Select(Functions.Col("category"))
        .Collect()
        |> Seq.map(fun row -> row.[0].ToString())

In [52]:
let categoryCount = 
    flattenedCategories
        .Select(Functions.Col("count"))
        .Collect()
        |> Seq.map(fun row -> row.[0].ToString() |> Int32.Parse)

### Plot categories

In [53]:
#r "nuget:XPlot.Plotly"

In [54]:
open XPlot
open XPlot.Plotly

In [55]:
let trace =
    Bar(
        x = categoryName,
        y = categoryCount,
        name= "ArXiv Categories"            
    )

In [56]:
let layout = Layout(title = "Basic Bar Chart")

In [57]:
[trace]
|> Chart.Plot
|> Chart.WithLayout layout
|> Chart.WithHeight 500
|> Chart.WithWidth 700

##  Get the total number of  unique categories

In [12]:
let totalCategories = 
    categories
        .Select(Functions.Col("categories"))
        .Distinct()
        .Count()

In [14]:
totalCategories

## Published categories by year

In [42]:
let categoriesByYear = 
    arxivData
        .Select(Functions.Col("categories"),Functions.Year(Functions.Col("update_date")).Alias("year"))
        .GroupBy(Functions.Col("categories"),Functions.Col("year"))
        .Count()
        .OrderBy(Functions.Col("year").Desc(),Functions.Col("categories").Asc())

In [43]:
categoriesByYear.Show(10)

+--------------------+----+-----+
|          categories|year|count|
+--------------------+----+-----+
|            astro-ph|2020|   21|
|astro-ph gr-qc he...|2020|    1|
|         astro-ph.CO|2020|  871|
|astro-ph.CO astro...|2020|    1|
|astro-ph.CO astro...|2020|    1|
|astro-ph.CO astro...|2020|    1|
|astro-ph.CO astro...|2020|  225|
|astro-ph.CO astro...|2020|    4|
|astro-ph.CO astro...|2020|    3|
|astro-ph.CO astro...|2020|   17|
+--------------------+----+-----+
only showing top 10 rows



In [46]:
let totalArticlesPerYear = 
    categoriesByYear
        .GroupBy(Functions.Col("year"))
        .Count()
        .OrderBy(Functions.Col("year").Desc())

In [47]:
totalArticlesPerYear.Show(10)

+----+-----+
|year|count|
+----+-----+
|2020|16932|
|2019|16097|
|2018|12951|
|2017|11501|
|2016|11651|
|2015|13665|
|2014| 8873|
|2013| 8220|
|2012| 7361|
|2011| 6966|
+----+-----+
only showing top 10 rows



In [48]:
totalArticlesPerYear
    .Select(Functions.Col("year"))
    .Distinct()
    .Count()