In [4]:
#r "nuget:Microsoft.Spark,0.12.1"

Installed package Microsoft.Spark version 0.12.1

In [5]:
open Microsoft.Spark
open Microsoft.Spark.Sql

## Load training data

In [6]:
let DATA_DIR = "/datadrive/Data/ArXivTrainData"

In [7]:
let sparkSession = 
    SparkSession
        .Builder()
        .AppName("arxiv-analytics")
        .GetOrCreate()

[2020-11-16T15:22:00.5038198Z] [fsharp-analytics-vm] [Info] [ConfigurationService] 'DOTNETBACKEND_PORT' environment variable is not set.
[2020-11-16T15:22:00.5050412Z] [fsharp-analytics-vm] [Info] [ConfigurationService] Using port 5567 for connection.
[2020-11-16T15:22:00.5057788Z] [fsharp-analytics-vm] [Info] [JvmBridge] JvMBridge port is 5567


In [8]:
let trainData = 
    sparkSession
        .Read()
        .Option("inferSchema",true)
        .Option("header", true)
        .Csv([|DATA_DIR|])

## Load category data

In [9]:
let CATEGORY_DIR = "/datadrive/Data/ArXivCategoryData"

In [10]:
let categoryData = 
    sparkSession
        .Read()
        .Option("inferSchema",true)
        .Option("header", true)
        .Csv([|CATEGORY_DIR|])

In [11]:
categoryData.Show()

+----------+
|categories|
+----------+
|     gr-qc|
|  acc-phys|
|  funct-an|
|  cond-mat|
|        cs|
|  supr-con|
|  chao-dyn|
|     q-alg|
|     dg-ga|
|     q-fin|
|      eess|
|   chem-ph|
|     q-bio|
|  quant-ph|
|      stat|
|      nlin|
|   nucl-ex|
|    hep-th|
|  astro-ph|
|      econ|
+----------+
only showing top 20 rows



In [36]:
let categoryList = 
    categoryData
        .Select(Functions.Col("categories"))
        .Collect()
        |> Seq.map(fun row -> row.[0].ToString())
        |> Array.ofSeq

In [37]:
categoryList

index,value
0,gr-qc
1,acc-phys
2,funct-an
3,cond-mat
4,cs
5,supr-con
6,chao-dyn
7,q-alg
8,dg-ga
9,q-fin


In [38]:
categoryList.GetType()

In [41]:
let topLevelCategoriesDf = 
    trainData
        .Filter(Functions.Col("categories").IsIn(categoryList))

In [42]:
topLevelCategoriesDf.Show()

+--------------------+--------------------+----------+
|               title|            abstract|categories|
+--------------------+--------------------+----------+
|   the shear modulus| and the shear vi...|  astro-ph|
|            Lemaitre| although his pap...|  astro-ph|
|            universe| including dark e...|     gr-qc|
|       momentum maps| that should be t...|      math|
|gamma energies th...|   standard models."|   nucl-ex|
|         self-energy| is taken into ac...|  cond-mat|
|together cluster ...| graph theory and...|        cs|
|         tr_P(x_P)=1| where P varies o...|      math|
|          systematic| and the upper li...|    hep-ex|
|Comment: Expert E...|Comment: Expert E...|      stat|
| with the BCG itself| extending out to...|  astro-ph|
|probably already ...| it qualifies as ...|  astro-ph|
|  $\\epsilon$ values| the forward regi...|   physics|
|        linearisable| they possess non...|   math-ph|
|parametrized by t...|H)$ by $Sp_\\thet...|      math|
|arbitrari

## Get null category counts

In [20]:
trainData
    .Filter(Functions.Col("categories").IsNull())
    .Count()

In [21]:
trainData.Count()

## Drop null categories

In [17]:
let noNullCategoriesDf = 
    trainData
        .Na()
        .Drop(seq {"categories"})

In [18]:
noNullCategoriesDf.Show(10)

+--------------------+--------------------+--------------------+
|               title|            abstract|          categories|
+--------------------+--------------------+--------------------+
|               (IRAS|                 MSX|              2MASS)|
|that any such sta...| in a well define...|  a state of minimum|
|   the shear modulus| and the shear vi...|            astro-ph|
|be linearly unsta...| I study how a small|           localized|
|10 keV based on I...|           Swift/BAT| and BeppoSAX/PDS...|
|of the emitting h...| while the amount...|                 due|
|             403-435|      1986). Finally| we will discuss ...|
|cosmology from 19...| based on the sem...|                  de|
|              Sitter|           Friedmann|            Lemaitre|
|            Lemaitre| although his pap...|            astro-ph|
+--------------------+--------------------+--------------------+
only showing top 10 rows



In [28]:
noNullCategoriesDf.Count()

In [40]:
noNullCategoriesDf
    .Filter(Functions.Col("categories").IsIn(categoryList))
    .Show()

+--------------------+--------------------+----------+
|               title|            abstract|categories|
+--------------------+--------------------+----------+
|   the shear modulus| and the shear vi...|  astro-ph|
|            Lemaitre| although his pap...|  astro-ph|
|            universe| including dark e...|     gr-qc|
|       momentum maps| that should be t...|      math|
|gamma energies th...|   standard models."|   nucl-ex|
|         self-energy| is taken into ac...|  cond-mat|
|together cluster ...| graph theory and...|        cs|
|         tr_P(x_P)=1| where P varies o...|      math|
|          systematic| and the upper li...|    hep-ex|
|Comment: Expert E...|Comment: Expert E...|      stat|
| with the BCG itself| extending out to...|  astro-ph|
|probably already ...| it qualifies as ...|  astro-ph|
|  $\\epsilon$ values| the forward regi...|   physics|
|        linearisable| they possess non...|   math-ph|
|parametrized by t...|H)$ by $Sp_\\thet...|      math|
|arbitrari