## Install .NET for Apache Spark & ML.NET NuGet packages

In [3]:
#r "nuget:Microsoft.Spark,0.12.1"
#r "nuget:Microsoft.ML,1.5.2"
#r "nuget:Microsoft.ML.AutoML,0.17.2"

Installed package Microsoft.ML.AutoML version 0.17.2

## Import packages

In [5]:
open Microsoft.Spark
open Microsoft.Spark.Sql
open Microsoft.ML
open Microsoft.ML.AutoML

## Define data directory

In [6]:
let DATA_DIR = "/datadrive/Data/Kaggle/arxiv-metadata-oai-snapshot-2020-08-14.json"

## Initialize SparkSession

In [7]:
let sparkSession = 
    SparkSession
        .Builder()
        .AppName("arxiv-analytics")
        .GetOrCreate()

[2020-10-26T14:14:34.8833965Z] [fsharp-analytics-vm] [Info] [ConfigurationService] 'DOTNETBACKEND_PORT' environment variable is not set.
[2020-10-26T14:14:34.8855541Z] [fsharp-analytics-vm] [Info] [ConfigurationService] Using port 5567 for connection.
[2020-10-26T14:14:34.8863004Z] [fsharp-analytics-vm] [Info] [JvmBridge] JvMBridge port is 5567


## Load data into DataFrame

In [8]:
let arxivData = 
    sparkSession
        .Read()
        .Option("inferSchema",true)
        .Json([|DATA_DIR|])

## Inspect original DataFrame schema

In [9]:
arxivData.PrintSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_parsed: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- categories: string (nullable = true)
 |-- comments: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- id: string (nullable = true)
 |-- journal-ref: string (nullable = true)
 |-- license: string (nullable = true)
 |-- report-no: string (nullable = true)
 |-- submitter: string (nullable = true)
 |-- title: string (nullable = true)
 |-- update_date: string (nullable = true)
 |-- versions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- version: string (nullable = true)



## Select a subset of columns

In [13]:
let columnNames = 
    [|
        Functions.Col("id")
        Functions.Col("title")
        Functions.Col("abstract")
        Functions.Col("categories")
    |]

In [14]:
let dfSubset = 
    arxivData
        .Select(columnNames)

In [15]:
dfSubset.Show(3)

+---------+--------------------+--------------------+--------------+
|       id|               title|            abstract|    categories|
+---------+--------------------+--------------------+--------------+
|0704.0001|Calculation of pr...|  A fully differe...|        hep-ph|
|0704.0002|Sparsity-certifyi...|  We describe a n...| math.CO cs.CG|
|0704.0003|The evolution of ...|  The evolution o...|physics.gen-ph|
+---------+--------------------+--------------------+--------------+
only showing top 3 rows



## Get only top level category

In [23]:
let multipleCategorySelection = 
    Functions.Split(Functions.Col("categories"), " ").GetItem(0)

let subcategorySelection = 
    Functions.Split(Functions.Col("categories"),"\.").GetItem(0)

In [24]:
let topLevelCategoryDf = 
    dfSubset
        .WithColumn("categories",multipleCategorySelection)
        .WithColumn("categories",subcategorySelection)

In [25]:
topLevelCategoryDf.Show(3)

+---------+--------------------+--------------------+----------+
|       id|               title|            abstract|categories|
+---------+--------------------+--------------------+----------+
|0704.0001|Calculation of pr...|  A fully differe...|    hep-ph|
|0704.0002|Sparsity-certifyi...|  We describe a n...|      math|
|0704.0003|The evolution of ...|  The evolution o...|   physics|
+---------+--------------------+--------------------+----------+
only showing top 3 rows



## Drop duplicates

In [31]:
// original article count
topLevelCategoryDf
    .Agg(Functions.Count(Functions.Col("id")))
    .Show()

+---------+
|count(id)|
+---------+
|  1747307|
+---------+



In [39]:
let distinctTopLevelCategoryDf = 
    topLevelCategoryDf
        .DropDuplicates("id")
        .Drop(Functions.Col("id"))

In [40]:
distinctTopLevelCategoryDf.Show(3)

+--------------------+--------------------+----------+
|               title|            abstract|categories|
+--------------------+--------------------+----------+
|Binary Systems as...|  We review the g...|     gr-qc|
|Cosmology from St...|  We explore the ...|    hep-ph|
|Domain Switching ...|  We investigated...|  cond-mat|
+--------------------+--------------------+----------+
only showing top 3 rows



## Output cleaned results to files

In [43]:
let outputDir = "/datadrive/Data/ArXivSparkOutput"

In [51]:
distinctTopLevelCategoryDf
    .Coalesce(1)
    .Write()
    .Mode("overwrite")
    .Option("header", true)
    .Csv(outputDir)

## Define paths and variables

In [54]:
let trainDataFile = "/datadrive/Data/ArXivTrainData/traindata.csv"
let labelColumnName = "categories"

## Initialize MLContext

In [53]:
let mlContext = MLContext()

In [61]:
let colInference = mlContext.Auto().InferColumns(trainDataFile,labelColumnName=labelColumnName,separatorChar=',',allowQuoting=)

Stopped due to error

input.fsx (1,20)-(1,86) typecheck error No overloads match for method 'InferColumns'.

Known types of arguments: string * string * bool

Available overloads:
 - AutoCatalog.InferColumns(path: string, columnInformation: ColumnInformation,?separatorChar: Nullable<char>,?allowQuoting: Nullable<bool>,?allowSparse: Nullable<bool>,?trimWhitespace: bool,?groupColumns: bool) : ColumnInferenceResults // Argument 'columnInformation' doesn't match
 - AutoCatalog.InferColumns(path: string, labelColumnIndex: uint32,?hasHeader: bool,?separatorChar: Nullable<char>,?allowQuoting: Nullable<bool>,?allowSparse: Nullable<bool>,?trimWhitespace: bool,?groupColumns: bool) : ColumnInferenceResults // Argument 'labelColumnIndex' doesn't match
 - AutoCatalog.InferColumns(path: string,?labelColumnName: string,?separatorChar: Nullable<char>,?allowQuoting: Nullable<bool>,?allowSparse: Nullable<bool>,?trimWhitespace: bool,?groupColumns: bool) : ColumnInferenceResults // Argument 'separatorChar

Cell not executed: compilation error