# Geo-Semantic Clustering Interactive Notebook
JN Matthews - 19.8.2021
(modified by Parker Rule for interoperability with main data pipeline)

In [None]:
#r "nuget: MathNet.Numerics.FSharp, 4.15.0"
#r "nuget:FSharp.Data"
#r "nuget: Plotly.NET, 2.0.0-beta9"
#r "nuget: Plotly.NET.Interactive, 2.0.0-beta9"
#r "nuget: Newtonsoft.Json, 12.0.3"

In [None]:
#load "/Users/pjrule/Dropbox/MGGG/coi-states/common/notebooks/GeoSemanticClusters/GeoSemanticClusters.fs"

In [None]:
open MathNet.Numerics.LinearAlgebra
open MathNet.Numerics.Optimization
open MathNet.Numerics
open FSharp.Data
open Plotly.NET
open Newtonsoft.Json
open System.IO
open GeoSemanticClusters

### Papermill parameters

In [None]:
// Directory that the input matrix files are read from and that the
// clustering JSON and chart HTML files are written to.
let output_dir = "/Users/pjrule/MGGG/coi-states/MI/outputs"
// File (inside output_dir) loaded as the geographic distance matrix.
let hausdorff_dists_filename = "mi_cluster_db_20210823_cluster_22_hausdorff_dists.txt"
// File (inside output_dir) loaded as the semantic similarity matrix.
let jaccard_sims_filename = "mi_cluster_db_20210823_cluster_22_jaccard_sims.txt"
// File (inside output_dir) that the selected clustering is serialized to as JSON.
let output_filename = "mi_cluster_db_20210823_cluster_22_clustering.json"
// Number of cluster labels; labels run from 1 to num_bins inclusive.
let num_bins = 2
// Passed to each of the three acceptance-function constructors and
// formatted into the chart output filenames.
let beta = 2.
// Number of transition steps applied to each chain.
let iters = 1000000

#### Load metric matrices for documents and initial clustering.

In [None]:
// Load the geographic-distance and semantic-similarity matrices from
// output_dir. Both files are parsed as space-separated with no header row,
// and each loaded file is cached via `.Cache()`.
let dists, sims =
    let loadMatrix (filename: string) =
        CsvFile.Load(output_dir + "/" + filename,
                     separators=" ", hasHeaders=false).Cache()
    loadMatrix hausdorff_dists_filename, loadMatrix jaccard_sims_filename

In [None]:
// Cluster labels are 1-based (1 .. num_bins); document ids are the 0-based
// column indices of the distance matrix.
let bins = List.ofSeq (seq { 1 .. num_bins })
let docs =
    let lastDoc = dists.NumberOfColumns - 1
    List.ofSeq (seq { 0 .. lastDoc })

In [None]:
// Begin from a uniformly random cluster label for every document.
// see https://stackoverflow.com/a/58368747
let random = System.Random()
// Draws one label in 1 .. num_bins (the upper bound of Next is exclusive).
let a () = random.Next(1, num_bins + 1)
// One independent draw per column of the distance matrix.
let init = List.init dists.NumberOfColumns (fun _ -> a ())

In [None]:
/// Converts the rows of a loaded matrix file into a dense 2D array of floats.
/// For every row, one value is read per entry of `docs` (used as the column
/// index) and parsed with `AsFloat()`.
let parseDistMatrixFile (rows: seq<CsvRow>) =
    array2D [ for row in rows -> [ for d in docs -> row.[d].AsFloat() ] ]

// Pairwise matrices handed to the Markov chains below.
let geoDistance: GeoScore[,] = parseDistMatrixFile dists.Rows
let semanticSim: SemanticScore[,] = parseDistMatrixFile sims.Rows

#### Configure Markov Chains

In [None]:
// Initial clustering: each document id mapped to its randomly drawn label.
let initCluster: Clustering = Map.ofList (List.zip docs init)

In [None]:
/// Builds a single-step function for `chain`.
///
/// The returned function takes a chain history (newest entry first, each entry
/// a `(state, score)` pair), calls `chain.StateTransistionStep` on the newest
/// entry, and returns the history with the resulting entry prepended.
///
/// Raises System.ArgumentException when the history is empty, since there is
/// no current state to step from.
let ChainIter (chain: MarkovChain) =
    let runChain history =
        match history with
        // A one-element history is just the cons case with an empty tail, so a
        // single arm covers both non-empty shapes.
        | (curState, curScore) :: _ ->
            chain.StateTransistionStep(curState, curScore) :: history
        | [] -> raise (System.ArgumentException("Initial State cannot be empty"))
    runChain

In [None]:
// Proposal and score function shared by all three chains.
let (flip: Proposal) = FlipProposal bins docs
let (score: ScoreFunction) = AvgIntraCluster bins

// Three acceptance functions built from the same `beta`: the combined
// geographic + semantic one, and one each for semantic-only and
// geographic-only.
let (accept: AcceptanceFunction) = GeoSemAcceptanceProb beta
let (acceptSemantic: AcceptanceFunction) = SemAcceptanceProb beta
let (acceptGeographic: AcceptanceFunction) = GeoAcceptanceProb beta

// The chains differ only in their acceptance function.
let (chain: MarkovChain) = MarkovChain(flip, score, geoDistance, semanticSim, accept)
let (chainSemantic: MarkovChain) = MarkovChain(flip, score, geoDistance, semanticSim, acceptSemantic)
let (chainGeographic: MarkovChain) = MarkovChain(flip, score, geoDistance, semanticSim, acceptGeographic)

/// Applies `step` to the chain history `iters` times, starting from
/// `initCluster` scored with `mc`'s own score function. Returns the full
/// history, newest entry first.
/// Uses a counting loop instead of folding over `[1 .. iters]` so that no
/// throwaway index list of `iters` elements is allocated per run.
let runHistory (mc: MarkovChain) step =
    let rec loop remaining history =
        if remaining <= 0 then history
        else loop (remaining - 1) (step history)
    loop iters [ (initCluster, mc.ScoreFunction initCluster) ]

/// Extracts the score of every history entry, ordered oldest first.
let scoresOldestFirst history = history |> List.map snd |> List.rev

let chainBoth = ChainIter chain
let results = runHistory chain chainBoth
let scoreResults = scoresOldestFirst results

let chainSem = ChainIter chainSemantic
let resultsSemantic = runHistory chainSemantic chainSem
let scoreResultsSemantic = scoresOldestFirst resultsSemantic

let chainGeo = ChainIter chainGeographic
let resultsGeo = runHistory chainGeographic chainGeo
let scoreResultsGeographic = scoresOldestFirst resultsGeo

#### Function to run a chain and accumulate results.

In [None]:
/// Builds a single-step function for `chain`.
/// NOTE(review): `ChainIter` is also declared in an earlier cell of this
/// notebook; this cell re-declares it and shadows that definition.
///
/// The returned function takes a chain history (newest entry first, each entry
/// a `(state, score)` pair), calls `chain.StateTransistionStep` on the newest
/// entry, and returns the history with the resulting entry prepended.
///
/// Raises System.ArgumentException when the history is empty, since there is
/// no current state to step from.
let ChainIter (chain: MarkovChain) =
    let runChain history =
        match history with
        // A one-element history is just the cons case with an empty tail, so a
        // single arm covers both non-empty shapes.
        | (curState, curScore) :: _ ->
            chain.StateTransistionStep(curState, curScore) :: history
        | [] -> raise (System.ArgumentException("Initial State cannot be empty"))
    runChain

In [None]:
// The combined chain has already been stepped `iters` times in the
// chain-configuration cell, which is also the only place `chain` is defined,
// so `chainBoth` and `results` are always available here.
// Running the chain a second time would cost another `iters` transitions and,
// because the chain is random, would make the clustering saved below come
// from a different run than the "Both" score trace that is plotted.
// The existing bindings are therefore reused unchanged.
let chainBoth = chainBoth
let results = results

In [None]:
// Among all visited (state, score) entries, take the state whose score has
// the largest second component, and serialize that state to JSON.
let maxSemJsonBoth =
    let bestState, _ = results |> List.maxBy (fun (_, sc) -> snd sc)
    bestState |> JsonConvert.SerializeObject
// Persist the selected clustering next to the other pipeline outputs.
File.WriteAllText(output_dir + "/" + output_filename, maxSemJsonBoth)

In [None]:
// Each score trace paired with the legend name used for it in the charts.
let scoreResultTypes =
    [ (scoreResults, "Both")
      (scoreResultsSemantic, "Optimize Semantic")
      (scoreResultsGeographic, "Optimize Geographic") ]

In [None]:
// Plot the first component of every score (the geographic part) against the
// step number, one trace per chain, and save the chart as HTML.
// Index 0 (the initial state) is not plotted.
let steps = scoreResults.Length - 1
let lines = 
    scoreResultTypes |> List.map (fun (res, name) ->
        // F# lists are linked lists, so `res.[x]` walks x nodes on every lookup
        // and the original loop was quadratic in `steps`. Copy the trace into
        // an array once so each lookup is constant time.
        let trace = List.toArray res
        [for x in 1 .. steps -> (x, trace.[x] |> fst)] |> Chart.Line 
                                                       |> Chart.withTraceName(name) 
                                                       |> Chart.withY_AxisStyle("Hausdorff Distance")
        )
    |> Chart.Combine
    |> Chart.withLayoutGridStyle(XSide=StyleParam.LayoutGridXSide.Bottom,YGap= 0.1)
    |> Chart.withTitle("Average Intra-Cluster Geographic Scores")
    |> Chart.withX_AxisStyle("Steps")

lines |> Chart.SaveHtmlAs(sprintf "%s/geographic_scores_beta_%f.html" output_dir beta)

In [None]:
// Plot the second component of every score (the semantic part) against the
// step number, one trace per chain, and save the chart as HTML.
// Index 0 (the initial state) is not plotted.
let steps = scoreResults.Length - 1
let lines = 
    scoreResultTypes |> List.map (fun (res, name) ->
        // F# lists are linked lists, so `res.[x]` walks x nodes on every lookup
        // and the original loop was quadratic in `steps`. Copy the trace into
        // an array once so each lookup is constant time.
        let trace = List.toArray res
        [for x in 1 .. steps -> (x, trace.[x] |> snd)] |> Chart.Line 
                                                       |> Chart.withTraceName(name) 
                                                       |> Chart.withY_AxisStyle("Jaccard Index")
        )
    |> Chart.Combine
    |> Chart.withLayoutGridStyle(XSide=StyleParam.LayoutGridXSide.Bottom,YGap= 0.1)
    |> Chart.withTitle("Average Intra-Cluster Semantic Scores")
    |> Chart.withX_AxisStyle("Steps")

lines |> Chart.SaveHtmlAs(sprintf "%s/semantic_scores_beta_%f.html" output_dir beta)