## Configuring Jupyter notebook

In [None]:
// Register an extra Maven repository with the notebook's dependency resolver
// so that the `$ivy` imports that follow can be resolved from it.
// NOTE(review): this is a Bintray-hosted URL; confirm the repository is still
// reachable before relying on it.
val myBT = coursierapi.MavenRepository.of("https://dl.bintray.com/neelsmith/maven")
interp.repositories() ++= Seq(myBT)

In [None]:
import $ivy.`edu.holycross.shot::ohco2:10.18.2`
import $ivy.`edu.holycross.shot.cite::xcite:4.2.0`
import $ivy.`edu.holycross.shot::midvalidator:10.0.0`
import $ivy.`edu.holycross.shot::latincorpus:2.2.1`
import $ivy.`edu.holycross.shot::latphone:2.7.2`

## Load corpus from URL

In [None]:
import edu.holycross.shot.cite._
import edu.holycross.shot.ohco2._

// Remote location of the Hyginus text (a `.cex` file in the hctexts repository).
val hyginusUrl = "https://raw.githubusercontent.com/neelsmith/hctexts/master/cex/hyginus.cex"

// Build the corpus directly from that URL; `cexHeader = true` is passed so the
// loader treats the source as having a CEX header.
val corpus = CorpusSource.fromUrl(hyginusUrl, cexHeader = true)

## Create tokenizable corpus

Load FST parser output.

In [None]:
// URL of the FST parser output for Hyginus.
val hyginusFstUrl = "https://raw.githubusercontent.com/neelsmith/hctexts/master/parser-output/hyginus/hyginus-parses.txt"
import scala.io.Source
// Read all lines eagerly into a Vector, then close the source so the
// underlying stream is released even if reading fails part-way through.
val fstOutput = {
  val fstSource = Source.fromURL(hyginusFstUrl)
  try fstSource.getLines().toVector
  finally fstSource.close()
}

Tokenize corpus according to its orthographic system (here, `Latin23Alphabet`).

In [None]:
import edu.holycross.shot.latin._

import edu.holycross.shot.mid.validator._


// Pair the loaded corpus with the Latin23Alphabet orthography so that it can
// be tokenized.
val tcorpus = TokenizableCorpus(corpus, Latin23Alphabet)
// Word list exposed by the tokenizable corpus.
val wordList = tcorpus.wordList

Combine parser output with tokenized corpus to get a `LatinCorpus` instance.

In [None]:
import edu.holycross.shot.latincorpus._

// Build the LatinCorpus from the text corpus, its orthography and the lines of
// FST parser output loaded earlier. `strict = false` is handed to the builder
// unchanged (see the latincorpus library for its exact meaning).
val lc = LatinCorpus.fromFstLines(
  corpus,
  Latin23Alphabet,
  fstOutput,
  strict = false
)


In [None]:
// Number of entries in the corpus's lexeme/token index.
// This should be the number of distinct analyzed tokens (TODO confirm against
// the latincorpus library documentation).
lc.lexemeTokenIndex.size

In [None]:
// Display the histogram of recognized lexemes, with lexemes labelled:
lc.labelledLexemeHistogram

In [None]:
// It would be nice to visualize these histograms, so let's use the
// plotly-scala library. The `$ivy` import below makes its almond (Jupyter)
// bindings available to this notebook:
import $ivy.`org.plotly-scala::plotly-almond:0.7.1`

In [None]:
// Import plotly libraries, and set display defaults suggested for use in Jupyter NBs:
import plotly._, plotly.element._, plotly.layout._, plotly.Almond._
// Replace the REPL pretty-printer with a copy whose default output height is 3,
// so that large values echoed by a cell do not flood the notebook.
repl.pprinter() = repl.pprinter().copy(defaultHeight = 3)

## Zipf's Law for analyzed lexemes in Hyginus

In [None]:
// Fetch the labelled lexeme histogram once and reuse it, instead of asking the
// corpus for it separately for the labels, the counts and the echo.
val lexemeHisto = lc.labelledLexemeHistogram
val lexemeFrequencies = lexemeHisto.frequencies
// x axis: lexeme labels; y axis: how often each lexeme occurs.
val items = lexemeFrequencies.map(_.item)
val counts = lexemeFrequencies.map(_.count)
// Echo the histogram that is plotted below.
lexemeHisto
val zipf = Vector(
  Bar(x = items, y = counts)
)
plot(zipf)

## Zipf's Law for analyzed tokens in Hyginus

In [None]:
// Fetch the token histogram of the tokenizable corpus once and reuse it for
// the labels, the counts and the echo.
val tokenHisto = lc.tcorpus.lexHistogram
val tokenFrequencies = tokenHisto.frequencies
// x axis: tokens; y axis: how often each token occurs.
val items = tokenFrequencies.map(_.item)
val counts = tokenFrequencies.map(_.count)
// Echo the histogram that is plotted below.
tokenHisto
val zipfTokens = Vector(
  Bar(x = items, y = counts)
)
plot(zipfTokens)

# To work out

- relation of counts: 
    - lexical tokens in corpus
    - analyzed lexical tokens
    - recognized lexemes
- PoS distribution:  map each lexeme in lexeme histogram to its PoS 

(This is OK since lexical ambiguity is effectively 0)


## Histogram of disambiguated forms

In [None]:
// Sort the forms histogram once and reuse the result for the labels, the
// counts and the echo, rather than sorting it separately for each.
val sortedFormsHisto = lc.formsHistogram.sorted
val formFrequencies = sortedFormsHisto.frequencies
// x axis: forms; y axis: how often each form occurs.
val items = formFrequencies.map(_.item)
val counts = formFrequencies.map(_.count)
// Echo the histogram that is plotted below.
sortedFormsHisto
val zipfForms = Vector(
  Bar(x = items, y = counts)
)
plot(zipfForms)

## Create a map of lexeme to PoS


In [None]:
// Take the first analysis of every analyzed token as its representative form.
// `headOption` skips any token whose analyses are empty instead of throwing
// an index-out-of-bounds error as `analyses(0)` would.
val sampleForm = lc.analyzed.flatMap(_.analyses.headOption)
// Pair each representative form's lemma identifier with its part-of-speech label.
val lexemePoSpairing = sampleForm.map(form => form.lemmaId -> form.posLabel)
// Collapse the pairs into a lookup table; when a lemma identifier occurs more
// than once, `toMap` keeps the last pairing seen.
val lexemeToPosMap = lexemePoSpairing.toMap

In [None]:
// Spot-check the map with a single lexeme identifier.
val example = "ls.n16278"
// Direct application of the map: throws NoSuchElementException if the
// identifier has no entry.
lexemeToPosMap(example)

## Map lexeme histogram to PoS histogram

In [None]:
// Re-label the lexeme histogram by part of speech: every lexeme frequency
// whose lexeme has an entry in lexemeToPosMap becomes Some(Frequency(pos, count));
// lexemes with no known part of speech become None.
val freqOpts = lc.lexemeHistogram.frequencies.map { fr =>
  // One `get` replaces the earlier contains-then-apply double lookup.
  lexemeToPosMap.get(fr.item).map { pos =>
    edu.holycross.shot.histoutils.Frequency(pos, fr.count)
  }
}
// Keep only the frequencies that could be labelled, in their original order.
val freqs = freqOpts.flatten

## Look at PoS distribution for top 400 lexemes

In [None]:
// Part-of-speech labels and occurrence counts for the first 400 entries of freqs.
val top400Items = freqs.take(400).map(_.item)
val top400Counts = freqs.take(400).map(_.count)

In [None]:
// First 400 PoS-labelled frequencies (fewer if freqs is shorter).
val top400Freqs = freqs.take(400)

In [None]:
// Bucket the top-400 frequencies by their part-of-speech label ...
val posGroups = top400Freqs.groupBy(_.item)
// ... and total the occurrence counts inside each bucket.
val posCounts = posGroups.toVector.map { case (pos, members) =>
  (pos, members.map(_.count).sum)
}

In [None]:
// Order the (PoS, total) pairs by ascending total and wrap each pair in a
// Frequency so they can be fed to a Histogram.
val topPosCounts = posCounts.toVector
  .sortBy { case (_, total) => total }
  .map { case (pos, total) => edu.holycross.shot.histoutils.Frequency(pos, total) }

In [None]:
// Histogram of part-of-speech totals for the top 400 lexemes, in the
// histogram's own sorted order.
val topPosHisto = edu.holycross.shot.histoutils.Histogram(topPosCounts).sorted

In [None]:
// Sort the PoS histogram once and reuse its frequencies for both axes,
// rather than sorting it separately for the labels and for the counts.
val topPosFrequencies = topPosHisto.sorted.frequencies
// x axis: part-of-speech labels; y axis: total occurrences per label.
val items = topPosFrequencies.map(_.item)
val counts = topPosFrequencies.map(_.count)

val topPosPlot = Vector(
  Bar(x = items, y = counts)
)
plot(topPosPlot)

## Repeat for the second 400 items

In [None]:
// PoS-labelled frequencies at positions 400 (inclusive) to 800 (exclusive),
// i.e. the second tier of 400 entries.
val second400Freqs = freqs.slice(400, 800)

In [None]:
// Bucket the second-tier frequencies by their part-of-speech label ...
val tier2Groups = second400Freqs.groupBy(_.item)
// ... and total the occurrence counts inside each bucket.
val tier2Counts = tier2Groups.toVector.map { case (pos, members) =>
  (pos, members.map(_.count).sum)
}

In [None]:
// Order the second-tier (PoS, total) pairs by ascending total and wrap each
// pair in a Frequency so they can be fed to a Histogram.
val tier2PosCounts = tier2Counts.toVector
  .sortBy { case (_, total) => total }
  .map { case (pos, total) => edu.holycross.shot.histoutils.Frequency(pos, total) }

In [None]:
// Histogram of part-of-speech totals for the second tier of 400 lexemes, in
// the histogram's own sorted order.
val tier2PosHisto = edu.holycross.shot.histoutils.Histogram(tier2PosCounts).sorted

In [None]:
// Sort the second-tier PoS histogram once and reuse its frequencies for both
// axes, rather than sorting it separately for the labels and for the counts.
val tier2PosFrequencies = tier2PosHisto.sorted.frequencies
// x axis: part-of-speech labels; y axis: total occurrences per label.
val items = tier2PosFrequencies.map(_.item)
val counts = tier2PosFrequencies.map(_.count)

val tierPosPlot = Vector(
  Bar(x = items, y = counts)
)
plot(tierPosPlot)