# Building aligned corpora

This notebook does the background work of organizing data for an analysis of horizontal variation.

To do that, we will both align overlapping passages in multiple versions of the *Iliad* and simplify the Greek texts: in our comparisons, we want to ignore differences in case, accent, or breathing.

In [None]:
// Configure notebook: add an extra Maven repository to the interpreter's
// repository list before any `$ivy` imports are evaluated.
// NOTE(review): this URL is on dl.bintray.com; JFrog shut Bintray down in 2021,
// so confirm the repository still resolves or point this at its replacement.
val personalRepo = coursierapi.MavenRepository.of("https://dl.bintray.com/neelsmith/maven")
// Append (rather than replace) so the repositories already configured stay available.
interp.repositories() ++= Seq(personalRepo)


In [None]:
import $ivy.`edu.holycross.shot.cite::xcite:4.3.0`
import $ivy.`edu.holycross.shot::ohco2:10.20.3`
import $ivy.`edu.holycross.shot::greek:5.5.1`
import $ivy.`edu.holycross.shot.mid::orthography:2.0.0`

In [None]:
import edu.holycross.shot.cite._
import edu.holycross.shot.ohco2._
import edu.holycross.shot.greek._
import edu.holycross.shot.mid.orthography._


// Locations of the three source editions, each published as a CEX file.
// Venetus A manuscript text of the Iliad.
val venetusAUrl: String =
  "https://raw.githubusercontent.com/neelsmith/summer2020nbs/master/data/vaIliad-2020i.cex"
// "Twins" corpus (file name indicates book 10).
val twins10Url: String =
  "https://raw.githubusercontent.com/neelsmith/summer2020nbs/master/data/twins10corpus.cex"
// Allen's edition of the Iliad.
val allenUrl: String =
  "https://raw.githubusercontent.com/neelsmith/summer2020nbs/master/data/iliad-allen.cex"



In [None]:
// Create the source corpora: each call reads one remote CEX file (URLs defined
// earlier in the notebook) through ohco2's CorpusSource.
// NOTE(review): every call goes to the network; failures surface here, not later.
val twins10 = CorpusSource.fromUrl(twins10Url)
val allen = CorpusSource.fromUrl(allenUrl)
val venetusA = CorpusSource.fromUrl(venetusAUrl)

In [None]:
// Restrict each source corpus to the Iliad passages we want to compare
// (book 10), using URN "twiddling" (`~~`) against a containing URN.
val venetusAIliad10 = {
  val book10InVenetusA = CtsUrn("urn:cts:greekLit:tlg0012.tlg001.msA:10")
  venetusA ~~ book10InVenetusA
}
// NOTE(review): this URN has no passage component, so it selects the whole
// `e3` version; presumably the twins10 corpus only holds book 10 — confirm.
val oopsIliad = {
  val wholeE3Version = CtsUrn("urn:cts:greekLit:tlg0012.tlg001.e3:")
  twins10 ~~ wholeE3Version
}
val allenIliad10 = {
  val book10InAllen = CtsUrn("urn:cts:greekLit:tlg0012.tlg001.allen:10")
  allen ~~ book10InAllen
}



In [None]:
/** Reduce one citable node to a simplified ASCII rendering of its words.
  *
  * Steps for a node with text:
  *  - tokenize the node and keep only the lexical tokens;
  *  - turn each token into a `LiteraryGreekString`, lower-case it, strip
  *    breathings and accents, and take its ASCII form;
  *  - join the resulting words with single spaces.
  *
  * A node whose text is empty is reported on standard output and returned
  * unchanged (its URN is NOT re-versioned).
  *
  * @param cn     node to simplify
  * @param siglum label used to build the new version identifier,
  *               `<siglum>_simpleascii`
  * @return a node carrying the simplified text under the re-versioned URN,
  *         or `cn` itself when it has no text
  */
def curateNode(cn: CitableNode, siglum: String) : CitableNode = {
  if (cn.text.nonEmpty) {
    // Select the lexical tokens and simplify each one in a single pass.
    val plainWords = LiteraryGreekString.tokenizeNode(cn).collect {
      case token if token.tokenCategory == Some(LexicalToken) =>
        LiteraryGreekString(token.text).toLower.stripBreathingAccent.ascii
    }
    val curatedUrn = cn.urn.addVersion(s"${siglum}_simpleascii")
    CitableNode(curatedUrn, plainWords.mkString(" "))
  } else {
    println(s"EMPTY TEXT: ${cn.urn}")
    cn
  }
}


/** Build a new corpus by running `curateNode` over every node of `c`,
  * preserving node order.
  *
  * @param c      corpus to simplify
  * @param siglum label handed to `curateNode` for each node
  * @return corpus of the curated nodes
  */
def asciiCorpus(c: Corpus, siglum: String) : Corpus = {
  val curatedNodes = for (node <- c.nodes) yield curateNode(node, siglum)
  Corpus(curatedNodes)
}


In [None]:
// Simplified-ASCII version of the e3 text.
// Be patient: building these simplified corpora is agonizingly slow.
val oopsIliad10ascii = asciiCorpus(c = oopsIliad, siglum = "e3")


In [None]:
// Simplified-ASCII version of the Venetus A text of book 10.
val venetusAIliad10ascii = asciiCorpus(c = venetusAIliad10, siglum = "msA")


In [None]:
// Simplified-ASCII version of Allen's text of book 10.
val allenIliad10ascii = asciiCorpus(c = allenIliad10, siglum = "allen")


In [None]:
// Align corpora.
/** Collect, for each URN in `ulist` (in order), the nodes of `c` that match it.
  *
  * When a URN matches nothing in `c`, a placeholder node with that URN and
  * empty text is inserted instead, so every requested URN is represented in
  * the result.
  *
  * @param c     corpus to search
  * @param ulist URNs to look up, in the order the output should follow
  * @return corpus made of the matched (or placeholder) nodes
  */
def extractMatches(c: Corpus, ulist: Vector[CtsUrn]) = {
  val alignedNodes = ulist.flatMap { wanted =>
    val hits = c ~~ wanted
    if (hits.size == 0) {
      // Nothing in this corpus for the URN: keep the slot with an empty node.
      Vector(CitableNode(wanted, ""))
    } else {
      hits.nodes
    }
  }
  Corpus(alignedNodes)
}


// Reference list of passages: the URN of every node in the simplified e3
// corpus, with the version dropped so it can be matched against other versions.
val urnList = oopsIliad10ascii.nodes.map(node => node.urn.dropVersion)


In [None]:
// Display names for the texts, in the same order as `alignedTexts`.
val names: Vector[String] = Vector("Upsilon 1.1", "Venetus A", "Allen OCT")

// The e3 corpus supplies the reference sequence of passages (`urnList`), so it
// is used as-is; the other two witnesses are aligned to that sequence.
val alignedTexts =
  oopsIliad10ascii +: Vector(venetusAIliad10ascii, allenIliad10ascii).map { witness =>
    extractMatches(witness, urnList)
  }