# Constructing a matrix of edit distances as a measure of horizontal variation


This notebook first builds a corpus of aligned texts. (See the related notebook for a fuller explanation.)

It then computes a matrix of edit-distance scores, comparing each line of each manuscript (MS) with the corresponding line of every MS in the aligned set, itself included.





## 1. Building aligned corpora

In [None]:
// Configure notebook: register an additional Maven repository with the
// interpreter's dependency resolver (presumably the host of the artifacts
// pulled in by the `import $ivy` lines that follow).
// NOTE(review): JFrog shut down Bintray in 2021, so this URL may no longer
// resolve -- confirm, and point this at wherever the artifacts are hosted now.
val personalRepo = coursierapi.MavenRepository.of("https://dl.bintray.com/neelsmith/maven")
interp.repositories() ++= Seq(personalRepo)


In [None]:
import $ivy.`edu.holycross.shot.cite::xcite:4.3.0`
import $ivy.`edu.holycross.shot::ohco2:10.20.3`
import $ivy.`edu.holycross.shot::greek:5.5.1`
import $ivy.`edu.holycross.shot.mid::orthography:2.0.0`

In [None]:
import edu.holycross.shot.cite._
import edu.holycross.shot.ohco2._
import edu.holycross.shot.greek._
import edu.holycross.shot.mid.orthography._


// URLs of the three CEX source files read by this notebook:
// the Venetus A Iliad, the "twins" book 9 corpus, and Allen's Iliad.
val venetusAUrl = "https://raw.githubusercontent.com/neelsmith/summer2020nbs/master/data/vaIliad-2020i.cex"
val twins9Url = "https://raw.githubusercontent.com/neelsmith/summer2020nbs/master/data/twins9corpus.cex"
val allenUrl = "https://raw.githubusercontent.com/neelsmith/summer2020nbs/master/data/iliad-allen.cex"



In [None]:
// Create source corpora: read each source from its URL.
// Each call fetches over the network, so this cell needs connectivity.
val twins9 = CorpusSource.fromUrl(twins9Url)
val allen = CorpusSource.fromUrl(allenUrl)
val venetusA = CorpusSource.fromUrl(venetusAUrl)

In [None]:
// Iliad, here book 9 only: from each source, select the passages matching
// the book 9 URN of one version (msA, msB, e3, allen).
// Both msB and e3 are drawn from the same `twins9` source.
val venetusAIliad9 = venetusA  ~~ CtsUrn("urn:cts:greekLit:tlg0012.tlg001.msA:9")
val venetusBIliad9 = twins9 ~~ CtsUrn("urn:cts:greekLit:tlg0012.tlg001.msB:9")
val oopsIliad9 = twins9 ~~ CtsUrn("urn:cts:greekLit:tlg0012.tlg001.e3:9")
val allenIliad9 = allen ~~ CtsUrn("urn:cts:greekLit:tlg0012.tlg001.allen:9")



In [None]:
/*
Reduce one citable node to a simplified ASCII form:
- tokenize the node and keep only its lexical tokens
- turn each token into a LiteraryGreekString, lower-cased, with breathings
  and accents stripped, in its ASCII representation
- join the tokens with single spaces into one string for the line

The result is cited under the version "<siglum>_simpleascii".
A node whose text is empty is reported on stdout and returned unchanged
(so it keeps its original URN).
*/
def curateNode(cn: CitableNode, siglum: String) : CitableNode = {
  if (cn.text.nonEmpty) {
    val simplified = LiteraryGreekString.tokenizeNode(cn)
      .filter(_.tokenCategory == Some(LexicalToken))
      .map(t => LiteraryGreekString(t.text).toLower.stripBreathingAccent.ascii)
      .mkString(" ")
    val curatedUrn = cn.urn.addVersion(s"${siglum}_simpleascii")
    CitableNode(curatedUrn, simplified)
  } else {
    println("EMPTY TEXT: " + cn.urn)
    cn
  }
}


// Build a new corpus by running every node of `c` through curateNode
// with the given siglum.
def asciiCorpus(c: Corpus, siglum: String) : Corpus = {
  val curated = c.nodes.map(curateNode(_, siglum))
  Corpus(curated)
}


In [None]:
// Simplified-ASCII version of the e3 text of Iliad 9.
// These conversions are agonizingly slow: every node is tokenized and
// each lexical token is normalized individually (see curateNode).
val oopsIliad9ascii = asciiCorpus(oopsIliad9, "e3")


In [None]:
// Simplified-ASCII version of the msB text of Iliad 9.
// The input must be the msB selection (venetusBIliad9): feeding the e3
// selection here would merely relabel e3 lines as "msB" and make the two
// manuscripts look identical in every later comparison.
val venetusBIliad9ascii = asciiCorpus(venetusBIliad9, "msB")


In [None]:
// Sanity check: print the node counts of the e3 and msB corpora side by side.
println(s"${oopsIliad9ascii.size} vs ${venetusBIliad9ascii.size}")

In [None]:
// Simplified-ASCII version of the msA text of Iliad 9.
val venetusAIliad9ascii = asciiCorpus(venetusAIliad9, "msA")


In [None]:
// Simplified-ASCII version of the allen text of Iliad 9.
val allenIliad9ascii = asciiCorpus(allenIliad9, "allen")


In [None]:
// Align a corpus to a reference list of URNs.
// For each URN in `ulist`, in order, take the nodes of `c` that match it;
// when nothing matches, substitute a single placeholder node carrying that
// URN and an empty text, so that every URN in the list is represented.
def extractMatches(c: Corpus, ulist: Vector[CtsUrn]) = {
  val aligned = ulist.flatMap { urn =>
    val found = c ~~ urn
    if (found.size == 0) Vector(CitableNode(urn, "")) else found.nodes
  }
  Corpus(aligned)
}


// Reference list of passages for alignment: the URN of every node in the
// e3 corpus with its version dropped, so it can be matched against the
// other corpora.
val urnList = oopsIliad9ascii.nodes.map(_.urn.dropVersion)


Here are the final results we want:

In [None]:
// The aligned corpora, in the order e3, msB, msA, allen.
// The e3 corpus is used as is, because urnList was derived from it; the
// other three are aligned to urnList via extractMatches.
val alignedTexts = Vector(
  oopsIliad9ascii,
  extractMatches(venetusBIliad9ascii, urnList),
  extractMatches(venetusAIliad9ascii, urnList),
  extractMatches(allenIliad9ascii, urnList)
)

## 2. Compute the matrix

In [None]:
//////////////////////////////////////////////////////////////
// Edit distance using Levenshtein method
import scala.collection.mutable
import scala.collection.parallel.ParSeq

// Implementation adapted from RosettaCode:
// https://rosettacode.org/wiki/Levenshtein_distance
/** Computes Levenshtein costs for `s1` against `s2` by memoized recursion.
 *
 * @param s1 first string
 * @param s2 second string
 * @return the memo table: key `(i, j)` maps to the edit distance between
 *         the first `i` characters of `s1` and the first `j` characters of
 *         `s2`. The entry for `(s1.length, s2.length)` is always present
 *         and holds the distance between the complete strings.
 */
def levenshteinMemo(s1: String, s2: String): mutable.Map[(Int, Int), Int] = {
  val memoizedCosts = mutable.Map[(Int, Int), Int]()

  def lev(i: Int, j: Int): Int =
    memoizedCosts.getOrElseUpdate((i, j), {
      if (j == 0) i        // only deletions remain
      else if (i == 0) j   // only insertions remain
      else {
        val deletion = 1 + lev(i - 1, j)
        val insertion = 1 + lev(i, j - 1)
        val substitution =
          lev(i - 1, j - 1) + (if (s1(i - 1) != s2(j - 1)) 1 else 0)
        // A plain min of three Ints: wrapping them in a parallel collection
        // only added per-cell overhead, since all three are already computed.
        math.min(deletion, math.min(insertion, substitution))
      }
    })

  lev(s1.length, s2.length)
  memoizedCosts
}

// Levenshtein distance between two whole strings: build the memo table,
// then read off the entry for the two full lengths.
def editDistance(s1: String, s2: String) : Int = {
  val costs = levenshteinMemo(s1, s2)
  costs((s1.length, s2.length))
}

In [None]:
// One row of the matrix: the edit distance from `baseLine` to each of the
// comparison texts, in the order given.
def rowData(baseLine: String, cfTexts: Vector[String]) : Vector[Int] = {
  cfTexts.map(cf => editDistance(baseLine, cf))
}


In [None]:
// Get the text content of the passage `urn` in each corpus, in corpus order.
// Only the first matching node of each corpus is used; `.head` means this
// fails if some corpus has no node matching `urn`.
def parallelTexts(urn: CtsUrn, corpora: Vector[Corpus]): Vector[String] = {
  for (corpus <- corpora) yield {
    val firstMatch = (corpus ~~ urn).nodes.head
    firstMatch.text
  }
}



In [None]:
// One label per aligned corpus: the version identifier read from the URN of
// the corpus's node at index 3 (its fourth node).
// NOTE(review): index 3 is hard-coded -- presumably chosen to step past
// leading nodes that lack a version (extractMatches inserts version-less
// placeholders for missing lines); confirm, and consider picking the first
// node that actually carries a version. Fails on corpora with < 4 nodes.
val sigla = alignedTexts.map(c => c.nodes(3).urn.version)
// CSV header line: "base" followed by one column per corpus label.
val colLabels = "base," + sigla.mkString(",")

In [None]:
/** Computes the edit-distance matrix as CSV-ready rows.
 *
 * For every aligned corpus (the "base" document) and every node in it, the
 * node's text is compared with the text of the same passage in every corpus
 * of `alignedTexts`. Each comparison row is rendered as
 * `"<siglum>:<passage>,<d0>,<d1>,..."`, one distance per corpus.
 *
 * Progress is printed to stdout as the computation proceeds. Being a `def`,
 * the whole matrix is recomputed on every reference.
 *
 * @return one sequence of CSV row strings per base document
 */
def dataMatrix = for (documentsIndex <- 0 until alignedTexts.size) yield {
  println(s"Document ${documentsIndex} (of ${alignedTexts.size})")

  val baseText = alignedTexts(documentsIndex)
  // Use the same label source as the row labels (`sigla`) instead of
  // re-reading the version from the corpus's first node, which may be a
  // placeholder without one.
  val colName = sigla(documentsIndex)
  println(colName)
  val datacorpus = for (lineIndex <- 0 until baseText.size) yield {
    val baseTextPassage = baseText.nodes(lineIndex)
    val rowLabel = s"${colName}:" + baseTextPassage.urn.passageComponent
    println(rowLabel)
    // Look the passage up without its version so it matches in every corpus.
    val cfLines = parallelTexts(baseTextPassage.urn.dropVersion, alignedTexts)

    println(s"Computing edit distance against ${rowLabel} (psg ${lineIndex} in document ${documentsIndex})")
    val data = rowData(baseTextPassage.text, cfLines)
    rowLabel + "," + data.mkString(",")
  }
  println("Done.")
  datacorpus
}


In [None]:
// Evaluate the matrix once and keep the result; `dataMatrix` is a def, so
// referring to it again would repeat the whole computation.
val dm = dataMatrix

Here's the final .csv output:

In [None]:
// Emit the CSV: the header line, then every row of every document.
println(Seq(colLabels, dm.flatten.mkString("\n")).mkString("\n"))