Skip to content

Commit

Permalink
Add LanguageModelFilesWriter (#37)
Browse files Browse the repository at this point in the history
  • Loading branch information
pemistahl committed Jun 7, 2020
1 parent 3172038 commit d64574b
Show file tree
Hide file tree
Showing 5 changed files with 244 additions and 64 deletions.
@@ -0,0 +1,90 @@
package com.github.pemistahl.lingua.api

import com.github.pemistahl.lingua.internal.Ngram
import com.github.pemistahl.lingua.internal.TrainingDataLanguageModel
import java.io.FileNotFoundException
import java.nio.file.Files
import java.nio.file.NoSuchFileException
import java.nio.file.NotDirectoryException
import java.nio.file.Path

/**
 * Creates n-gram language models from a training text file and writes them
 * as JSON files into an output directory.
 *
 * Models are built for n-gram lengths 1 to 5. Each higher-order model receives
 * the absolute frequencies of the next lower-order model.
 */
object LanguageModelFilesWriter {

    /**
     * Builds the unigram, bigram, trigram, quadrigram and fivegram models for [language]
     * from the text in [inputFilePath] and writes them to `unigrams.json`, `bigrams.json`,
     * `trigrams.json`, `quadrigrams.json` and `fivegrams.json` inside [outputDirectoryPath].
     *
     * All five models are created before the first file is written.
     *
     * @param inputFilePath absolute path to an existing regular file containing the training text
     * @param outputDirectoryPath absolute path to an existing directory receiving the JSON files
     * @param language the language the models are created for
     * @param charClass passed through unchanged to `TrainingDataLanguageModel.fromText`;
     * may be `null`
     * @throws IllegalArgumentException if one of the paths is not absolute
     * @throws NoSuchFileException if the input file does not exist
     * @throws FileNotFoundException if the input path does not denote a regular file
     * @throws NotDirectoryException if the output directory does not exist or is not a directory
     */
    @JvmStatic
    fun createAndWriteLanguageModelFiles(
        inputFilePath: Path,
        outputDirectoryPath: Path,
        language: Language,
        charClass: String?
    ) {
        checkInputFilePath(inputFilePath)
        checkOutputDirectoryPath(outputDirectoryPath)

        // The input file is read once per model; every model is seeded with the
        // absolute frequencies of the model one order below it.
        val unigramModel = createLanguageModel(inputFilePath, language, 1, charClass, emptyMap())
        val bigramModel = createLanguageModel(inputFilePath, language, 2, charClass, unigramModel.absoluteFrequencies)
        val trigramModel = createLanguageModel(inputFilePath, language, 3, charClass, bigramModel.absoluteFrequencies)
        val quadrigramModel = createLanguageModel(inputFilePath, language, 4, charClass, trigramModel.absoluteFrequencies)
        val fivegramModel = createLanguageModel(inputFilePath, language, 5, charClass, quadrigramModel.absoluteFrequencies)

        writeLanguageModel(unigramModel, outputDirectoryPath, "unigrams.json")
        writeLanguageModel(bigramModel, outputDirectoryPath, "bigrams.json")
        writeLanguageModel(trigramModel, outputDirectoryPath, "trigrams.json")
        writeLanguageModel(quadrigramModel, outputDirectoryPath, "quadrigrams.json")
        writeLanguageModel(fivegramModel, outputDirectoryPath, "fivegrams.json")
    }

    /** Verifies that [inputFilePath] is absolute, exists and denotes a regular file. */
    private fun checkInputFilePath(inputFilePath: Path) {
        require(inputFilePath.isAbsolute) { "Input file path '$inputFilePath' is not absolute" }
        if (!Files.exists(inputFilePath)) {
            throw NoSuchFileException("Input file '$inputFilePath' does not exist")
        }
        if (!Files.isRegularFile(inputFilePath)) {
            throw FileNotFoundException("Input file path '$inputFilePath' does not represent a regular file")
        }
    }

    /** Verifies that [outputDirectoryPath] is absolute, exists and denotes a directory. */
    private fun checkOutputDirectoryPath(outputDirectoryPath: Path) {
        require(outputDirectoryPath.isAbsolute) { "Output directory path '$outputDirectoryPath' is not absolute" }
        // A missing directory is deliberately reported with the same exception type
        // as a non-directory path, so callers only have to handle one type.
        if (!Files.exists(outputDirectoryPath)) {
            throw NotDirectoryException("Output directory '$outputDirectoryPath' does not exist")
        }
        if (!Files.isDirectory(outputDirectoryPath)) {
            throw NotDirectoryException("Output directory path '$outputDirectoryPath' does not represent a directory")
        }
    }

    /**
     * Reads [inputFilePath] line by line and builds the model for n-grams of length [ngramLength].
     *
     * The reader is closed by `useLines` once the model has been built.
     */
    private fun createLanguageModel(
        inputFilePath: Path,
        language: Language,
        ngramLength: Int,
        charClass: String?,
        lowerNgramAbsoluteFrequencies: Map<Ngram, Int>
    ): TrainingDataLanguageModel =
        // Files.newInputStream works for any Path, whereas Path.toFile() throws
        // UnsupportedOperationException for paths outside the default file system.
        Files.newInputStream(inputFilePath).bufferedReader().useLines { lines ->
            TrainingDataLanguageModel.fromText(
                text = lines,
                language = language,
                ngramLength = ngramLength,
                charClass = charClass,
                lowerNgramAbsoluteFrequencies = lowerNgramAbsoluteFrequencies
            )
        }

    /**
     * Serializes [model] to JSON and writes it to [fileName] inside [outputDirectoryPath],
     * creating the file or replacing its previous content.
     */
    private fun writeLanguageModel(
        model: TrainingDataLanguageModel,
        outputDirectoryPath: Path,
        fileName: String
    ) {
        Files.newOutputStream(outputDirectoryPath.resolve(fileName)).bufferedWriter().use { writer ->
            writer.write(model.toJson())
        }
    }
}
Expand Up @@ -50,7 +50,7 @@ internal data class TrainingDataLanguageModel(
text: Sequence<String>,
language: Language,
ngramLength: Int,
charClass: String,
charClass: String?,
lowerNgramAbsoluteFrequencies: Map<Ngram, Int>
): TrainingDataLanguageModel {

Expand Down Expand Up @@ -93,11 +93,15 @@ internal data class TrainingDataLanguageModel(
private fun computeAbsoluteFrequencies(
text: Sequence<String>,
ngramLength: Int,
charClass: String
charClass: String?
): Map<Ngram, Int> {

val absoluteFrequencies = hashMapOf<Ngram, Int>()
val regex = Regex("""[\p{L}&&\p{$charClass}]+""")
val regex = if (charClass != null) {
Regex("""[\p{L}&&\p{$charClass}]+""")
} else {
Regex("""[\p{L}]+""")
}

for (line in text) {
val lowerCasedLine = line.toLowerCase()
Expand Down
Expand Up @@ -101,61 +101,3 @@ internal fun writeTestDataFiles(inputPath: String, outputPath: String, isoCode:
}
println("Done.\n")
}

/**
 * Builds the unigram to fivegram language models for [language] from the corpus at
 * [inputPath] and writes each one as `<name>.json` below [outputPath], printing
 * progress messages to standard output.
 *
 * Each model beyond the unigram model receives the absolute frequencies of the
 * model one order below it.
 *
 * @param inputPath corpus location, consumed through `asLineSequenceResource`
 * (presumably a resource path; confirm against that helper)
 * @param outputPath directory prefix the JSON file names are appended to
 * @param language the language the models are created for
 * @param charClass passed through unchanged to `TrainingDataLanguageModel.fromText`
 */
internal fun writeLanguageModelsFromLeipzigCorpusFile(
    inputPath: String,
    outputPath: String,
    language: Language,
    charClass: String
) {
    // Base names of the output files; the 1-based position is the n-gram length.
    val ngramNames = listOf("unigrams", "bigrams", "trigrams", "quadrigrams", "fivegrams")

    // Model built in the previous iteration; null only while building the unigrams.
    var lowerOrderModel: TrainingDataLanguageModel? = null

    for ((index, ngramName) in ngramNames.withIndex()) {
        println("Writing $ngramName...")
        inputPath.asLineSequenceResource { lines ->
            val model = TrainingDataLanguageModel.fromText(
                lines,
                language,
                index + 1,
                charClass,
                lowerOrderModel?.absoluteFrequencies ?: emptyMap()
            )
            lowerOrderModel = model
            File("$outputPath/$ngramName.json").bufferedWriter().use { writer ->
                writer.write(model.toJson())
            }
        }
        // Only the very last status line is printed without a trailing blank line.
        println(if (index == ngramNames.lastIndex) "Done." else "Done.\n")
    }
}
@@ -0,0 +1,144 @@
package com.github.pemistahl.lingua.api

import org.assertj.core.api.Assertions
import org.assertj.core.api.Assertions.assertThat
import org.junit.jupiter.api.AfterEach
import org.junit.jupiter.api.BeforeEach
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.io.TempDir
import java.nio.file.Files
import java.nio.file.Path
import java.util.stream.Collectors
import java.util.stream.Collectors.toList

/**
 * Tests that [LanguageModelFilesWriter] writes the five expected language model
 * files for a small English sample text.
 */
class LanguageModelFilesWriterTest {

    /** Temporary training file, created before and deleted after each test. */
    private lateinit var inputFilePath: Path

    /** Matches a line break plus following whitespace, to collapse the JSON below onto one line. */
    private val replacementRegex = Regex("\n\\s*")

    /** Lower-cased training text written to [inputFilePath]. */
    private val text = """
        These sentences are intended for testing purposes.
        Do not use them in production!
        By the way, they consist of 23 words in total.
    """.toLowerCase().trimIndent()

    private val expectedUnigramLanguageModel = """
        {
            "language":"ENGLISH",
            "ngrams":{
                "3/100":"a c p u y",
                "1/100":"b g l m",
                "1/20":"d r",
                "7/50":"e",
                "1/50":"f w",
                "1/25":"h",
                "3/50":"i",
                "1/10":"n o s",
                "13/100":"t"
            }
        }
    """.replace(replacementRegex, "")

    private val expectedBigramLanguageModel = """
        {
            "language":"ENGLISH",
            "ngrams":{
                "1/5":"de do ds du rd re ro rp nt on st or ot",
                "1/3":"pr pu uc ur us al ar ay ce co ct po",
                "1/14":"ed em ey",
                "2/3":"in",
                "1/6":"io is",
                "3/14":"en",
                "2/7":"es",
                "1/10":"nc nd ng no ns od si of os",
                "1/2":"fo wa wo",
                "2/5":"se",
                "1/1":"by he",
                "1/13":"ta to",
                "3/13":"te",
                "4/13":"th",
                "2/13":"ti"
            }
        }
    """.replace(replacementRegex, "")

    private val expectedTrigramLanguageModel = """
        {
            "language":"ENGLISH",
            "ngrams":{
                "1/1":"rds ose ded con use ion ist pur cti wor tal uct pro odu nsi rod for ces nce not pos are tot sis nte way nde rpo the urp duc",
                "1/4":"est hem hes hey sen ses ing int ese",
                "1/2":"tin tio ota sti ord ons",
                "2/3":"ten",
                "1/3":"tes end enc ent"
            }
        }
    """.replace(replacementRegex, "")

    private val expectedQuadrigramLanguageModel = """
        {
            "language":"ENGLISH",
            "ngrams":{
                "1/1":"onsi sist ende ords esti oduc nces rpos ting nsis nten tota cons tion prod otal test ence pose oses nded inte urpo duct sent stin ucti ente purp ctio rodu word hese",
                "1/2":"tenc tend",
                "1/4":"thes they them"
            }
        }
    """.replace(replacementRegex, "")

    private val expectedFivegramLanguageModel = """
        {
            "language":"ENGLISH",
            "ngrams":{
                "1/1":"testi sente ences tende ducti these onsis total uctio enten poses ction produ inten nsist words sting purpo tence estin roduc urpos rpose ended oduct consi",
                "1/2":"ntenc ntend"
            }
        }
    """.replace(replacementRegex, "")

    @BeforeEach
    fun beforeEach() {
        inputFilePath = Files.createTempFile(null, null)
        inputFilePath.toFile().writeText(text)
    }

    @AfterEach
    fun afterEach() {
        inputFilePath.toFile().delete()
    }

    @Test
    fun createAndWriteLanguageModelFiles(@TempDir outputDirectoryPath: Path) {
        LanguageModelFilesWriter.createAndWriteLanguageModelFiles(
            inputFilePath = inputFilePath,
            outputDirectoryPath = outputDirectoryPath,
            language = Language.ENGLISH,
            charClass = "IsLatin"
        )

        // Files.list returns a stream backed by an open directory handle,
        // so it is closed explicitly once the paths have been collected.
        val directoryEntries = Files.list(outputDirectoryPath)
        val modelFilePaths = try {
            directoryEntries.sorted { first, second ->
                first.fileName.compareTo(second.fileName)
            }.collect(toList())
        } finally {
            directoryEntries.close()
        }

        assertThat(modelFilePaths.size).`as`("number of language model files").isEqualTo(5)

        // The paths are sorted by file name: bigrams, fivegrams, quadrigrams, trigrams, unigrams.
        testModelFile(modelFilePaths[4], "unigrams.json", expectedUnigramLanguageModel)
        testModelFile(modelFilePaths[0], "bigrams.json", expectedBigramLanguageModel)
        testModelFile(modelFilePaths[3], "trigrams.json", expectedTrigramLanguageModel)
        testModelFile(modelFilePaths[2], "quadrigrams.json", expectedQuadrigramLanguageModel)
        testModelFile(modelFilePaths[1], "fivegrams.json", expectedFivegramLanguageModel)
    }

    /** Asserts that the file at [modelFilePath] has the expected name and the expected content. */
    private fun testModelFile(
        modelFilePath: Path,
        expectedFileName: String,
        expectedModelContent: String
    ) {
        val modelFileName = modelFilePath.fileName.toString()
        val modelContent = modelFilePath.toFile().readText()

        assertThat(modelFileName).`as`("model file name").isEqualTo(expectedFileName)
        assertThat(modelContent).`as`("model content").isEqualTo(expectedModelContent)
    }
}
Expand Up @@ -35,7 +35,7 @@ class TrainingDataLanguageModelTest {
Fraction(numerator, denominator)
}

private val unigramLanguageModelJson = """
private val expectedUnigramLanguageModel = """
{
"language":"ENGLISH",
"ngrams":{
Expand Down Expand Up @@ -249,12 +249,12 @@ class TrainingDataLanguageModelTest {
charClass = "IsLatin",
lowerNgramAbsoluteFrequencies = emptyMap()
)
assertThat(model.toJson()).isEqualTo(unigramLanguageModelJson)
assertThat(model.toJson()).isEqualTo(expectedUnigramLanguageModel)
}

@Test
fun `assert that unigram language model is correctly deserialized from json`() {
val model = TrainingDataLanguageModel.fromJson(unigramLanguageModelJson)
val model = TrainingDataLanguageModel.fromJson(expectedUnigramLanguageModel)
assertThat(model.language).isEqualTo(Language.ENGLISH)
assertThat(model.absoluteFrequencies).isEmpty()
assertThat(model.relativeFrequencies).containsExactlyInAnyOrderEntriesOf(expectedUnigramRelativeFrequencies)
Expand Down

0 comments on commit d64574b

Please sign in to comment.