In [None]:
import java.nio.file.Files
import java.nio.file.Path
import java.util.stream.Collectors
import kotlin.math.ln

/**
 * One message of the corpus.
 *
 * @property type class label used as an index into per-class statistics; the loader in this file
 *   assigns 0 to files whose name contains "spmsg" and 1 to all others.
 * @property words the message content encoded as integers (the loader stores hashed word n-grams here).
 */
data class Message(val type: Int, val words: List<Int>)

/**
 * Loads the tokenised message corpus from disk and converts every message into a list of
 * hashed word n-grams.
 *
 * The corpus is read from numbered directories named `messagePrefix + index` (index starting
 * at 1); every regular file found below such a directory is treated as one message.
 *
 * @property n number of consecutive words combined into one n-gram; must be positive.
 * @property messagePrefix path prefix of the part directories; the part number is appended to it.
 */
class DataFrame(
    private val n: Int,
    private val messagePrefix: String,
) {
    init {
        require(n > 0) { "n-gram size must be positive, but was $n" }
    }

    /**
     * Reads the corpus parts `1..partCount`.
     *
     * @param partCount number of part directories to read.
     * @return one list of messages per part, in part order.
     */
    fun dataFrame(partCount: Int = DEFAULT_PART_COUNT): List<List<Message>> =
        (1..partCount).map { readPart(it) }

    /** Reads every regular file below part directory number [p] as a [Message]. */
    private fun readPart(p: Int): List<Message> =
        // Files.walk holds open directory handles until the stream is closed, hence `use`.
        Files.walk(Path.of(messagePrefix + p)).use { paths ->
            paths
                // Keep only files: the walk also yields the root directory and any sub-directories.
                .filter { Files.isRegularFile(it) }
                // The walk order is file-system dependent; sort for reproducible results.
                .sorted()
                .collect(Collectors.toList())
        }.map { file ->
            Message(
                convertFileName(file.fileName.toString()),
                convertData(file)
            )
        }

    /** Derives the class label from the file name: 0 when it contains "spmsg", 1 otherwise. */
    private fun convertFileName(fileName: String) = if (fileName.contains("spmsg")) 0 else 1

    /**
     * Parses the space-separated integer tokens of [path] and hashes every window of [n]
     * consecutive tokens into a single `Int`.
     *
     * @return one hash per n-gram (`tokens - n + 1` values), or an empty list when the message
     *   has fewer than [n] tokens.
     * @throws NumberFormatException if a token after the first one is not an integer.
     */
    private fun convertData(path: Path): List<Int> {
        val words = Files
            .readAllLines(path)
            .flatMap { line -> line.split(" ").filter { token -> token.isNotEmpty() } }
            // The first token of the file is skipped — presumably a non-numeric header label
            // such as "Subject:"; TODO confirm against the corpus format.
            .drop(1)
            .map { it.toInt() }

        // `windowed` yields every full window, including the last one. The polynomial hash is
        // order-sensitive; Int overflow wraps around, which is acceptable for hashing.
        // For n == 1 the hash is the word itself.
        return words.windowed(n) { gram -> gram.fold(0) { hash, word -> hash * PRIME + word } }
    }

    private companion object {
        /** Multiplier of the polynomial n-gram hash. */
        const val PRIME = 31

        /** Number of part directories read when the caller does not specify one. */
        const val DEFAULT_PART_COUNT = 10
    }
}

In [None]:
/**
 * Multinomial naive Bayes classifier with additive smoothing, working on integer-encoded words.
 *
 * All scores are computed in log space.
 *
 * @property train labelled training messages; every `type` must lie in `0 until countClasses`.
 * @property lambdas per-class multipliers applied to the class prior (one entry per class);
 *   a larger value biases predictions towards that class.
 * @property alpha additive smoothing constant for word likelihoods.
 * @property countClasses number of classes.
 */
class NaiveBayesClassifier(
    private val train: List<Message>,
    private val lambdas: List<Int>,
    private val alpha: Double,
    private val countClasses: Int = 2
) {
    /** Vocabulary: every distinct word seen in the training set. */
    private val words = HashSet<Int>()

    /** Number of training messages per class. */
    private val classDocumentSize = DoubleArray(countClasses)

    /** Total number of word occurrences per class. */
    private val classWordSize = DoubleArray(countClasses)

    /** Occurrence count of every word, per class. */
    private val classWordCount = List(countClasses) { HashMap<Int, Int>() }

    /** Smoothed likelihood of every seen word per class, stored as (numerator, denominator). */
    private val classWordProbability = List(countClasses) { HashMap<Int, Pair<Double, Double>>() }

    /**
     * Collects the statistics of the training set. Calling it again recomputes them from scratch.
     *
     * @return this classifier, to allow call chaining.
     */
    fun fit(): NaiveBayesClassifier {
        resetStatistics()

        train.forEach { message ->
            words.addAll(message.words)
            classDocumentSize[message.type]++
            classWordSize[message.type] += message.words.size.toDouble()
            message.words.forEach { word ->
                classWordCount[message.type].merge(word, 1, Int::plus)
            }
        }

        classWordCount.forEachIndexed { cls, counts ->
            val denominator = smoothedDenominator(cls)
            counts.forEach { (word, count) ->
                classWordProbability[cls][word] = Pair(alpha + count, denominator)
            }
        }

        return this
    }

    /**
     * Scores a message against every class.
     *
     * @param message integer-encoded words of the message.
     * @return for each class, the log of (weighted prior x product of word likelihoods);
     *   the values are unnormalised log-scores, not probabilities.
     */
    fun predict(message: List<Int>): List<Double> =
        List(countClasses) { cls ->
            // A word never seen in this class gets the same smoothed denominator as seen words
            // (total word occurrences of the class), with a zero count in the numerator.
            val unseen = Pair(alpha, smoothedDenominator(cls))
            val logPrior = ln(classDocumentSize[cls] * lambdas[cls]) - ln(train.size.toDouble())
            message.fold(logPrior) { logScore, word ->
                val (numerator, denominator) = classWordProbability[cls][word] ?: unseen
                logScore + (ln(numerator) - ln(denominator))
            }
        }

    /**
     * Fraction of [test] messages whose true class has the highest score (ties count as correct).
     *
     * @return a value in `[0, 1]`, or `NaN` when [test] is empty.
     */
    fun accuracy(test: List<Message>): Double {
        val correct = test.count { message ->
            val scores = predict(message.words)
            val own = scores[message.type]
            // Works for any number of classes; for two classes this is `own >= other`.
            scores.all { other -> own >= other }
        }
        return correct.toDouble() / test.size
    }

    /** Denominator of the smoothed word likelihood for class [cls]. */
    private fun smoothedDenominator(cls: Int): Double = classWordSize[cls] + alpha * words.size

    /** Clears everything accumulated by a previous [fit] call. */
    private fun resetStatistics() {
        words.clear()
        classDocumentSize.fill(0.0)
        classWordSize.fill(0.0)
        classWordCount.forEach { it.clear() }
        classWordProbability.forEach { it.clear() }
    }
}

In [None]:
// Common path prefix of the corpus part directories; the part number is appended to it.
val MESSAGES_PREFIX: String = "./messages/part"

/**
 * Cross-validated accuracy of the naive Bayes classifier: every corpus part is used once as the
 * test set while all other parts form the training set.
 *
 * @param lambdas per-class prior multipliers passed to the classifier.
 * @param alpha additive smoothing constant passed to the classifier.
 * @param n n-gram size used when loading the corpus.
 * @return mean accuracy over all folds.
 */
fun accuracy(lambdas: List<Int>, alpha: Double, n: Int): Double {
    // The corpus depends only on n, not on the fold, so it is read from disk once
    // instead of once per fold.
    val parts = DataFrame(n, MESSAGES_PREFIX).dataFrame()
    val countParts = parts.size

    val totalAccuracy = parts.indices.sumOf { testPart ->
        val train: List<Message> = parts.filterIndexed { index, _ -> index != testPart }.flatten()
        NaiveBayesClassifier(train, lambdas, alpha).fit().accuracy(parts[testPart])
    }
    return totalAccuracy / countParts
}

/**
 * Exhaustive grid search over the n-gram size, the weight of class 1 and the smoothing constant,
 * scored by cross-validated [accuracy]. Progress is printed for every combination.
 *
 * @return the best combination as `[n, lambda, alpha, accuracy]`; the first combination wins ties.
 * @throws IllegalStateException if no combination produced a comparable (non-NaN) accuracy.
 */
fun hyperparameterOptimization(): MutableList<Any> {
    var bestParams: MutableList<Any>? = null
    // Start below every possible accuracy so that a best score of exactly 0.0 is still recorded.
    var accuracyBest = Double.NEGATIVE_INFINITY
    for (n in 1..3) {
        for (lambda in 1..10) {
            for (alpha in listOf(1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001)) {
                val cur = accuracy(listOf(1, lambda), alpha, n)
                if (cur > accuracyBest) {
                    accuracyBest = cur
                    bestParams = mutableListOf(n, lambda, alpha)
                }
                println(
                    """
                        Current iteration:
                        n = $n
                        lambda = $lambda
                        alpha = $alpha
                        accuracy = $cur
                    """.trimIndent()
                )
            }
        }
    }

    return checkNotNull(bestParams) { "no parameter combination produced a valid accuracy" }
        .also { it.add(accuracyBest) }
}

In [None]:
// Run the full grid search; the result is [n, lambda, alpha, accuracy].
val optimalParams: MutableList<Any> = hyperparameterOptimization()

Current iteration:
n = 1
lambda = 1
alpha = 1.0
accuracy = 0.9018348623853212
Current iteration:
n = 1
lambda = 1
alpha = 0.1
accuracy = 0.9045871559633027
Current iteration:
n = 1
lambda = 1
alpha = 0.01
accuracy = 0.9302752293577982
Current iteration:
n = 1
lambda = 1
alpha = 0.001
accuracy = 0.9550458715596329
Current iteration:
n = 1
lambda = 1
alpha = 1.0E-4
accuracy = 0.9724770642201837
Current iteration:
n = 1
lambda = 1
alpha = 1.0E-5
accuracy = 0.9770642201834864
Current iteration:
n = 1
lambda = 1
alpha = 1.0E-6
accuracy = 0.9798165137614682
Current iteration:
n = 1
lambda = 1
alpha = 1.0E-7
accuracy = 0.9816513761467892
Current iteration:
n = 1
lambda = 2
alpha = 1.0
accuracy = 0.9027522935779817
Current iteration:
n = 1
lambda = 2
alpha = 0.1
accuracy = 0.9045871559633027
Current iteration:
n = 1
lambda = 2
alpha = 0.01
accuracy = 0.9284403669724771
Current iteration:
n = 1
lambda = 2
alpha = 0.001
accuracy = 0.9559633027522937
Current iteration:
n = 1
lambda = 2
alpha = 1.

Current iteration:
n = 2
lambda = 4
alpha = 1.0
accuracy = 0.9275229357798166
Current iteration:
n = 2
lambda = 4
alpha = 0.1
accuracy = 0.9339449541284404
Current iteration:
n = 2
lambda = 4
alpha = 0.01
accuracy = 0.940366972477064
Current iteration:
n = 2
lambda = 4
alpha = 0.001
accuracy = 0.9477064220183486
Current iteration:
n = 2
lambda = 4
alpha = 1.0E-4
accuracy = 0.9504587155963302
Current iteration:
n = 2
lambda = 4
alpha = 1.0E-5
accuracy = 0.9394495412844037
Current iteration:
n = 2
lambda = 4
alpha = 1.0E-6
accuracy = 0.9321100917431193
Current iteration:
n = 2
lambda = 4
alpha = 1.0E-7
accuracy = 0.9275229357798164
Current iteration:
n = 2
lambda = 5
alpha = 1.0
accuracy = 0.9266055045871558
Current iteration:
n = 2
lambda = 5
alpha = 0.1
accuracy = 0.9339449541284404
Current iteration:
n = 2
lambda = 5
alpha = 0.01
accuracy = 0.9412844036697248
Current iteration:
n = 2
lambda = 5
alpha = 0.001
accuracy = 0.9477064220183486
Current iteration:
n = 2
lambda = 5
alpha = 1.0

Current iteration:
n = 3
lambda = 6
alpha = 1.0E-7
accuracy = 0.9706422018348626
Current iteration:
n = 3
lambda = 7
alpha = 1.0
accuracy = 0.9055045871559633
Current iteration:
n = 3
lambda = 7
alpha = 0.1
accuracy = 0.9110091743119266
Current iteration:
n = 3
lambda = 7
alpha = 0.01
accuracy = 0.91651376146789
Current iteration:
n = 3
lambda = 7
alpha = 0.001
accuracy = 0.9311926605504588
Current iteration:
n = 3
lambda = 7
alpha = 1.0E-4
accuracy = 0.9559633027522937
Current iteration:
n = 3
lambda = 7
alpha = 1.0E-5
accuracy = 0.9651376146788992
Current iteration:
n = 3
lambda = 7
alpha = 1.0E-6
accuracy = 0.9678899082568808
Current iteration:
n = 3
lambda = 7
alpha = 1.0E-7
accuracy = 0.9706422018348626
Current iteration:
n = 3
lambda = 8
alpha = 1.0
accuracy = 0.9064220183486238
Current iteration:
n = 3
lambda = 8
alpha = 0.1
accuracy = 0.9119266055045874
Current iteration:
n = 3
lambda = 8
alpha = 0.01
accuracy = 0.91651376146789
Current iteration:
n = 3
lambda = 8
alpha = 0.001

In [None]:
// Display the best parameters found by the grid search as the cell result.
optimalParams

[1, 1, 1.0E-7, 0.9816513761467892]

In [None]:
%use lets-plot
// Parameters used for the plots below.
val n = 2
val lambda = 4
val alpha = 1e-7

// Fixed hold-out split: the first corpus part is the test set, all remaining parts are the training set.
val parts = DataFrame(n, MESSAGES_PREFIX).dataFrame()
val train: List<Message> = parts.drop(1).flatten()
val test: List<Message> = parts.first()

/**
 * Plots the ROC curve of the classifier on the hold-out [test] set, with legitimate messages
 * (type 1) as the positive class: the y axis is the share of legitimate messages ranked so far,
 * the x axis the share of spam messages ranked so far.
 */
fun roc() {
    // Class weights shift every log-odds score by the same constant, so they cannot change the
    // ranking; the argument order simply matches the other call sites in this notebook.
    val classifier = NaiveBayesClassifier(train, listOf(1, lambda), alpha).fit()

    // predict() returns per-class log-scores, so messages are ranked by the log-odds of
    // "legit" versus "spam". Each score stays paired with the TRUE label of its message,
    // because the curve must step on the actual class of every ranked sample.
    val rankedLabels = test
        .map { message ->
            val logScores = classifier.predict(message.words)
            Pair(logScores[1] - logScores[0], message.type)
        }
        .sortedByDescending { it.first }
        .map { it.second }

    val legitCount = rankedLabels.count { it == 1 }
    val spamCount = rankedLabels.size - legitCount

    val stepY = 1.0 / legitCount
    val stepX = 1.0 / spamCount

    // The curve starts at the origin (nothing classified as positive yet).
    val xAxis = mutableListOf(0.0)
    val yAxis = mutableListOf(0.0)

    var xCur = 0.0
    var yCur = 0.0

    for (label in rankedLabels) {
        if (label == 1) {
            yCur += stepY // true positive: a legitimate message ranked as legitimate
        } else {
            xCur += stepX // false positive: a spam message ranked as legitimate
        }
        xAxis.add(xCur)
        yAxis.add(yCur)
    }

    val p = lets_plot(mapOf("x" to xAxis, "y" to yAxis)) + ggsize(700, 400) + geomPoint(
        color = "dark-green",
        size = 4.0
    ) { x = "x"; y = "y" }
    p.show()
}

// Render the ROC curve for the hold-out split defined above.
roc()

/**
 * Draws two scatter plots on the hold-out split:
 *  1. overall accuracy against the weight of class 1 (1, 10, 20, ..., 990);
 *  2. accuracy on the type-1 messages only, with the weight scaled by 10000.
 */
fun plot_accuracy() {
    val lambdas = listOf(1) + (10 until 1000 step 10)
    val legitMessages = test.filter { it.type == 1 }

    // Accuracy over the whole test set for every weight.
    val overallAccuracy = lambdas.map { weight ->
        NaiveBayesClassifier(train, listOf(1, weight), alpha).fit().accuracy(test)
    }
    // Accuracy restricted to the type-1 messages, using the much larger scaled weights.
    val legitAccuracy = lambdas.map { weight ->
        NaiveBayesClassifier(train, listOf(1, weight * 10000), alpha).fit().accuracy(legitMessages)
    }

    val overallPlot = lets_plot(mapOf("lambda" to lambdas, "accuracy" to overallAccuracy)) +
        ggsize(700, 400) +
        geomPoint(color = "dark-green", size = 4.0) { x = "lambda"; y = "accuracy" }
    overallPlot.show()

    val scaledLambdas = lambdas.map { it * 10000 }
    val legitPlot = lets_plot(mapOf("lambda" to scaledLambdas, "accuracy legit" to legitAccuracy)) +
        ggsize(700, 400) +
        geomPoint(color = "dark-green", size = 4.0) { x = "lambda"; y = "accuracy legit" }
    legitPlot.show()
}

// Render both accuracy-versus-lambda plots.
plot_accuracy()