In [52]:
%useLatestDescriptors
%use dataframe
%use kandy
%use ktor-client

@file:DependsOn("org.jetbrains.kotlinx:kotlinx-io-core:0.8.0")
@file:DependsOn("org.apache.commons:commons-compress:1.28.0")

In [2]:
import kotlinx.io.asInputStream
import kotlinx.io.buffered
import kotlinx.io.files.Path
import kotlinx.io.files.SystemFileSystem
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream

// Remote location of the housing dataset archive.
private val remoteDataUrl = "https://github.com/ageron/handson-ml/raw/master/datasets"
private val housingUrl = "$remoteDataUrl/housing/housing.tgz"

// Local directory that holds the downloaded archive and the extracted CSV.
// The HOUSING_DATA_DIR environment variable, when set to a non-blank value, overrides
// the original machine-specific default so the notebook can run on other machines.
private val localDataDir = System.getenv("HOUSING_DATA_DIR")?.takeIf { it.isNotBlank() }
    ?: "/Users/markn/IdeaProjects/ml-project/datasets"
private val housingTgzFile = "housing.tgz"
private val housingCsvFile = "housing.csv"

// Full paths of the archive and of the CSV inside the local data directory.
val housingTgzPath = Path("$localDataDir/$housingTgzFile")
val housingCsvPath = Path("$localDataDir/$housingCsvFile")

/**
 * Downloads the housing archive from [housingUrl] and stores it at [housingTgzPath].
 *
 * The whole response body is read into memory first; the local data directory is
 * created before the file is written.
 *
 * @return [housingTgzPath], the location the archive was written to
 */
fun loadHousingTgz(): Path {
    val archiveBytes = http.get(housingUrl).readBytes()

    val targetDir = Path(localDataDir)
    SystemFileSystem.createDirectories(targetDir)

    val archiveSink = SystemFileSystem.sink(housingTgzPath).buffered()
    archiveSink.use { sink -> sink.write(archiveBytes) }

    return housingTgzPath
}

/**
 * Extracts the entry named [housingCsvFile] from the gzip-compressed tar archive at [path]
 * and writes its content to [housingCsvPath].
 *
 * Every entry of the archive is visited; each entry with the expected name overwrites the
 * target file, so the last matching entry wins.
 *
 * @param path location of the `.tgz` archive to read
 * @return [housingCsvPath], the location the CSV was written to
 * @throws IllegalStateException if the archive contains no entry named [housingCsvFile]
 */
fun extractHousingCsv(path: Path): Path {
    var extracted = false
    SystemFileSystem.source(path).buffered().use { source ->
        GzipCompressorInputStream(source.asInputStream()).use { gzipStream ->
            TarArchiveInputStream(gzipStream).use { tarStream ->
                // nextEntry returns null once the archive is exhausted, which ends the sequence.
                generateSequence { tarStream.nextEntry }
                    .filter { entry -> entry.name == housingCsvFile }
                    .forEach { _ ->
                        // The tar stream is positioned on the matching entry here, so
                        // readBytes() yields exactly that entry's content.
                        SystemFileSystem.sink(housingCsvPath).buffered().use { sink ->
                            sink.write(tarStream.readBytes())
                        }
                        extracted = true
                    }
            }
        }
    }
    // Fail loudly instead of returning a path to a file this call never produced.
    check(extracted) { "No entry named '$housingCsvFile' found in archive $path" }
    return housingCsvPath
}

In [None]:
// Download the archive first, then unpack the CSV from the downloaded file.
val housingTgz: Path = loadHousingTgz()
extractHousingCsv(path = housingTgz)

In [3]:
// Load the extracted CSV into a DataFrame.
val housing = File(housingCsvPath.toString()).let { csvFile -> DataFrame.readCsv(csvFile) }

In [64]:
// One histogram per housing column, shown as a grid with three plots per row.
val plots = buildList {
    // First row
    add(housing.plot { histogram(longitude) })
    add(housing.plot { histogram(latitude) })
    add(housing.plot { histogram(housing_median_age) })

    // Second row; nulls are dropped from total_bedrooms before it is passed to histogram
    add(housing.plot { histogram(total_rooms) })
    add(housing.plot { histogram(total_bedrooms.dropNulls()) })
    add(housing.plot { histogram(population) })

    // Third row
    add(housing.plot { histogram(households) })
    add(housing.plot { histogram(median_income) })
    add(housing.plot { histogram(median_house_value) })
}
plotGrid(plots, nCol = 3)

In [13]:
// Adds an "income_cat" column: median_income divided by 1.5, rounded up,
// then clamped to the range 1.0..5.0.
val housingExt = housing.add("income_cat") {
    val scaledIncome = ceil(median_income / 1.5)
    scaledIncome.coerceIn(1.0, 5.0)
}

In [47]:
/**
 * Draws a stratified sample: rows are grouped by [cols] and a share of each group is kept.
 *
 * Each group is shuffled with [random] and its first `(groupSize * fraction).toInt()` rows
 * are taken (truncated, but never fewer than one row per group). The per-group samples are
 * then concatenated into a single frame.
 *
 * `Random` is written with its fully qualified name so that this definition does not depend
 * on an `import kotlin.random.Random` having been executed in another cell.
 *
 * @param random source of randomness shared by all groups; defaults to a generator seeded with 42
 * @param fraction share of each group to keep; must lie within 0..1
 * @param cols selector of the column(s) that define the groups
 * @return the concatenation of the per-group samples
 * @throws IllegalArgumentException if [fraction] is NaN or lies outside 0..1
 */
fun <T> DataFrame<T>.stratSplit(
    random: kotlin.random.Random = kotlin.random.Random(42),
    fraction: Float,
    cols: ColumnsSelectionDsl<T>.(ColumnsSelectionDsl<T>) -> ColumnsResolver<*>,
): DataFrame<T> {
    // NaN fails the range check as well, so it is rejected here too.
    require(fraction in 0f..1f) { "fraction must be within 0..1, but was $fraction" }
    return groupBy(moveToTop = true, cols)
        .mapToFrames {
            // Keep at least one row so that every group stays represented in the sample.
            val sampleSize = (group.rowsCount() * fraction).toInt().coerceAtLeast(1)
            group.shuffle(random).take(sampleSize)
        }
        .concat()
}

In [43]:
import kotlin.random.Random

// Purely random sample: shuffle with a fixed seed and keep 20% of the row count (rounded).
val housingTrainRandom = housingExt
    .shuffle(Random(42))
    .take((housing.rowsCount() * 0.2).roundToInt())

// Stratified sample: 20% of each income_cat group.
val housingTrainStrat = housingExt.stratSplit(fraction = 0.2f) { income_cat }

In [44]:
// income_cat histograms side by side: full data, random sample, stratified sample.
val plots = buildList {
    add(housingExt.plot { histogram(income_cat) })
    add(housingTrainRandom.plot { histogram(income_cat) })
    add(housingTrainStrat.plot { histogram(income_cat) })
}
plotGrid(plots, nCol = 3)

In [57]:
/**
 * Computes the share of rows per `income_cat` value.
 *
 * Rows are grouped by the `income_cat` column and counted; each count is divided by the
 * total number of rows of the receiver.
 *
 * @return a frame with the columns `income_cat` and `fraction`
 */
fun <T> DataFrame<T>.incomeCatFractions(): DataFrame<T> {
    // Row count of the whole frame, computed once instead of once per group row.
    val totalRows = rowsCount().toDouble()
    return groupBy("income_cat")
        .count()
        .add("fraction") { it["count"] as Int / totalRows }
        .select("income_cat", "fraction")
}

// Share of each income category in the full data set and in both samples.
val fractionsFull = housingExt.incomeCatFractions()

val fractionsRandom = housingTrainRandom.incomeCatFractions()

val fractionsStrat = housingTrainStrat.incomeCatFractions()

In [60]:
// Puts the three fraction tables side by side (joined on their shared column) and adds
// the relative deviation of each sample from the overall fraction, in percent.
val fractionDf = fractionsFull
    .sortBy { income_cat }
    .rename { fraction }.into { "overall" }
    .join(fractionsRandom.rename { fraction }.into { "random" })
    .join(fractionsStrat.rename { fraction }.into { "strat" })
    .add("random_error") { row ->
        val overall = row["overall"] as Double
        val sampled = row["random"] as Double
        (sampled - overall) / overall * 100
    }
    .add("strat_error") { row ->
        val overall = row["overall"] as Double
        val sampled = row["strat"] as Double
        (sampled - overall) / overall * 100
    }

println(fractionDf)

   income_cat  overall   random    strat random_error strat_error
 0        1,0 0,039826 0,037064 0,039748    -6,934307   -0,194954
 1        2,0 0,318847 0,307413 0,318953    -3,586081    0,033270
 2        3,0 0,350581 0,358769 0,350703     2,335544    0,034647
 3        4,0 0,176308 0,183382 0,176200     4,012091   -0,061500
 4        5,0 0,114438 0,113372 0,114397    -0,931414   -0,036242

