In [1]:
%useLatestDescriptors
%use dataframe
%use kandy
%use kandy-geo
%use ktor-client

@file:DependsOn("org.jetbrains.kotlinx:kotlinx-io-core:0.8.0")
@file:DependsOn("org.apache.commons:commons-compress:1.28.0")

import kotlinx.io.asInputStream
import kotlinx.io.buffered
import kotlinx.io.files.Path
import kotlinx.io.files.SystemFileSystem
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream
import kotlin.random.Random

In [2]:
// Base URL of the remote datasets folder and the housing archive inside it.
private val remoteDataUrl = "https://github.com/ageron/handson-ml/raw/master/datasets"
private val housingUrl = "$remoteDataUrl/housing/housing.tgz"

// Local directory that receives the downloaded archive and the extracted CSV.
// The HOUSING_DATA_DIR environment variable overrides it so the notebook can run on
// another machine; when it is unset or blank the original location is used.
private val localDataDir = System.getenv("HOUSING_DATA_DIR")?.takeIf { it.isNotBlank() }
    ?: "/Users/markn/IdeaProjects/ml-project/datasets"
private val housingTgzFile = "housing.tgz"
private val housingCsvFile = "housing.csv"

// Full local paths of the archive and of the CSV extracted from it.
val housingTgzPath = Path("$localDataDir/$housingTgzFile")
val housingCsvPath = Path("$localDataDir/$housingCsvFile")

/**
 * Downloads the housing archive from [housingUrl] and stores it at [housingTgzPath].
 *
 * The whole response body is read into memory first; the local data directory is
 * created afterwards (if it does not exist yet) and the bytes are written to disk.
 *
 * @return the path of the written archive, i.e. [housingTgzPath].
 */
fun loadHousingTgz(): Path {
    // NOTE(review): `http` is presumably supplied by the `%use ktor-client` notebook integration.
    val archiveBytes = http.get(housingUrl).readBytes()

    SystemFileSystem.createDirectories(Path(localDataDir))
    SystemFileSystem.sink(housingTgzPath).buffered().use { archiveSink ->
        archiveSink.write(archiveBytes)
    }

    return housingTgzPath
}

/**
 * Extracts the entry named [housingCsvFile] from the gzip-compressed tar archive at
 * [path] and writes its content to [housingCsvPath].
 *
 * The whole archive is scanned; if several entries carry the expected name, each one
 * overwrites the previous output, so the last one wins.
 *
 * @param path location of the `.tgz` archive to read.
 * @return the path of the extracted CSV file, i.e. [housingCsvPath].
 * @throws IllegalStateException if the archive contains no entry named [housingCsvFile];
 *   without this check the function would return a path that was never written.
 */
fun extractHousingCsv(path: Path): Path {
    var csvWritten = false

    SystemFileSystem.source(path).buffered().use { source ->
        GzipCompressorInputStream(source.asInputStream()).use { gzipStream ->
            TarArchiveInputStream(gzipStream).use { tarStream ->
                var entry = tarStream.nextEntry
                while (entry != null) {
                    if (entry.name == housingCsvFile) {
                        // readBytes() here reads the data of the entry the tar stream is positioned on.
                        SystemFileSystem.sink(housingCsvPath).buffered().use { sink ->
                            sink.write(tarStream.readBytes())
                        }
                        csvWritten = true
                    }
                    entry = tarStream.nextEntry
                }
            }
        }
    }

    check(csvWritten) { "Entry '$housingCsvFile' not found in archive $path" }
    return housingCsvPath
}

In [None]:
// Download the housing archive, then unpack the CSV it contains.
// extractHousingCsv returns the CSV path, which is the last expression of this cell.
val housingTgz = loadHousingTgz()
extractHousingCsv(housingTgz)

In [3]:
// Read the extracted CSV into a DataFrame; the kotlinx-io Path is passed on via its string form.
val housing = DataFrame.readCsv(File(housingCsvPath.toString()))

In [4]:
// One histogram per column of `housing`, collected in the order they appear in the grid.
val plots = buildList {
    add(housing.plot { histogram(longitude) })
    add(housing.plot { histogram(latitude) })
    add(housing.plot { histogram(housing_median_age) })

    add(housing.plot { histogram(total_rooms) })
    // Null values are dropped from total_bedrooms before it is binned.
    add(housing.plot { histogram(total_bedrooms.dropNulls()) })
    add(housing.plot { histogram(population) })

    add(housing.plot { histogram(households) })
    add(housing.plot { histogram(median_income) })
    add(housing.plot { histogram(median_house_value) })
}
plotGrid(plots, nCol = 3)

In [5]:
// Adds "income_cat": median_income divided by 1.5, rounded up, then limited to the range 1.0..5.0.
val housingExt = housing.add("income_cat") {
    val scaledIncome = ceil(median_income / 1.5)
    scaledIncome.coerceIn(1.0, 5.0)
}

In [6]:
/**
 * Splits this frame into a train part and a test part after shuffling its rows.
 *
 * The test part holds the first `rowsCount() * testFraction` rows (rounded to the
 * nearest integer) of the shuffled frame; the train part holds the remaining rows.
 *
 * @param random source of randomness for the shuffle; defaults to a generator seeded with 42.
 * @param testFraction share of rows to put into the test part, within `0.0..1.0`.
 * @return a pair of (train, test) frames.
 * @throws IllegalArgumentException if [testFraction] is outside `0.0..1.0`.
 */
fun <T> DataFrame<T>.trainTestSplit(
    random: Random = Random(42),
    testFraction: Double,
): Pair<DataFrame<T>, DataFrame<T>> {
    require(testFraction in 0.0..1.0) { "testFraction must be between 0 and 1" }

    val testRowCount = (rowsCount() * testFraction).roundToInt()
    val shuffledRows = shuffle(random)

    return shuffledRows.drop(testRowCount) to shuffledRows.take(testRowCount)
}

/**
 * Splits this frame into train and test parts group by group, where the groups are
 * formed by the columns selected with [stratCols].
 *
 * Each group is shuffled on its own. Its first `rows * testFraction` rows (truncated
 * towards zero, but never fewer than one) go to the test part and the rest go to the
 * train part. The per-group pieces are then concatenated.
 *
 * @param random source of randomness shared by all group shuffles; defaults to a generator seeded with 42.
 * @param testFraction share of each group's rows to put into the test part, within `0.0..1.0`.
 * @param stratCols selector of the columns to group by.
 * @return a pair of (train, test) frames.
 * @throws IllegalArgumentException if [testFraction] is outside `0.0..1.0`.
 */
fun <T> DataFrame<T>.trainTestSplitStrat(
    random: Random = Random(42),
    testFraction: Double,
    stratCols: ColumnsSelectionDsl<T>.(ColumnsSelectionDsl<T>) -> ColumnsResolver<*>,
): Pair<DataFrame<T>, DataFrame<T>> {
    require(testFraction in 0.0..1.0) { "testFraction must be between 0 and 1" }

    val trainParts = mutableListOf<DataFrame<T>>()
    val testParts = mutableListOf<DataFrame<T>>()

    val shuffledGroups = groupBy(moveToTop = true, cols = stratCols)
        .mapToFrames { group.shuffle(random) }

    shuffledGroups.forEach { groupRows ->
        // At least one row of every group always lands in the test part.
        val testRowCount = (groupRows.rowsCount() * testFraction).toInt().coerceAtLeast(1)
        trainParts += groupRows.drop(testRowCount)
        testParts += groupRows.take(testRowCount)
    }

    return trainParts.concat() to testParts.concat()
}

In [7]:
// Two 80/20 splits of the same frame: a purely random one and one stratified on income_cat.
// Neither call passes `random`, so each uses its default generator seeded with 42.
val (housingTrainRandom, housingTestRandom) = housingExt.trainTestSplit(testFraction = 0.2)
val (housingTrainStrat, housingTestStrat) = housingExt.trainTestSplitStrat(testFraction = 0.2) { income_cat }

In [9]:
// income_cat histograms side by side: full data, random train split, stratified train split.
val plots = buildList {
    add(housingExt.plot { histogram(income_cat) })
    add(housingTrainRandom.plot { histogram(income_cat) })
    add(housingTrainStrat.plot { histogram(income_cat) })
}
plotGrid(plots, nCol = 3)

In [10]:
// Share of rows per income_cat in the full data set.
val fractionsFull = housingExt
    .groupBy { income_cat }
    .count()
    .add("fraction") { row -> (row["count"] as Int).toDouble() / housingExt.count() }
    .select("income_cat", "fraction")

// Share of rows per income_cat in the randomly sampled train set.
val fractionsRandom = housingTrainRandom
    .groupBy { income_cat }
    .count()
    .add("fraction") { row -> (row["count"] as Int).toDouble() / housingTrainRandom.count() }
    .select("income_cat", "fraction")

// Share of rows per income_cat in the stratified train set.
val fractionsStrat = housingTrainStrat
    .groupBy { income_cat }
    .count()
    .add("fraction") { row -> (row["count"] as Int).toDouble() / housingTrainStrat.count() }
    .select("income_cat", "fraction")

In [11]:
// Joins the three fraction tables and adds, for each sampling method, the relative
// deviation from the overall fraction expressed in percent.
val fractionDf = fractionsFull
    .sortBy { income_cat }
    .rename { fraction }.into { "overall" }
    .join(fractionsRandom.rename { fraction }.into { "random" })
    .join(fractionsStrat.rename { fraction }.into { "strat" })
    .add("random_error") { row ->
        val overall = row["overall"] as Double
        (row["random"] as Double - overall) / overall * 100
    }
    .add("strat_error") { row ->
        val overall = row["overall"] as Double
        (row["strat"] as Double - overall) / overall * 100
    }

println(fractionDf)

   income_cat  overall   random    strat random_error strat_error
 0        1,0 0,039826 0,040516 0,039845     1,733577    0,048709
 1        2,0 0,318847 0,321705 0,318820     0,896520   -0,008313
 2        3,0 0,350581 0,348534 0,350551    -0,583886   -0,008656
 3        4,0 0,176308 0,174540 0,176335    -1,003023    0,015366
 4        5,0 0,114438 0,114704 0,114448     0,232854    0,009055



In [106]:
// Read the GeoJSON (presumably one feature per US state - TODO confirm), keep only the
// rows whose "name" is "California", and rename the "geometry" column to "geo".
val californiaGeoDf = GeoDataFrame
    .readGeoJson("https://raw.githubusercontent.com/AndreiKingsley/datasets/refs/heads/main/USA.json").df
    .filter { it["name"] == "California" }
    .rename("geometry" to "geo")

In [109]:
// Housing districts drawn as points on top of the California outline.
californiaGeoDf.plot {
    // Base layer: the state geometry with alpha 0.0 for the map area and a grey border line.
    geoMap(geo) {
        alpha = 0.0
        borderLine {
            width = 1.0
            color = Color.GREY
        }
    }
    // Overlay layer: switch the data source to housingExt for the points.
    withData(housingExt) {
        points {
            x(longitude)
            y(latitude)
            alpha = 0.4
            // Point size follows population, mapped onto the range 1.5..4.5.
            size(population) {
                scale = continuous(1.5..4.5)
            }
            // Point colour follows median_house_value on a hue scale spanning 180..360.
            color(median_house_value) {
                scale = continuousColorHue(180..360)
            }
        }
    }
}