In [None]:
%useLatestDescriptors
%use dataframe
%use kandy
%use ktor-client
%use spark

@file:DependsOn("org.jetbrains.kotlinx:kotlinx-io-core:0.8.0")
@file:DependsOn("org.apache.commons:commons-compress:1.28.0")

In [None]:
import kotlinx.io.asInputStream
import kotlinx.io.buffered
import kotlinx.io.files.Path
import kotlinx.io.files.SystemFileSystem
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream

// Remote location of the housing dataset archive.
private val remoteDataUrl: String = "https://github.com/ageron/handson-ml/raw/master/datasets"
private val housingUrl: String = "$remoteDataUrl/housing/housing.tgz"

// Local directory and file names used for the downloaded archive and the extracted CSV.
// NOTE(review): localDataDir is an absolute, machine-specific path.
private val localDataDir: String = "/Users/markn/IdeaProjects/ml-project/datasets"
private val housingTgzFile: String = "housing.tgz"
private val housingCsvFile: String = "housing.csv"

// Full local paths of the archive and of the extracted CSV.
val housingTgzPath: Path = Path("$localDataDir/$housingTgzFile")
val housingCsvPath: Path = Path("$localDataDir/$housingCsvFile")

/**
 * Downloads the housing archive from [housingUrl] and stores it at [housingTgzPath].
 *
 * The response body is read fully into memory, the local data directory is created
 * if needed, and the bytes are then written through a buffered sink.
 *
 * @return [housingTgzPath], the location of the written archive
 */
fun loadHousingTgz() : Path {
    val archiveBytes = http.get(housingUrl).readBytes()
    SystemFileSystem.createDirectories(Path(localDataDir))
    val archiveSink = SystemFileSystem.sink(housingTgzPath).buffered()
    archiveSink.use { sink -> sink.write(archiveBytes) }
    return housingTgzPath
}

/**
 * Extracts the entry named [housingCsvFile] from the gzip-compressed tar archive at [path]
 * and writes it to [housingCsvPath].
 *
 * Archive entries are visited in order and the scan stops at the first entry whose name
 * equals [housingCsvFile]; later entries are not read.
 *
 * @param path location of the `.tgz` archive to read
 * @return [housingCsvPath], the location of the extracted CSV
 * @throws IllegalStateException if the archive contains no entry named [housingCsvFile]
 */
fun extractHousingCsv(path: Path) : Path {
    SystemFileSystem.source(path).buffered().use { source ->
        GzipCompressorInputStream(source.asInputStream()).use { gzipStream ->
            TarArchiveInputStream(gzipStream).use { tarStream ->
                // The sequence is lazy, so after firstOrNull returns the tar stream is
                // still positioned on the matching entry's data.
                val csvEntry = generateSequence { tarStream.nextEntry }
                    .firstOrNull { it.name == housingCsvFile }
                // Fail here with a clear message instead of returning a path to a file
                // that was never written (or is stale from a previous run).
                checkNotNull(csvEntry) { "No entry named '$housingCsvFile' found in archive $path" }

                // Make sure the target directory exists even when the archive at `path`
                // was not produced by loadHousingTgz().
                SystemFileSystem.createDirectories(Path(localDataDir))
                SystemFileSystem.sink(housingCsvPath).buffered().use { sink ->
                    // readBytes() on the tar stream reads only the current entry's content.
                    sink.write(tarStream.readBytes())
                }
            }
        }
    }
    return housingCsvPath
}

In [None]:
// Download the housing archive and keep its local path.
val housingTgz = loadHousingTgz()
// Unpack the CSV from the downloaded archive; the call returns the extracted file's path.
extractHousingCsv(housingTgz)

In [None]:
// Read the extracted CSV into a DataFrame; the kotlinx-io path is passed to File via its string form.
val housing = DataFrame.readCsv(File(housingCsvPath.toString()))

In [None]:
// One histogram per column of the housing frame, laid out three per row.
val plots = listOf(
    housing.plot { histogram(longitude) },
    housing.plot { histogram(latitude) },
    housing.plot { histogram(housing_median_age) },

    housing.plot { histogram(total_rooms) },
    // total_bedrooms has missing values: drop them directly rather than mapping nulls
    // to a -1.0 sentinel and filtering it out, which would also discard a real -1.0.
    housing.plot { histogram(total_bedrooms.dropNulls()) },
    housing.plot { histogram(population) },

    housing.plot { histogram(households) },
    housing.plot { histogram(median_income) },
    housing.plot { histogram(median_house_value) },
)
plotGrid(plots, nCol = 3)

In [66]:
import kotlin.random.Random

// Train/test split: both frames are cut from the same seeded shuffle (Random(42) gives
// the same row order each time), split at the 80% mark.
var housingTrain = housing
    .shuffle(Random(42)).take((housing.rowsCount() * 0.8).roundToInt())
    // income_cat = ceil(median_income / 1.5), clamped to the range 1.0..5.0.
    .add("income_cat") { ceil(median_income / 1.5).coerceIn(minimumValue = 1.0, maximumValue = 5.0) }
// Use drop (not take) so the test set is exactly the rows left out of housingTrain.
// Taking the first 20% of the same shuffle would make every test row a training row.
val housingTest = housing.shuffle(Random(42)).drop((housing.rowsCount() * 0.8).roundToInt())

In [67]:
// Histogram of the income_cat column of the training frame.
housingTrain.plot { histogram("income_cat") }