In [32]:
%useLatestDescriptors
%use dataframe
%use kandy
%use ktor-client

@file:DependsOn("org.jetbrains.kotlinx:kotlinx-io-core:0.8.0")
@file:DependsOn("org.apache.commons:commons-compress:1.28.0")

In [29]:
// Remote location of the dataset archive.
private val remoteDataUrl = "https://github.com/ageron/handson-ml/raw/master/datasets"
private val housingUrl = "$remoteDataUrl/housing/housing.tgz"

// Local directory and file names used for the downloaded archive and the extracted CSV.
private val localDataDir = "/Users/markn/IdeaProjects/ml-project/datasets"
private val housingTgzFile = "housing.tgz"
private val housingCsvFile = "housing.csv"

// Full local paths, composed from the directory and file names above.
val housingTgzPath = Path(localDataDir, housingTgzFile)
val housingCsvPath = Path(localDataDir, housingCsvFile)

/**
 * Downloads the housing archive from `housingUrl` and stores it at `housingTgzPath`.
 *
 * The whole response body is read into memory first; the local data directory is
 * then created and the bytes are written out.
 *
 * @return `housingTgzPath`, the location the archive was written to.
 */
fun loadHousingTgz(): Path {
    val archiveBytes = http.get(housingUrl).readBytes()

    SystemFileSystem.createDirectories(Path(localDataDir))

    val archiveSink = SystemFileSystem.sink(housingTgzPath).buffered()
    archiveSink.use { out -> out.write(archiveBytes) }

    return housingTgzPath
}

/**
 * Extracts the housing CSV from the gzip-compressed tar archive at [path].
 *
 * The archive is scanned entry by entry. The first entry whose name equals
 * `housingCsvFile` is written to `housingCsvPath`, and scanning stops there.
 *
 * @param path location of the `.tgz` archive to read.
 * @return `housingCsvPath`, the location the CSV was written to.
 * @throws IllegalStateException if the archive has no entry named `housingCsvFile`;
 *   previously the path was returned even though nothing had been extracted.
 */
fun extractHousingCsv(path: Path): Path {
    var extracted = false
    SystemFileSystem.source(path).buffered().use { source ->
        GzipCompressorInputStream(source.asInputStream()).use { gzipStream ->
            TarArchiveInputStream(gzipStream).use { tarStream ->
                var entry = tarStream.nextEntry
                // Stop as soon as the CSV has been written; the rest of the archive is irrelevant.
                while (entry != null && !extracted) {
                    if (entry.name == housingCsvFile) {
                        SystemFileSystem.sink(housingCsvPath).buffered().use { sink ->
                            // The tar stream is positioned at the current entry, so this reads only its bytes.
                            sink.write(tarStream.readBytes())
                        }
                        extracted = true
                    } else {
                        entry = tarStream.nextEntry
                    }
                }
            }
        }
    }
    // Fail loudly instead of handing back a path to a missing (or stale) file.
    check(extracted) { "Entry '$housingCsvFile' not found in archive $path" }
    return housingCsvPath
}

In [30]:
// Download the archive, unpack the CSV, and load it into a DataFrame.
// The download helper defined in this notebook is loadHousingTgz(); getHousingTgz() is not defined here.
val housingTgz = loadHousingTgz()
val housingCsv = extractHousingCsv(housingTgz)
val housing = DataFrame.readCsv(File(housingCsv.toString()))

In [57]:
// Evaluate the DataFrame as the last expression of the cell so the notebook renders it.
housing

longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
-122230000,37880000,41000000,880000000,129000000,322000000,126000000,8325200,452600000000,NEAR BAY
-122220000,37860000,21000000,7099000000,1106000000,2401000000,1138000000,8301400,358500000000,NEAR BAY
-122240000,37850000,52000000,1467000000,190000000,496000000,177000000,7257400,352100000000,NEAR BAY
-122250000,37850000,52000000,1274000000,235000000,558000000,219000000,5643100,341300000000,NEAR BAY
-122250000,37850000,52000000,1627000000,280000000,565000000,259000000,3846200,342200000000,NEAR BAY
-122250000,37850000,52000000,919000000,213000000,413000000,193000000,4036800,269700000000,NEAR BAY
-122250000,37840000,52000000,2535000000,489000000,1094000000,514000000,3659100,299200000000,NEAR BAY
-122250000,37840000,52000000,3104000000,687000000,1157000000,647000000,3120000,241400000000,NEAR BAY
-122260000,37840000,42000000,2555000000,665000000,1206000000,595000000,2080400,226700000000,NEAR BAY
-122250000,37840000,52000000,3549000000,707000000,1551000000,714000000,3691200,261100000000,NEAR BAY


In [59]:
// One histogram per numeric column, arranged as a grid with three plots per row.
val plots = listOf(
    housing.plot { histogram(longitude) },
    housing.plot { histogram(latitude) },
    housing.plot { histogram(housing_median_age) },

    housing.plot { histogram(total_rooms) },
    // total_bedrooms is treated as nullable: drop the missing values directly rather than
    // routing them through a -1.0 sentinel, which would also discard a genuine -1.0.
    housing.plot { histogram(total_bedrooms.dropNulls()) },
    housing.plot { histogram(population) },

    housing.plot { histogram(households) },
    housing.plot { histogram(median_income) },
    housing.plot { histogram(median_house_value) },
)
plotGrid(plots, nCol = 3)