## Experiments with github and dataframe

Based on
- [dataframe github example](https://github.com/Kotlin/dataframe/blob/master/examples/notebooks/github/github.ipynb)

In [None]:
%useLatestDescriptors
%use dataframe, kandy

In [None]:
@file:DependsOn("org.http4k:http4k-core:4.16.3.0")
@file:DependsOn("org.http4k:http4k-client-apache:4.16.3.0")

In [None]:
import org.http4k.client.ApacheClient
import org.http4k.core.Method.GET
import org.http4k.core.Request

// HTTP handler shared by the request helpers defined in the following cells.
val client = ApacheClient()

In [None]:
/**
 * Name/token pair used to build the `Authorization` header of a request.
 *
 * @property name account name, placed before the colon of the Basic auth payload
 * @property token access token, placed after the colon of the Basic auth payload
 */
data class Credentials(
    val name: String,
    val token: String,
)

In [None]:
import java.util.Base64

/**
 * Builds the value of an HTTP Basic `Authorization` header.
 *
 * @param name user name
 * @param token password or access token
 * @return `"Basic "` followed by the Base64 encoding of the UTF-8 bytes of `name:token`
 */
fun authHeader(name: String, token: String): String {
    val rawPair = "$name:$token".toByteArray()
    val encodedPair = Base64.getEncoder().encodeToString(rawPair)
    return "Basic $encodedPair"
}

/**
 * Performs a GET on [url] and parses the JSON response body into a dataframe.
 *
 * The request always carries the query parameter `per_page=100`. When [credentials] is non-null an
 * `Authorization` header built by `authHeader` is attached; otherwise the request is sent without one.
 *
 * @param url endpoint to fetch
 * @param credentials optional name/token pair for the `Authorization` header
 * @return the dataframe produced by `DataFrame.readJsonStr` from the response body
 */
fun readDf(url: String, credentials: Credentials?): DataFrame<*> {
    val plainRequest = Request(GET, url)
    val maybeAuthorized = if (credentials == null) {
        plainRequest
    } else {
        plainRequest.header("Authorization", authHeader(credentials.name, credentials.token))
    }
    val response = client(maybeAuthorized.query("per_page", "100"))
    return DataFrame.readJsonStr(response.bodyString())
}

In [None]:
// The read-only GitHub token is taken from the environment. System.getenv returns null when the
// variable is unset, and passing that null into Credentials' non-null `token` parameter would crash
// this cell. Instead, a missing or blank token yields `null` credentials, in which case readDf sends
// its requests without an Authorization header.
val credentials: Credentials? = System.getenv("MY_TEST_RO_GH_TOKEN")
    ?.takeIf { it.isNotBlank() }
    ?.let { token -> Credentials("langara", token) }

/** Convenience overload of `readDf` that supplies the notebook-level `credentials` value. */
fun readDf(url: String): DataFrame<*> {
    return readDf(url, credentials)
}

In [None]:
import org.http4k.core.HttpHandler

/**
 * Result of fetching the first page of a paginated endpoint.
 *
 * @property body dataframe parsed from the first page
 * @property pageUrls URLs of the remaining pages (empty when there are none)
 */
data class PageInfo(
    val body: DataFrame<*>,
    val pageUrls: List<String>,
)

/**
 * Extracts the URL tagged `rel="last"` from an HTTP `Link` header.
 *
 * The header value is expected to be a comma-separated list of entries shaped like
 * `<url>; rel="name"`.
 *
 * @param links the header as a (name, value) pair; only the value is inspected
 * @return the `rel="last"` URL without its surrounding angle brackets, or `null` when the value is
 *   `null` or contains no such entry
 */
fun parseLinkHeader(links: Pair<String, String?>): String? =
    links.second
        ?.split(",")
        ?.map { entry -> entry.split(";").map { it.trim() } }
        // Inspect every parameter after the URL: entries without any parameter are skipped instead
        // of throwing on a missing second element, and `rel` need not be the first parameter.
        ?.find { parts -> parts.drop(1).any { it == "rel=\"last\"" } }
        ?.first()
        // Strip the angle brackets only when both are actually present.
        ?.removeSurrounding("<", ">")
        

// Matches the `page=<number>` query parameter of a URL and captures the number in group 1.
// The lookbehind requires `?` or `&` directly before `page`, so the tail of `per_page=<number>` is
// never mistaken for the page parameter, wherever the parameters appear in the query string.
val PAGE_REGEX = """(?<=[?&])page=(\d+)""".toRegex()
/**
 * Fetches the first page of [url] and derives the URLs of all remaining pages.
 *
 * The first page is requested with `per_page=100`. If the response has no `Link` header the
 * endpoint is treated as single-page. Otherwise the `rel="last"` URL of that header gives the
 * number of pages, and one URL per page from 2 up to that number is generated from it.
 *
 * NOTE(review): the first-page request built here carries no `Authorization` header.
 *
 * @param client handler used to execute the first-page request
 * @param url paginated endpoint
 * @return the parsed first page together with the URLs of pages 2..last
 * @throws IllegalStateException if a `Link` header is present but has no `rel="last"` entry, or if
 *   no page number can be read from the last-page URL
 */
fun readPageInfo(client: HttpHandler, url: String): PageInfo {
    val firstPage = client(Request(GET, url).query("per_page", "100"))
    val firstPageDf = DataFrame.readJsonStr(firstPage.bodyString())
    // HTTP header names are case-insensitive, so accept "link" as well as "Link".
    val links = firstPage.headers.find { (name, _) -> name.equals("Link", ignoreCase = true) }
        ?: return PageInfo(firstPageDf, emptyList())
    val lastPageUrl = parseLinkHeader(links)
        ?: error("Expected 'Link' header with non empty value 'rel=last' attribute on endpoint $url, but was $links")
    // Fail with a descriptive message rather than a bare NullPointerException/NumberFormatException.
    val lastPage = PAGE_REGEX.find(lastPageUrl)?.groupValues?.get(1)?.toIntOrNull()
        ?: error("Could not read the last page number from '$lastPageUrl' (endpoint $url)")
    // Page 1 is already downloaded; 2..lastPage is empty when there is only one page.
    val pageUrls = (2..lastPage).map { page -> lastPageUrl.replace(PAGE_REGEX, "page=$page") }
    return PageInfo(firstPageDf, pageUrls)
}

In [None]:
/**
 * Downloads every page of a paginated endpoint and returns them as one dataframe.
 *
 * The first page and the list of remaining page URLs come from `readPageInfo`; each remaining URL
 * is read with `readDf`, and the results are concatenated after the first page.
 *
 * @param url paginated endpoint
 * @return the first page concatenated with all following pages
 */
fun readPaginatedEndpoint(url: String): DataFrame<*> {
    val (firstPage, remainingUrls) = readPageInfo(client, url)
    val remainingPages = remainingUrls.map { pageUrl -> readDf(pageUrl) }
    return firstPage.concat(remainingPages.concat())
}

Section 2. Build the data sample

In [None]:
// Fetch the JetBrains organization record from the GitHub API and parse it into a dataframe.
val jb = readDf("https://api.github.com/orgs/JetBrains")
// Bare expression so the notebook shows the dataframe as the cell result.
jb

In [None]:
// Inspect the column structure of the organization dataframe.
jb.schema()

Most of the columns in this dataframe contain just URLs. The next cell follows some of them to download the data they point to.

In [None]:
// Follow two of the organization's URL columns and store the downloaded data as new columns.
val jb1 = jb
    // Repositories: one readDf call on the row's `repos_url`.
    .add("repos") { readDf(repos_url) }
    // Public members: read through readPaginatedEndpoint, which downloads all pages.
    .add("members") { readPaginatedEndpoint("${url}/public_members") }
jb1

Now we have the repositories, and each repository has a `contributors_url`. To download the contributors, we need to add a new column to the nested `repos` dataframe.

In [None]:
// Replace each nested `repos` dataframe with a copy that has an extra `contributors` column,
// filled by reading every repository's `contributors_url`.
val jb2 = jb1
    .convert { repos }.with { 
        it.add("contributors") { readDf(contributors_url) }
    }

Unfortunately, it gets worse when you need to add a new child column to a deeply nested column. For example, downloading the list of followers for each contributor of each repository would require writing this code:
```
jb2.convert { repos }.with {
    it.convert { 
        contributors 
    }.with { 
        it.add("followers") { readDf(followers_url) } 
    }
}
```
You can still do it, if you need to.

In [None]:
// Inspect the column structure after the nested `contributors` column was added.
jb2.schema()

The resulting dataframe has a lot of columns. We can use `select` and `remove` to filter them. Both operations provide a DSL for selecting an arbitrary set of columns: https://kotlin.github.io/dataframe/columnselectors.html

In [None]:
// Build a trimmed-down sample: keep four top-level columns and drop URL-style columns from the
// nested dataframes.
val minifiedSample = jb2
    .select { cols(url, name, repos, members) }
    // Like before, we use convert to create dataframe with changed values in nested column
    // members: drop every column whose name ends with "_url".
    .convert { members }.with { it.remove { endsWith("_url") } }
    // repos: drop the "_url" columns but keep `html_url`.
    .convert { repos }.with { it.remove { endsWith("_url").except(html_url) } }
    // repos: drop the `owner` column.
    .convert { repos }.with { it.remove { owner } }
    // repos -> contributors: drop the "_url" columns one nesting level deeper.
    .convert { repos }.with { 
        it.convert { contributors }.with { it.remove { endsWith("_url") } } 
    }

In [None]:
// Inspect the column structure of the trimmed sample.
minifiedSample.schema()

In [None]:
// Save the trimmed sample as JSON to the relative path "jetbrains.json".
minifiedSample.writeJson("jetbrains.json")

## Langara: Let's try with Google

In [None]:
// Same first step as for JetBrains, this time for the Google organization record.
val ge = readDf("https://api.github.com/orgs/Google")
// Bare expression so the notebook shows the dataframe as the cell result.
ge


In [None]:
// Follow two of the organization's URL columns and store the downloaded data as new columns.
val ge1 = ge
    // Repositories: one readDf call on the row's `repos_url`.
    .add("repos") { readDf(repos_url) }
    // Public members: read through readPaginatedEndpoint, which downloads all pages.
    .add("members") { readPaginatedEndpoint("${url}/public_members") }
ge1


In [None]:
// Show the first value of the nested `repos` column.
ge1.repos[0]

## TODO later: continue playing with github api; create some visualizations in kandy