In [1]:
#!pip install git+https://github.com/navikt/inbound@main

## Load data from local file to duckdb

In [1]:
from inbound.core.jobs import run_job

job = {"jobs": [
        {
            "name": "CSV to DuckDB",
            "source": {"type": "file", "spec": {"path": "source.csv", "format": "meta+json"}},
            "target": {
                "type": "duckdb",
                "spec": {
                    "table": "test",
                    "database": "tempdb",
                },
            },
        }
    ]}

res = run_job(job)


[2022-11-07 22:51:20,243] INFO - inbound 0.0.3 | Starting job: CSV to DuckDB (mean-human-night). Source: file. Target: duckdb
[2022-11-07 22:51:20,255] INFO - inbound 0.0.3 | Batch number 1 of length 10 returned after 3722922 nanoseconds
[2022-11-07 22:51:20,265] INFO - inbound 0.0.3 | Job  CSV to DuckDB (mean-human-night) completed in 22460066 nanoseconds. Result: Finished in 22.000 seconds. Result: {"result": "DONE", "rows": "10", "size": "368", "duration": "22.000", "batchcount": "3"}


## Load data from url to duckdb

In [2]:
job = {"jobs": [
        {
            "name": "CSV to DuckDB",
            "source": {"type": "file", "spec": {"url": "http://data.ssb.no/api/v0/dataset/1054.csv?lang=en"}},
            "target": {
                "type": "duckdb",
                "spec": {
                    "table": "ssb",
                    "database": "tempdb",
                },
            },
        }
    ]}

res = run_job(job)


[2022-11-07 22:51:24,825] INFO - inbound 0.0.3 | Starting job: CSV to DuckDB (mean-human-night). Source: file. Target: duckdb
[2022-11-07 22:51:25,329] INFO - inbound 0.0.3 | Batch number 1 of length 10000 returned after 493789456 nanoseconds
[2022-11-07 22:51:25,355] INFO - inbound 0.0.3 | Batch number 2 of length 4328 returned after 519692770 nanoseconds
[2022-11-07 22:51:25,376] INFO - inbound 0.0.3 | Job  CSV to DuckDB (mean-human-night) completed in 550398016 nanoseconds. Result: Finished in 550.000 seconds. Result: {"result": "DONE", "rows": "14328", "size": "573380", "duration": "550.000", "batchcount": "6"}


In [3]:
import duckdb

con = duckdb.connect("tempdb")
df = con.execute("SELECT * FROM ssb").df()
df.head()

Unnamed: 0,sex,age,month,contents,"13332: Employment, unemployment, labour force and break and man-weeks worked for persons aged 15-74, by sex, age, month and contents"
0,2 Females,15-74 15-74 years,2010M10,"Labour force, seasonally adjusted (1 000 persons)",1254.0
1,2 Females,15-74 15-74 years,2010M10,"Total employment (1 000 persons), seasonally a...",1213.0
2,2 Females,15-74 15-74 years,2010M10,"Man-weeks worked of 37.5 hours, seasonally adj...",781.0
3,2 Females,15-74 15-74 years,2010M10,"Unemployment (LFS) (1 000 persons), seasonally...",40.0
4,2 Females,15-74 15-74 years,2010M10,"Unemployment rate (LFS), seasonally adjusted",3.2


## Load data from url with transformations

In [4]:
job = {"jobs": [
        {
            "name": "CSV to DuckDB",
            "source": {
                "type": "file", 
                "spec": {
                    "url": "http://data.ssb.no/api/v0/dataset/1054.csv?lang=en",
                    "transformer": "transformer.py" 
                    }
                },
            "target": {
                "type": "duckdb",
                "spec": {
                    "table": "ssb",
                    "database": "tempdb",
                },
            },
        }
    ]}

res = run_job(job)

[2022-11-07 22:51:37,328] INFO - inbound 0.0.3 | Starting job: CSV to DuckDB (mean-human-night). Source: file. Target: duckdb
[2022-11-07 22:51:37,669] INFO - inbound 0.0.3 | Batch number 1 of length 10000 returned after 291246069 nanoseconds
[2022-11-07 22:51:37,696] INFO - inbound 0.0.3 | Batch number 2 of length 4328 returned after 318550061 nanoseconds
[2022-11-07 22:51:37,719] INFO - inbound 0.0.3 | Job  CSV to DuckDB (mean-human-night) completed in 391732146 nanoseconds. Result: Finished in 391.000 seconds. Result: {"result": "DONE", "rows": "14328", "size": "688004", "duration": "391.000", "batchcount": "6"}


In [5]:
con = duckdb.connect("tempdb")
df = con.execute("SELECT * FROM ssb").df()
df.head()

Unnamed: 0,sex,age,month,contents,"13332: Employment, unemployment, labour force and break and man-weeks worked for persons aged 15-74, by sex, age, month and contents",test
0,2 Females,15-74 15-74 years,2010M10,"Labour force, seasonally adjusted (1 000 persons)",1254.0,I'm transformed
1,2 Females,15-74 15-74 years,2010M10,"Total employment (1 000 persons), seasonally a...",1213.0,I'm transformed
2,2 Females,15-74 15-74 years,2010M10,"Man-weeks worked of 37.5 hours, seasonally adj...",781.0,I'm transformed
3,2 Females,15-74 15-74 years,2010M10,"Unemployment (LFS) (1 000 persons), seasonally...",40.0,I'm transformed
4,2 Females,15-74 15-74 years,2010M10,"Unemployment rate (LFS), seasonally adjusted",3.2,I'm transformed


## Load from url and add job metadata

In [6]:
job = {"jobs": [
        {
            "name": "CSV to DuckDB",
            "source": {
                "type": "file", 
                "spec": {
                    "url": "http://data.ssb.no/api/v0/dataset/1054.csv?lang=en",
                    "format": "meta+json", 
                    "meta": {
                        "system": "SSB Statistikkbank",
                        "api": "http://data.ssb.no/api/v0/dataset/1054.csv?lang=en",
                        "description": "Sysselsetting og arbeidsledighet (AKU), etter kjønn og alder. Hele datasettet 2006M02 - siste måned"
                    }
                 }
            },
            "target": {
                "type": "duckdb",
                "spec": {
                    "table": "ssb",
                    "database": "tempdb",
                },
            },
        }
    ]}

res = run_job(job)

[2022-11-07 22:51:43,415] INFO - inbound 0.0.3 | Starting job: CSV to DuckDB (mean-human-night). Source: file. Target: duckdb
[2022-11-07 22:51:43,773] INFO - inbound 0.0.3 | Batch number 1 of length 10000 returned after 347777230 nanoseconds
[2022-11-07 22:51:44,032] INFO - inbound 0.0.3 | Batch number 2 of length 4328 returned after 606838711 nanoseconds
[2022-11-07 22:51:44,133] INFO - inbound 0.0.3 | Job  CSV to DuckDB (mean-human-night) completed in 718647530 nanoseconds. Result: Finished in 718.000 seconds. Result: {"result": "DONE", "rows": "14328", "size": "573376", "duration": "718.000", "batchcount": "6"}


In [7]:
con = duckdb.connect("tempdb")
df = con.execute("SELECT * FROM ssb").df()
df.head()

Unnamed: 0,system,api,description,loaded,data
0,SSB Statistikkbank,http://data.ssb.no/api/v0/dataset/1054.csv?lan...,"Sysselsetting og arbeidsledighet (AKU), etter ...",2022-11-07 22:50:38.825981,"{'sex': '2 Females', 'age': '15-74 15-74 years..."
1,SSB Statistikkbank,http://data.ssb.no/api/v0/dataset/1054.csv?lan...,"Sysselsetting og arbeidsledighet (AKU), etter ...",2022-11-07 22:50:38.825981,"{'sex': '2 Females', 'age': '15-74 15-74 years..."
2,SSB Statistikkbank,http://data.ssb.no/api/v0/dataset/1054.csv?lan...,"Sysselsetting og arbeidsledighet (AKU), etter ...",2022-11-07 22:50:38.825981,"{'sex': '2 Females', 'age': '15-74 15-74 years..."
3,SSB Statistikkbank,http://data.ssb.no/api/v0/dataset/1054.csv?lan...,"Sysselsetting og arbeidsledighet (AKU), etter ...",2022-11-07 22:50:38.825981,"{'sex': '2 Females', 'age': '15-74 15-74 years..."
4,SSB Statistikkbank,http://data.ssb.no/api/v0/dataset/1054.csv?lan...,"Sysselsetting og arbeidsledighet (AKU), etter ...",2022-11-07 22:50:38.825981,"{'sex': '2 Females', 'age': '15-74 15-74 years..."
