In [1]:
# One-time setup: uncomment to install inbound-core into the kernel's environment.
#%pip install git+https://github.com/navikt/inbound-core@main

## Load data from a local file to DuckDB

In [2]:
from inbound.core.jobs import run_job

# Where the rows come from: the local file "source.csv", read by the "file" source.
file_source = {
    "type": "file",
    "spec": {"path": "source.csv", "format": "meta+json"},
}

# Where the rows go: table "test" in the DuckDB database "tempdb".
duckdb_target = {
    "type": "duckdb",
    "spec": {"table": "test", "database": "tempdb"},
}

# A job definition is a dict holding a "jobs" list; this one contains a single job.
job = {
    "jobs": [
        {"name": "CSV to DuckDB", "source": file_source, "target": duckdb_target},
    ]
}

# Execute the job; the returned result is kept in `res`.
res = run_job(job)


  warn_incompatible_dep(


17:49:38.032 |  [1mINFO[0m | Starting job: CSV to DuckDB (remember-general-world). Source: file. Target: duckdb
17:49:38.055 |  [1mINFO[0m | Batch number 1 of length 10 returned after 0.0084 seconds. Memory allocated: 174149/354911
17:49:38.079 |  [1mINFO[0m | Job  () finished in 0.0468 seconds. Result: {"result": "DONE", "rows": "10", "size": "368", "duration": "0.046838712999999386", "memory size": "0", "memory peak": "0", "batchcount": "1"}


## Load data from a URL to DuckDB

In [3]:
# Where the rows come from: a CSV dataset fetched over HTTP by the "file" source.
url_source = {
    "type": "file",
    "spec": {"url": "http://data.ssb.no/api/v0/dataset/1054.csv?lang=en"},
}

# Where the rows go: table "ssb" in the DuckDB database "tempdb".
duckdb_target = {
    "type": "duckdb",
    "spec": {"table": "ssb", "database": "tempdb"},
}

# Single-job definition wiring the source to the target.
job = {
    "jobs": [
        {"name": "CSV to DuckDB", "source": url_source, "target": duckdb_target},
    ]
}

# Execute the job; the returned result is kept in `res`.
res = run_job(job)


17:49:45.033 |  [1mINFO[0m | Starting job: CSV to DuckDB (remember-general-world). Source: file. Target: duckdb
17:49:46.234 |  [1mINFO[0m | Batch number 1 of length 10000 returned after 1.1903 seconds. Memory allocated: 7454865/13164727
17:49:46.317 |  [1mINFO[0m | Batch number 2 of length 10000 returned after 1.2737 seconds. Memory allocated: 8065572/13164727
17:49:46.401 |  [1mINFO[0m | Batch number 3 of length 10000 returned after 1.3571 seconds. Memory allocated: 8048965/13164727
17:49:46.481 |  [1mINFO[0m | Batch number 4 of length 10000 returned after 1.4375 seconds. Memory allocated: 8033093/13164727
17:49:46.569 |  [1mINFO[0m | Batch number 5 of length 10000 returned after 1.5251 seconds. Memory allocated: 8034766/13164727
17:49:46.653 |  [1mINFO[0m | Batch number 6 of length 9040 returned after 1.6097 seconds. Memory allocated: 7723626/13164727
17:49:46.826 |  [1mINFO[0m | Job  () finished in 1.793 seconds. Result: {"result": "DONE", "rows": "59040", "size": "

In [4]:
import duckdb

# Open the DuckDB database "tempdb" that the jobs above used as their target.
con = duckdb.connect("tempdb")
# Fetch every row of the "ssb" table (written by the URL job) as a DataFrame.
df = con.execute("SELECT * FROM ssb").df()
# Last expression of the cell: display the first rows as a sanity check.
df.head()

Unnamed: 0,sex,age,type of adjustment,month,contents,"13760: Labour force, employment, unemployment and man-weeks worked for persons aged 15-74, by sex, age, type of adjustment, month and contents"
0,0 Both sexes,15-74 15-74 years,T Trend,2006M01,Labour force (1000 persons),2449
1,0 Both sexes,15-74 15-74 years,T Trend,2006M01,Total employment (1000 persons),2344
2,0 Both sexes,15-74 15-74 years,T Trend,2006M01,Man-weeks worked of 37.5 hours (1000),1787
3,0 Both sexes,15-74 15-74 years,T Trend,2006M01,Unemployment (LFS) (1000 persons),105
4,0 Both sexes,15-74 15-74 years,T Trend,2006M01,Labour force in per cent of the population,..


## Load data from a URL with transformations

In [7]:
# Source: the same CSV dataset, with "transformer.py" named as the transformer
# in the source spec.
transformed_source = {
    "type": "file",
    "spec": {
        "url": "http://data.ssb.no/api/v0/dataset/1054.csv?lang=en",
        "transformer": "transformer.py",
    },
}

# Target: a separate table, "ssb_transformed", in the DuckDB database "tempdb".
duckdb_target = {
    "type": "duckdb",
    "spec": {"table": "ssb_transformed", "database": "tempdb"},
}

# Single-job definition wiring the transforming source to the target.
job = {
    "jobs": [
        {
            "name": "CSV to DuckDB",
            "source": transformed_source,
            "target": duckdb_target,
        },
    ]
}

# Execute the job; the returned result is kept in `res`.
res = run_job(job)

15:45:48.173 |  [1mINFO[0m | Starting job: CSV to DuckDB (buy-easy-back). Source: file. Target: duckdb
15:45:49.199 |  [1mINFO[0m | Batch number 1 of length 10000 returned after 1.0226 seconds. Memory allocated: 7407.436
15:45:49.336 |  [1mINFO[0m | Batch number 2 of length 10000 returned after 1.1598 seconds. Memory allocated: 8582.938
15:45:49.447 |  [1mINFO[0m | Batch number 3 of length 10000 returned after 1.2709 seconds. Memory allocated: 8566.565
15:45:49.555 |  [1mINFO[0m | Batch number 4 of length 10000 returned after 1.3795 seconds. Memory allocated: 8551.53
15:45:49.657 |  [1mINFO[0m | Batch number 5 of length 10000 returned after 1.4806 seconds. Memory allocated: 8546.209
15:45:49.762 |  [1mINFO[0m | Batch number 6 of length 9040 returned after 1.5862 seconds. Memory allocated: 8234.936
15:45:49.848 |  [1mINFO[0m | Job CSV to DuckDB () finished in 1.6741 seconds. Result: {"result": "DONE", "rows": "59040", "size": "3307028", "duration": "1.6741132940000234", 

In [5]:
# Open the DuckDB database "tempdb" that the transformer job above wrote to.
con = duckdb.connect("tempdb")
# The transformer job targets the table "ssb_transformed", not "ssb". Querying
# "ssb" here would return the untransformed rows from the earlier URL job when
# the notebook is run top to bottom, so read the transformed table instead.
df = con.execute("SELECT * FROM ssb_transformed").df()
# Last expression of the cell: display the first rows, including the column
# added by the transformer.
df.head()

Unnamed: 0,sex,age,month,contents,"13332: Employment, unemployment, labour force and break and man-weeks worked for persons aged 15-74, by sex, age, month and contents",test
0,2 Females,15-74 15-74 years,2010M10,"Labour force, seasonally adjusted (1 000 persons)",1254.0,I'm transformed
1,2 Females,15-74 15-74 years,2010M10,"Total employment (1 000 persons), seasonally a...",1213.0,I'm transformed
2,2 Females,15-74 15-74 years,2010M10,"Man-weeks worked of 37.5 hours, seasonally adj...",781.0,I'm transformed
3,2 Females,15-74 15-74 years,2010M10,"Unemployment (LFS) (1 000 persons), seasonally...",40.0,I'm transformed
4,2 Females,15-74 15-74 years,2010M10,"Unemployment rate (LFS), seasonally adjusted",3.2,I'm transformed


## Load data from a URL and add job metadata

In [6]:
# The dataset endpoint is used twice: as the download URL and as the "api"
# metadata value.
dataset_url = "http://data.ssb.no/api/v0/dataset/1054.csv?lang=en"

# Descriptive metadata supplied alongside the data via the "meta" key.
job_meta = {
    "system": "SSB Statistikkbank",
    "api": dataset_url,
    "description": "Sysselsetting og arbeidsledighet (AKU), etter kjønn og alder. Hele datasettet 2006M02 - siste måned",
}

# Source: the CSV dataset, loaded with the "meta+json" format and the metadata above.
meta_source = {
    "type": "file",
    "spec": {
        "url": dataset_url,
        "format": "meta+json",
        "meta": job_meta,
    },
}

# Target: table "ssb" in the DuckDB database "tempdb".
duckdb_target = {
    "type": "duckdb",
    "spec": {"table": "ssb", "database": "tempdb"},
}

# Single-job definition wiring the metadata-enriched source to the target.
job = {
    "jobs": [
        {"name": "CSV to DuckDB", "source": meta_source, "target": duckdb_target},
    ]
}

# Execute the job; the returned result is kept in `res`.
res = run_job(job)

[2022-11-07 22:51:43,415] INFO - inbound 0.0.3 | Starting job: CSV to DuckDB (mean-human-night). Source: file. Target: duckdb
[2022-11-07 22:51:43,773] INFO - inbound 0.0.3 | Batch number 1 of length 10000 returned after 347777230 nanoseconds
[2022-11-07 22:51:44,032] INFO - inbound 0.0.3 | Batch number 2 of length 4328 returned after 606838711 nanoseconds
[2022-11-07 22:51:44,133] INFO - inbound 0.0.3 | Job  CSV to DuckDB (mean-human-night) completed in 718647530 nanoseconds. Result: Finished in 718.000 seconds. Result: {"result": "DONE", "rows": "14328", "size": "573376", "duration": "718.000", "batchcount": "6"}


In [7]:
# Open the DuckDB database "tempdb" that the metadata job above wrote to.
con = duckdb.connect("tempdb")
# Fetch every row of the "ssb" table, the target of the metadata job, as a DataFrame.
# NOTE(review): the earlier plain URL job also targets "ssb"; whether its rows are
# replaced or kept alongside these depends on the loader's behaviour. Confirm.
df = con.execute("SELECT * FROM ssb").df()
# Last expression of the cell: display the first rows (metadata columns plus "data").
df.head()

Unnamed: 0,system,api,description,loaded,data
0,SSB Statistikkbank,http://data.ssb.no/api/v0/dataset/1054.csv?lan...,"Sysselsetting og arbeidsledighet (AKU), etter ...",2022-11-07 22:50:38.825981,"{'sex': '2 Females', 'age': '15-74 15-74 years..."
1,SSB Statistikkbank,http://data.ssb.no/api/v0/dataset/1054.csv?lan...,"Sysselsetting og arbeidsledighet (AKU), etter ...",2022-11-07 22:50:38.825981,"{'sex': '2 Females', 'age': '15-74 15-74 years..."
2,SSB Statistikkbank,http://data.ssb.no/api/v0/dataset/1054.csv?lan...,"Sysselsetting og arbeidsledighet (AKU), etter ...",2022-11-07 22:50:38.825981,"{'sex': '2 Females', 'age': '15-74 15-74 years..."
3,SSB Statistikkbank,http://data.ssb.no/api/v0/dataset/1054.csv?lan...,"Sysselsetting og arbeidsledighet (AKU), etter ...",2022-11-07 22:50:38.825981,"{'sex': '2 Females', 'age': '15-74 15-74 years..."
4,SSB Statistikkbank,http://data.ssb.no/api/v0/dataset/1054.csv?lan...,"Sysselsetting og arbeidsledighet (AKU), etter ...",2022-11-07 22:50:38.825981,"{'sex': '2 Females', 'age': '15-74 15-74 years..."
