In [33]:
import time

import duckdb
import pandas as pd
import plotly.express as px
from plotly_calplot import calplot

from common.env_variables import DUCKDB_DWH_FILE
def display_df(df):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, "expand_frame_repr", False, "display.float_format", '${:,.2f}'.format):
        display(df.fillna('.'))

conn = duckdb.connect(DUCKDB_DWH_FILE, read_only=True)

In [34]:
df = conn.execute(f'''
SELECT
    company_name,
    COUNT('job_id') as job_count
FROM curated.job_description
GROUP BY 1
ORDER BY 2 DESC
LIMIT 100
''').df()
display_df(df)

Unnamed: 0,company_name,job_count
0,Deutsche Bahn AG,15603
1,Bosch Gruppe,10165
2,Bertrandt AG,6018
3,HAPEKO Hanseatisches Personalkontor GmbH,5954
4,ZEISS,5634
5,Mercedes - Benz AG,5194
6,meinestadt.de,5063
7,StepStone GmbH,4973
8,FERCHAU GmbH,4815
9,IU Internationale Hochschule,3855


In [35]:
start = time.time()
df = conn.execute(f'''
SELECT
    online_at,
    COUNT('a.job_id') as job_count
FROM curated.job_description a
INNER JOIN curated.job_online b ON (a.job_id = b.job_id)
WHERE
    a.company_name = 'Bosch Gruppe'
GROUP BY 1
ORDER BY 1 DESC
LIMIT 100
''').df()
print(time.time() - start)
display_df(df)

0.9006869792938232


Unnamed: 0,online_at,job_count
0,2022-09-22,2245
1,2022-09-21,2220
2,2022-09-20,2243
3,2022-09-19,2223
4,2022-09-18,2252
5,2022-09-17,2311
6,2022-09-16,2311
7,2022-09-15,2278
8,2022-09-14,2284
9,2022-09-13,2301


In [36]:
start = time.time()
df = conn.execute(f'''
WITH bosch_jobs AS (
    SELECT
        job_id
    FROM curated.job_description a
    WHERE
        a.company_name = 'Bosch Gruppe'
)
SELECT
     online_at
    ,COUNT('job_id') AS job_count
FROM curated.job_online
WHERE job_id in (
    SELECT job_id
    FROM bosch_jobs
)
GROUP BY 1
ORDER BY 1 DESC
''').df()
print(time.time() - start)
display_df(df)

0.7602357864379883


Unnamed: 0,online_at,job_count
0,2022-09-22,1951
1,2022-09-21,1929
2,2022-09-20,1943
3,2022-09-19,1921
4,2022-09-18,1944
5,2022-09-17,1996
6,2022-09-16,1999
7,2022-09-15,1968
8,2022-09-14,1975
9,2022-09-13,1989


In [39]:
conn.close()