In [9]:
import time

import duckdb
import pandas as pd
import plotly.express as px
from plotly_calplot import calplot

from common.env_variables import DUCKDB_DWH_FILE
def display_df(df):
    """Render ``df`` in full via the notebook's ``display``.

    While rendering, pandas is temporarily configured to show every row and
    column, to avoid wrapping wide frames, and to format floats as dollar
    amounts with two decimals and thousands separators. Missing values are
    shown as '.'; the frame passed in is not modified because ``fillna``
    returns a new object.
    """
    # Flat (option, value, option, value, ...) sequence, as option_context expects.
    render_options = (
        'display.max_rows', None,
        'display.max_columns', None,
        "expand_frame_repr", False,
        "display.float_format", '${:,.2f}'.format,
    )
    with pd.option_context(*render_options):
        printable = df.fillna('.')
        display(printable)

# Open the DuckDB warehouse file in read-only mode; the cells visible in this
# notebook only run SELECT queries against it.
conn = duckdb.connect(DUCKDB_DWH_FILE, read_only=True)

In [10]:
# Top 100 companies ranked by how many rows they have in curated.job.
# COUNT(*) counts rows explicitly; a single-quoted 'job_id' is a string
# literal in SQL (not a column reference), so it would count rows as well
# while looking like a column count.
df = conn.execute('''
SELECT
    company_name,
    COUNT(*) AS job_count
FROM curated.job
GROUP BY 1
ORDER BY 2 DESC
LIMIT 100
''').df()
display_df(df)

Unnamed: 0,company_name,job_count
0,Deutsche Bahn AG,15853
1,Bosch Gruppe,10250
2,Bertrandt AG,6079
3,HAPEKO Hanseatisches Personalkontor GmbH,6014
4,ZEISS,5639
5,Mercedes - Benz AG,5251
6,meinestadt.de,5063
7,StepStone GmbH,5007
8,FERCHAU GmbH,4839
9,IU Internationale Hochschule,3950


In [11]:
# Rows per online_at date for 'Bosch Gruppe', obtained by joining curated.job
# to curated.online_job on job_id; only the five most recent dates are kept.
# The elapsed wall time of the query is printed before the result.
#
# NOTE(review): the recorded output of this cell (e.g. 2219 on 2022-09-27) is
# higher than the IN-subquery variant in the next cell (1908 for the same
# date). That would be consistent with job_id not being unique in curated.job,
# so the join counts some online_job rows more than once -- confirm which
# number is the intended one.
start = time.perf_counter()  # monotonic clock intended for measuring intervals
df = conn.execute('''
SELECT
    online_at,
    COUNT(*) AS job_count  -- one per joined row; a quoted 'a.job_id' is just a string literal
FROM curated.job a
INNER JOIN curated.online_job b ON (a.job_id = b.job_id)
WHERE
    a.company_name = 'Bosch Gruppe'
GROUP BY 1
ORDER BY 1 DESC
LIMIT 5
''').df()
print(time.perf_counter() - start)
display_df(df)

0.8561420440673828


Unnamed: 0,online_at,job_count
0,2022-09-27,2219
1,2022-09-26,2198
2,2022-09-25,2221
3,2022-09-24,2259
4,2022-09-23,2268


In [12]:
# Rows per online_at date in curated.online_job whose job_id belongs to
# 'Bosch Gruppe'. The company's job ids are collected in a CTE and applied as
# an IN filter, so each online_job row is counted at most once. All dates are
# returned (no LIMIT), newest first, and the elapsed wall time is printed.
start = time.perf_counter()  # monotonic clock intended for measuring intervals
df = conn.execute('''
WITH bosch_jobs AS (
    SELECT
        job_id
    FROM curated.job a
    WHERE
        a.company_name = 'Bosch Gruppe'
)
SELECT
     online_at
    ,COUNT(*) AS job_count  -- row count; a quoted 'job_id' is just a string literal
FROM curated.online_job
WHERE job_id in (
    SELECT job_id
    FROM bosch_jobs
)
GROUP BY 1
ORDER BY 1 DESC
''').df()
print(time.perf_counter() - start)
display_df(df)

0.7945160865783691


Unnamed: 0,online_at,job_count
0,2022-09-27,1908
1,2022-09-26,1886
2,2022-09-25,1907
3,2022-09-24,1943
4,2022-09-23,1954
5,2022-09-22,1951
6,2022-09-21,1929
7,2022-09-20,1943
8,2022-09-19,1921
9,2022-09-18,1944


In [None]:
# Number of curated.job rows per car-manufacturer company name, top 100.
#   all_company: distinct company names found in curated.job
#   car_company: names starting (case-insensitively, via ILIKE) with
#                BMW, Audi, Volkswagen or Mercedes
#   car_job:     (company_name, job_id) rows of curated.job for those names
# The elapsed wall time of the query is printed before the result.
start = time.perf_counter()  # monotonic clock intended for measuring intervals
df = conn.execute('''
  WITH all_company AS (
SELECT DISTINCT company_name
  FROM curated.job
),
       car_company AS (
SELECT company_name
  FROM all_company
 WHERE company_name ilike 'BMW%' OR
       company_name ilike 'Audi%' OR
       company_name ilike 'Volkswagen%' OR
       company_name ilike 'Mercedes%'
),
       car_job AS (
SELECT a.company_name,
       a.job_id
  FROM curated.job a
  JOIN car_company b
    ON (a.company_name = b.company_name)
)
SELECT company_name,
       COUNT(*) AS job_count  -- row count; a quoted 'job_id' is just a string literal
  FROM car_job
 GROUP BY 1
 ORDER BY 2 DESC
LIMIT 100
''').df()
print(time.perf_counter() - start)
display_df(df)

In [None]:
# Close the DuckDB connection once the queries above have run.
conn.close()