# Imports

In [53]:
import sys
import polars as pl
pl.Config.set_tbl_rows(15) # show up rows in cell
pl.Config.set_fmt_str_lengths(200)   # show up to 200 chars for string cells
import boto3
import time
from IPython.display import display
import datetime
import math
from pathlib import PurePosixPath
print(f"{sys.version=}")

sys.version='3.14.0 (main, Oct  7 2025, 09:34:52) [Clang 17.0.0 (clang-1700.3.19.1)]'


In [2]:
s3 = boto3.resource(
    's3',
    endpoint_url='http://localhost:9000',
    aws_access_key_id='minioadmin',
    aws_secret_access_key='minioadmin',
    aws_session_token=None,
    config=boto3.session.Config(signature_version='s3v4'),
    verify=False
)
storage_options = {
    "aws_endpoint_url": "http://localhost:9000", # Важно: aws_endpoint_url (иногда endpoint_url)
    "aws_access_key_id": "minioadmin",
    "aws_secret_access_key": "minioadmin",
    "aws_region": "us-east-1", # Для MinIO часто можно оставить us-east-1
    "aws_allow_http": "true",  # Разрешить HTTP (без SSL)
}

# Список silver S3

In [3]:
def silver_path2dt(obj):
    return datetime.datetime(**{item.split("=",1)[0]: int(item.split("=",1)[1]) for item in PurePosixPath(obj.key).parts if "=" in item})
bucket = s3.Bucket('silver')
objects = sorted(bucket.objects.all(), key=silver_path2dt)

In [7]:
objects[:2]

[s3.ObjectSummary(bucket_name='silver', key='hh/vacancies/year=2025/month=12/day=15/part-e3dbed9e-2f5a-4d82-8ef7-36e2c4c45a48.parquet')]

In [8]:
total_size_mb = 0
for obj in objects:
    key = obj.key
    size_mb = obj.size//1e+6
    total_size_mb += size_mb
    print(key, size_mb, "mb,", "last_modified", obj.last_modified)

hh/vacancies/year=2025/month=12/day=15/part-e3dbed9e-2f5a-4d82-8ef7-36e2c4c45a48.parquet 29.0 mb, last_modified 2025-12-16 10:15:25.578000+00:00


In [9]:
print('Silver layer size', total_size_mb, 'mb')

Silver layer size 29.0 mb


# Общее кол-во вакансий в S3

In [10]:
for obj in objects:
    schema = pl.scan_parquet(f"s3://silver/{obj.key}",storage_options=storage_options).collect_schema()
    print(obj, len(schema))

s3.ObjectSummary(bucket_name='silver', key='hh/vacancies/year=2025/month=12/day=15/part-e3dbed9e-2f5a-4d82-8ef7-36e2c4c45a48.parquet') 51


In [11]:
df_lazy = pl.scan_parquet(
    [f"s3://silver/{obj.key}" for obj in objects],
    storage_options=storage_options,
)

In [12]:
df_lazy.select(
    pl.col('id').count().alias('id_cnt'),
    pl.col('id').unique().count().alias('unique_id_cnt')
).collect()

id_cnt,unique_id_cnt
u32,u32
144437,144437


In [13]:
df_lazy.tail(1).collect().glimpse()

Rows: 1
Columns: 51
$ id                                    <str> '128367096'
$ name                                  <str> 'Продавец-консультант'
$ schedule                        <struct[2]> {'id': 'fullDay', 'name': 'Полный день'}
$ working_time_modes        <list[struct[2]]> []
$ working_time_intervals    <list[struct[2]]> []
$ working_days              <list[struct[2]]> []
$ working_hours             <list[struct[2]]> [{'id': 'HOURS_8', 'name': '8\xa0часов'}]
$ work_schedule_by_days     <list[struct[2]]> [{'id': 'FIVE_ON_TWO_OFF', 'name': '5/2'}]
$ fly_in_fly_out_duration   <list[struct[2]]> []
$ is_adv_vacancy                       <bool> False
$ internship                           <bool> False
$ accept_temporary                     <bool> False
$ accept_incomplete_resumes            <bool> True
$ premium                              <bool> False
$ has_test                             <bool> False
$ show_contacts                        <bool> True
$ response_letter_required     

# Сверка схем Bronze & Silver

In [14]:
check_report_dt = datetime.datetime(year=2025, month=12, day=15)

In [15]:
bronze_lazy = pl.scan_ndjson(f"s3://bronze/hh/vacancies/date={check_report_dt.strftime('%Y-%m-%d')}/*.jsonl.gz", storage_options=storage_options)
bronze_cols = set(bronze_lazy.collect_schema().names())

In [16]:
report_dt_obj = next(iter(s3.Bucket('silver').objects.filter(Prefix="hh/vacancies/"+check_report_dt.strftime("year=%Y/month=%m/day=%d")).all()))
silver_s3_path = report_dt_obj.key
silver_lazy = pl.scan_parquet(f"s3://silver/{silver_s3_path}", storage_options=storage_options)
silver_cols = set(silver_lazy.collect_schema().names())

In [17]:
silver_cols - bronze_cols

{'created_at_offset',
 'created_at_utc',
 'immediate_redirect_url',
 'published_at_offset',
 'published_at_utc',
 'video_vacancy'}

In [18]:
"brand_snippet" in bronze_cols

True

In [19]:
for report_dt in {"2025-12-08", "2025-12-09", "2025-12-10", "2025-12-11", "2025-12-12", "2025-12-13", "2025-12-14", "2025-12-15"}:
    print(report_dt)
    for bronze_obj in s3.Bucket('bronze').objects.filter(Prefix=f"hh/vacancies/date={report_dt}"):
        _bronze_lazy = pl.scan_ndjson(f"s3://bronze/{bronze_obj.key}/*.jsonl.gz", storage_options=storage_options)
        print("\t", "brand_snippet" in set(bronze_lazy.collect_schema().names()))

2025-12-09
2025-12-10
2025-12-13
2025-12-15
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
	 True
2025-12-11
2025-12-08
2025-12-14
2025-12-12


# Lil EDA

In [20]:
def silver_path2dt(obj):
    return datetime.datetime(**{item.split("=",1)[0]: int(item.split("=",1)[1]) for item in PurePosixPath(obj.key).parts if "=" in item})
bucket = s3.Bucket('silver')
objects = sorted(bucket.objects.all(), key=silver_path2dt)
objects

[s3.ObjectSummary(bucket_name='silver', key='hh/vacancies/year=2025/month=12/day=15/part-e3dbed9e-2f5a-4d82-8ef7-36e2c4c45a48.parquet')]

In [21]:
df_lazy = (
    pl.scan_parquet([f"s3://silver/{obj.key}" for obj in objects], storage_options=storage_options)

    # Published Date transfromation from UTC to Local
    .with_columns(
        published_at_local = (
                pl.col('published_at_utc').dt.replace_time_zone(None) +
                pl.duration(minutes=pl.col('published_at_offset')
            )
        )
    )
    .with_columns(
        published_date=pl.col('published_at_local').dt.date()
    )
)

In [22]:
df_lazy.select("published_at_utc", "published_at_offset", "published_at_local").head(100).collect()

published_at_utc,published_at_offset,published_at_local
datetime[μs],i16,datetime[μs]
2025-12-15 07:55:43,180,2025-12-15 10:55:43
2025-12-15 07:55:43,180,2025-12-15 10:55:43
2025-12-15 07:55:43,180,2025-12-15 10:55:43
2025-12-15 07:55:49,180,2025-12-15 10:55:49
2025-12-15 07:56:21,180,2025-12-15 10:56:21
…,…,…
2025-12-15 08:20:39,180,2025-12-15 11:20:39
2025-12-15 08:20:41,180,2025-12-15 11:20:41
2025-12-15 08:21:40,180,2025-12-15 11:21:40
2025-12-15 08:21:59,180,2025-12-15 11:21:59


In [23]:
df_lazy.select(
    pl.col('id').count(),
    pl.col('id').unique().count().alias('unique_id')
).collect()

id,unique_id
u32,u32
144437,144437


In [26]:
dups = df_lazy.filter(pl.col("id").is_duplicated()).sort(by='id')
dups.head(20).select("id", "name", "published_at_local").collect()

id,name,published_at_local
str,str,datetime[μs]


## Describe

In [27]:
df_lazy_described = df_lazy.describe()

In [28]:
from IPython.display import display, HTML

display(HTML("""
<style>
/* classic notebook */
div.output_area pre {
  white-space: pre !important;   /* не переносить строки */
}

/* jupyterlab */
div.jp-OutputArea-output pre {
  white-space: pre !important;   /* не переносить строки */
}
</style>
"""))

with pl.Config(tbl_cols=-1, tbl_formatting='MARKDOWN', tbl_width_chars=-1):
    print(df_lazy_described)

shape: (9, 54)
| statistic  | id        | name                            | schedule | working_time_modes | working_time_intervals | working_days | working_hours | work_schedule_by_days | fly_in_fly_out_duration | is_adv_vacancy | internship | accept_temporary | accept_incomplete_resumes | premium  | has_test | show_contacts | response_letter_required | show_logo_in_search | archived | night_shifts | url                             | alternate_url                   | apply_alternate_url             | response_url                    | adv_response_url | immediate_redirect_url          | salary   | salary_range | employer | department | type     | employment | employment_form | experience | area     | address  | work_format | relations | professional_roles | contacts | adv_context | sort_point_distance | snippet  | branding | insider_interview | video_vacancy | brand_snippet | published_at_utc           | created_at_utc             | published_at_offset | created_at_offset | published_at

## Check nulls

In [29]:
df_lazy.null_count().collect().glimpse()

Rows: 1
Columns: 53
$ id                        <u32> 0
$ name                      <u32> 0
$ schedule                  <u32> 0
$ working_time_modes        <u32> 0
$ working_time_intervals    <u32> 0
$ working_days              <u32> 0
$ working_hours             <u32> 0
$ work_schedule_by_days     <u32> 0
$ fly_in_fly_out_duration   <u32> 0
$ is_adv_vacancy            <u32> 0
$ internship                <u32> 0
$ accept_temporary          <u32> 0
$ accept_incomplete_resumes <u32> 0
$ premium                   <u32> 0
$ has_test                  <u32> 0
$ show_contacts             <u32> 0
$ response_letter_required  <u32> 0
$ show_logo_in_search       <u32> 79223
$ archived                  <u32> 0
$ night_shifts              <u32> 0
$ url                       <u32> 0
$ alternate_url             <u32> 0
$ apply_alternate_url       <u32> 0
$ response_url              <u32> 120494
$ adv_response_url          <u32> 144437
$ immediate_redirect_url    <u32> 120495
$ salary                 

## Слишком много null'ов в silver, проверка заполненности данных в bronze

In [30]:
bucket = s3.Bucket('bronze')

In [35]:
report_dt = "2025-12-07"
report_dt_objects = [o for o in bucket.objects.filter(Prefix=f"hh/vacancies/date={report_dt}")]
len(report_dt_objects)

1

In [36]:
for obj in report_dt_objects:
    s3path = f"s3://bronze/{obj.key}"
    lazy_bronze = pl.scan_ndjson(s3path, storage_options=storage_options, infer_schema_length=None)
    null_cnt = lazy_bronze.null_count().collect()
    print(null_cnt)
    

shape: (1, 49)
┌─────────────┬─────────────┬────────┬────────────┬───┬─────────────┬──────────┬─────┬─────────────┐
│ night_shift ┆ working_day ┆ salary ┆ created_at ┆ … ┆ fly_in_fly_ ┆ has_test ┆ url ┆ immediate_r │
│ s           ┆ s           ┆ ---    ┆ ---        ┆   ┆ out_duratio ┆ ---      ┆ --- ┆ edirect_url │
│ ---         ┆ ---         ┆ u32    ┆ u32        ┆   ┆ n           ┆ u32      ┆ u32 ┆ ---         │
│ u32         ┆ u32         ┆        ┆            ┆   ┆ ---         ┆          ┆     ┆ u32         │
│             ┆             ┆        ┆            ┆   ┆ u32         ┆          ┆     ┆             │
╞═════════════╪═════════════╪════════╪════════════╪═══╪═════════════╪══════════╪═════╪═════════════╡
│ 0           ┆ 0           ┆ 560    ┆ 0          ┆ … ┆ 0           ┆ 0        ┆ 0   ┆ 5121        │
└─────────────┴─────────────┴────────┴────────────┴───┴─────────────┴──────────┴─────┴─────────────┘


In [38]:
obj = report_dt_objects[0]
s3path = f"s3://bronze/{obj.key}"
lazy_bronze = pl.scan_ndjson(s3path, storage_options=storage_options, infer_schema_length=None)

In [40]:
silver_lazy = pl.scan_parquet(f"s3://silver/hh/vacancies/year=2025/month=12/day=7/part-13d2fa0f-b0db-4aca-85cb-c138f5639d7b.parquet", storage_options=storage_options)
silver_schema = set(silver_lazy.collect_schema().names())
bronze_schema = set(lazy_bronze.collect_schema().names())
len(silver_schema), len(bronze_schema)

(52, 49)

In [41]:
len(silver_schema & bronze_schema)

47

In [42]:
len(silver_schema - bronze_schema)

5

In [43]:
silver_schema - bronze_schema

{'created_at_offset',
 'created_at_utc',
 'date',
 'published_at_offset',
 'published_at_utc'}

In [48]:
'date' in lazy_bronze.collect_schema().names()

False

In [49]:
'date' in silver_lazy.collect_schema().names()

True

In [69]:
lazy_bronze = pl.scan_ndjson("s3://bronze/hh/vacancies/date=2025-12-07/*.jsonl.gz", storage_options=storage_options, infer_schema_length=None)
silver_lazy = pl.scan_parquet(f"s3://silver/hh/vacancies/year=2025/month=12/day=7/part-d696cca2-5e21-4dcb-b385-01df97955700.parquet", storage_options=storage_options)

In [70]:
'date' in lazy_bronze.collect_schema().names()

False

In [71]:
'date' in silver_lazy.collect_schema().names()

False

## professional_role count

In [78]:
lens = pl.col('professional_roles').list.len()
df_lazy.select(
    lens.min().alias("min_len"),
    lens.max().alias("max_len"),
    lens.mean().alias("avg_len"),
    lens.median().alias("median_len"),
    lens.quantile(0.25).alias("p25_len"),
    lens.quantile(0.75).alias("p75_len"),
    lens.quantile(0.95).alias("p95_len"),
).collect()

min_len,max_len,avg_len,median_len,p25_len,p75_len,p95_len
u32,u32,f64,f64,f64,f64,f64
1,1,1.0,1.0,1.0,1.0,1.0


In [77]:
with pl.Config(set_tbl_rows=15):
    res = df_lazy.select("professional_roles").head(15).collect()
    display(res)

professional_roles
list[struct[2]]
"[{""77"",""Мерчандайзер""}]"
"[{""131"",""Упаковщик, комплектовщик""}]"
"[{""131"",""Упаковщик, комплектовщик""}]"
"[{""40"",""Другое""}]"
"[{""131"",""Упаковщик, комплектовщик""}]"
"[{""40"",""Другое""}]"
"[{""80"",""Начальник производства""}]"
"[{""131"",""Упаковщик, комплектовщик""}]"
"[{""97"",""Продавец-консультант, продавец-кассир""}]"
"[{""97"",""Продавец-консультант, продавец-кассир""}]"


In [79]:
q = (df_lazy.with_columns(
        pl.col('professional_roles').list.get(0).struct.field('name').alias('professional_role')
    )
)
with pl.Config(set_tbl_rows=20):
    res = q.group_by('professional_role').len().sort(by="len", descending=True).collect()
    display(res)

professional_role,len
str,u32
"""Продавец-консультант, продавец-кассир""",12183
"""Оператор производственной линии""",11941
"""Менеджер по продажам, менеджер по работе с клиентами""",9373
"""Повар, пекарь, кондитер""",8896
"""Курьер""",8738
"""Другое""",8225
"""Разнорабочий""",5395
"""Упаковщик, комплектовщик""",3938
"""Водитель""",3742
"""Официант, бармен, бариста""",3247
