In [2]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import re

import csv

In [3]:
# Search options the user is prompted for in the interactive cell below.
# Each value is either a type used to convert the typed-in answer, or a dict
# whose keys are the query values and whose values are the labels shown
# to the user as a numbered menu.
SEARCH_OPTS = {
    'area': int, # 1 - Moscow, 2019 - Moscow Oblast
    'part_time': {
        'employment_part': "Неполный день",
        'from_four_to_six_hours_in_a_day': "От 4 часов в день",
        'only_saturday_and_sunday': "По выходным",
        'start_after_sixteen': "По вечерам",
        'employment_project': "Разовое задание"
    },
    'salary': int,
    'only_with_salary': bool,
    'text': str # position (search text)
}

# Query parameters shared by every search; the cells below copy this dict
# and add the search-specific options on top of it.
INIT_PARAMS = {
    'hhtmFrom': "vacancy_search_list",
    'hhtmFromLabel': "vacancy_search_line",
    'search_field': ["name", "company_name", "description"],
    # 'enable_snippets': False,
    'ored_clusters': True,
}

In [None]:
def _parseBool(raw):
    """Interpret a typed-in answer as a boolean.

    ``bool(str)`` is True for every non-empty string, so answers such as
    "False" or "0" would otherwise be read as True. Empty input and the
    usual negative spellings are treated as False; anything else is True.
    """
    return raw.strip().lower() not in ("", "0", "false", "no", "n", "нет")


# Build the query parameters interactively: start from the shared defaults
# and ask the user for every option described in SEARCH_OPTS.
params = INIT_PARAMS.copy()

for optName, optType in SEARCH_OPTS.items():
    if isinstance(optType, dict):
        # Choice option: show a numbered menu of the labels and store the
        # key that corresponds to the selected index.
        optDesc = "".join(
            f"{i}. {desc}\n"
            for i, desc in enumerate(optType.values())
        )
        choice = int(input(f"{optDesc}Input variant: "))
        params[optName] = list(optType)[choice]
    else:
        # Plain option: convert the raw answer with the declared type.
        answer = input(f"Input parametr <{optName}>: ")
        params[optName] = _parseBool(answer) if optType is bool else optType(answer)

params

In [5]:
# Fixed, non-interactive search: the shared defaults plus a hard-coded
# area, minimum salary and search text.
params = {
    **INIT_PARAMS,
    'area': 1,
    'salary': 70_000,
    'text': 'python',
    # Optional filters, switched off for this run:
    # 'part_time': ['employment_part'],
    # 'only_with_salary': True,
}
params

{'hhtmFrom': 'vacancy_search_list',
 'hhtmFromLabel': 'vacancy_search_line',
 'search_field': ['name', 'company_name', 'description'],
 'ored_clusters': True,
 'area': 1,
 'salary': 70000,
 'text': 'python'}

In [6]:
# Site root and the path of the vacancy search page.
BASE_URL = "https://hh.ru"
INIT_LINK = "/search/vacancy/"


def page_modif(p):
    """Return the query-parameter fragment that selects result page ``p``."""
    return {'page': p}


# Headers sent with every request; the User-Agent string is taken from
# fake_useragent's Chrome pool.
HEADER = {
    'Accept': "*/*",
    'User-Agent': UserAgent().chrome,
    'Connection': "keep-alive",
}

In [7]:
def parseSal(salStr):
    """Parse a salary label into ``(valScopes, currency, tax)``.

    The label is expected to look like ``"<amount part><currency>\\xa0<tax note>"``,
    where the amount part is one of ``"от X"``, ``"до Y"``, ``"X – Y"`` or
    ``"от X до Y"`` (case-insensitive, any whitespace inside the numbers).

    Args:
        salStr: Raw text of the salary element.

    Returns:
        A tuple ``(valScopes, currency, tax)``:
        * ``valScopes`` - list of ints obtained by splitting the amount part
          on "–"; a missing bound is represented by 0 (``"от X"`` gives
          ``[X, 0]``, ``"до Y"`` gives ``[0, Y]``).
        * ``currency`` - the last character before the non-breaking space.
        * ``tax`` - the text after the non-breaking space, or ``""`` when the
          label has no such suffix.

    Raises:
        ValueError: If the label is empty, contains more than one
            non-breaking space, or its amount part is not numeric.
    """
    parts = salStr.split("\xa0")
    if len(parts) > 2 or not parts[0]:
        raise ValueError(f"unexpected salary format: {salStr!r}")
    amount = parts[0]
    tax = parts[1] if len(parts) == 2 else ""
    currency = amount[-1]

    # Drop the currency sign and all whitespace (thousands separators), and
    # lower-case so that "От"/"До" are handled the same as "от"/"до".
    amount = re.sub(r"\s", "", amount[:-1]).lower()
    amount = amount.replace("до", "–")
    if amount.startswith("от"):
        # Remove the prefix itself (not a character set) and mark the upper
        # bound as open.
        amount = amount[2:] + "–"
    valScopes = [int(s) if s else 0 for s in amount.split("–")]

    return valScopes, currency, tax

In [8]:
def parseSoup(soup, data):
    """Extract vacancy cards from a parsed search page and append them to ``data``.

    For every ``div`` whose class starts with "vacancy-card" one dict is
    appended to ``data`` with the keys:

    * ``'position'`` - text of the vacancy-name span, or None;
    * ``'company'`` - tuple ``(name, absolute_link)``; either element is None
      when the corresponding markup is missing;
    * ``'salary'`` - result of ``parseSal`` for the compensation text, or None.

    Args:
        soup: Parsed page (BeautifulSoup object).
        data: List that receives the card dicts; mutated in place.
    """
    for card in soup.find_all("div", class_=re.compile("^vacancy-card")):
        cardDict = {}

        nameTag = card.find("span", class_=re.compile("^vacancy-name"))
        cardDict['position'] = nameTag.get_text() if nameTag is not None else None

        # Each lookup in the company chain may come back empty, so every step
        # is checked instead of chaining calls on a possible None.
        companyName = companyLink = None
        infoTag = card.find("div", class_=re.compile("^info-section"))
        linkTag = (
            infoTag.find("a", class_=re.compile("^bloko-link"))
            if infoTag is not None else None
        )
        if linkTag is not None:
            textTag = linkTag.find("span", class_=re.compile("^company-info-text"))
            if textTag is not None:
                companyName = textTag.get_text().replace("\xa0", " ")
            href = linkTag.get("href")
            if href:
                companyLink = BASE_URL + href
        cardDict['company'] = (companyName, companyLink)

        salTag = card.find("span", class_=re.compile("^compensation-text"))
        cardDict['salary'] = parseSal(salTag.get_text()) if salTag is not None else None

        data.append(cardDict)

In [None]:
def isNextExists(soup):
    """Tell whether the page contains a "next page" pager link.

    Looks for an ``<a>`` element whose ``data-qa`` attribute equals
    "pager-next" and returns True when one is found, False otherwise.
    """
    nextLink = soup.find("a", attrs={'data-qa': "pager-next"})
    return nextLink is not None

In [10]:
# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the cell indefinitely.
REQUEST_TIMEOUT = 10

pageNum = 0
parseData = []

# A session reuses the underlying connection across the repeated requests;
# the context manager closes it when the loop finishes or fails.
with requests.Session() as session:
    while True:
        # Build the per-page query without mutating the shared `params`,
        # so re-running this cell always starts from the same parameters.
        pageParams = {**params, **page_modif(pageNum)}
        response = session.get(
            BASE_URL + INIT_LINK,
            params=pageParams,
            headers=HEADER,
            timeout=REQUEST_TIMEOUT,
        )
        # Progress marker: "*" for a successful page, "!" for a failed one.
        print("*" if response.ok else "!", end="")

        # Keep the last received page on disk for inspection/debugging.
        with open("hh.html", "wb") as f:
            f.write(response.content)

        if not response.ok:
            # An error page carries no vacancies; stop instead of parsing it.
            print()
            break

        soup = BeautifulSoup(response.text, features="html.parser")
        parseSoup(soup, parseData)
        if not isNextExists(soup):
            print()
            break

        pageNum += 1

*

NameError: name 'isNextExists' is not defined

In [None]:
CSV_FIELDS = ("position", "company", "comp_link", "sal_from", "sal_to", "sal_curr", "sal_tax")


def _csvRow(row):
    """Flatten one parsed vacancy dict into a tuple ordered like CSV_FIELDS.

    Missing company or salary information yields None in the corresponding
    columns. The salary bounds list may hold a single value (an exact amount);
    in that case ``sal_to`` is left empty instead of raising IndexError.
    """
    company, compLink = row['company'] or (None, None)

    if row['salary']:
        valScopes, currency, tax = row['salary']
        salFrom = valScopes[0] if len(valScopes) > 0 else None
        salTo = valScopes[1] if len(valScopes) > 1 else None
    else:
        salFrom = salTo = currency = tax = None

    return (row['position'], company, compLink, salFrom, salTo, currency, tax)


# Write all collected vacancies to a semicolon-separated CSV file.
with open("hh.csv", "w", encoding="utf-8", newline="") as outFile:
    writer = csv.writer(outFile, delimiter=";")
    writer.writerow(CSV_FIELDS)
    writer.writerows(_csvRow(row) for row in parseData)