In [None]:
import requests
import time
import os
import pandas as pd
import json
import glob
import re
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

Зададим исходные данные:
- количество страниц поисковой выдачи hh.ru для парсинга;
- timeout парсера — пауза (в секундах) между запросами при скачивании страниц поисковой выдачи;
- timeout парсера — пауза (в секундах) между запросами при скачивании страниц резюме.

Пояснение: страница поисковой выдачи - страница, генерируемая hh.ru при поисковом запросе на резюме определенных специалистов. Обычно такой запрос выдает 20 ссылок на резюме на каждой странице.

In [None]:
# Number of hh.ru search-result pages to parse.
PAGES_COUNT = 250
# Passed to HHDownloader as `timeout`, which it uses as the sleep (seconds)
# after each downloaded search-result page.
HHDownloader_timeout = 5
# Passed to HHResumeDownloader as `timeout`, which it uses as the sleep
# (seconds) after each downloaded resume page.
HHResumeDownloader_timeout = 10
# Passed to HHResumeDownloader as `number` (download limit); 20 matches the
# `items_on_page=20` parameter of the search URL.
HHResumeDownloader_count = PAGES_COUNT * 20

Для скачивания страниц поисковой выдачи объявим класс HHDownloader.

In [None]:
class HHDownloader:
    """Download hh.ru search-result pages and store them as local HTML files.

    Each page is written to ``<data_path>/resume_page_<page_num>.html``.
    A page whose file already exists is not requested again, so an
    interrupted run can simply be restarted.
    """

    # Seconds to wait for the server before a single request is abandoned.
    # Without it `requests.get` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, start_url_template: str, data_path: str, timeout=10):
        """Store the download settings.

        Args:
            start_url_template: search URL containing a ``{0}`` placeholder
                that is filled with the page number.
            data_path: directory the downloaded pages are written to.
            timeout: pause, in seconds, made after every downloaded page
                (it is passed to ``time.sleep``, not to the HTTP request).
        """
        self.start_url_template = start_url_template
        self.headers = {'User-Agent':
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        self.timeout = timeout
        self.data_path = data_path

    def _page_file_name(self, page_num: int) -> str:
        """Return the local file path used for search page *page_num*."""
        return os.path.join(self.data_path, 'resume_page_{0}.html'.format(page_num))

    def check_if_exists(self, page_num: int):
        """Return True when the page has already been saved to disk."""
        return os.path.exists(self._page_file_name(page_num))

    def download_pages(self, start_page: int, end_page: int):
        """Download every page in ``start_page..end_page`` (both inclusive).

        Pages that are already on disk are skipped. Network errors raised by
        ``requests`` propagate to the caller; re-running the method resumes
        from the first page that is still missing.
        """
        for page_num in range(start_page, end_page + 1):
            if self.check_if_exists(page_num):
                print(f"File resume_page_{page_num}.html already exists. Skip")
                continue
            print(f"Start downloading {page_num} page")
            page = self.download_page(self.get_page_url(page_num))
            # Report the download only after the request has really finished.
            print(f"Downloaded {page_num} page")
            self.save_page(page, page_num)
            print(f"Saved {page_num} page to {self._page_file_name(page_num)}")
            print("*" * 20)
            time.sleep(self.timeout)

    def get_page_url(self, page_num: int):
        """Build the URL of search page *page_num* from the template."""
        return self.start_url_template.format(page_num)

    def download_page(self, url: str):
        """Fetch *url* and return the ``requests`` response object.

        NOTE(review): the HTTP status is not checked, so an error page is
        returned (and later saved) like a normal one.
        """
        return requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

    def save_page(self, page, page_num: int):
        """Write ``page.text`` to the page's file under ``data_path``.

        The target directory is created when it does not exist yet.
        """
        if self.data_path:
            os.makedirs(self.data_path, exist_ok=True)
        # NOTE(review): the file is written with the platform default
        # encoding, which is also what the reading code in this file uses.
        with open(self._page_file_name(page_num), 'w') as resume_request:
            resume_request.write(page.text)


if __name__ == "__main__":
    # Search query for "sales manager" resumes; `{0}` receives the page number.
    start_url = (
        "https://hh.ru/search/resume"
        "?text=sales+manager"
        "&st=resumeSearch"
        "&logic=normal"
        "&pos=full_text"
        "&exp_period=all_time"
        "&exp_company_size=any"
        "&exp_industry=any"
        "&area=2"
        "&area=1"
        "&relocation=living_or_relocation"
        "&salary_from="
        "&salary_to="
        "&currency_code=RUR"
        "&label=only_with_salary"
        "&education=none"
        "&age_from="
        "&age_to="
        "&gender=unknown"
        "&order_by=relevance"
        "&search_period=0"
        "&items_on_page=20"
        "&page={0}"
    )
    hh_downloader = HHDownloader(start_url, "data/", timeout=HHDownloader_timeout)
    # Both bounds are inclusive, so pages 0..PAGES_COUNT are requested.
    hh_downloader.download_pages(0, PAGES_COUNT)


Далее сформируем список id резюме для последующей генерации уникальных ссылок на каждое резюме и сохраним его в файле data/id_list/id_list.txt.

In [None]:
def write_links(page: str, pattern: str, start: int, end: int,
                id_list_path: str = 'data/id_list/id_list.txt'):
    """Collect resume ids from saved search pages and append them to a file.

    For every page number in ``start..end`` (both inclusive) the file
    ``page.format(number)`` is parsed, each ``<a class="resume-search-item__name">``
    link is matched against *pattern*, and capture group 1 is written to
    *id_list_path*, one id per line.

    Ids already present in *id_list_path* (from an earlier run or an earlier
    page) are not written again, so calling the function repeatedly does not
    produce duplicates. Links without an ``href`` or whose ``href`` does not
    match *pattern* are ignored.

    Args:
        page: path template of the saved search pages with a ``{0}`` placeholder.
        pattern: regular expression whose first group captures the resume id.
        start: first page number.
        end: last page number (inclusive).
        id_list_path: output file; its directory is created when missing.

    Raises:
        FileNotFoundError: if one of the page files does not exist.
    """
    output_dir = os.path.dirname(id_list_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Remember what is already in the file so re-runs stay idempotent.
    known_ids = set()
    if os.path.exists(id_list_path):
        with open(id_list_path, 'r') as existing:
            known_ids = {line.strip() for line in existing if line.strip()}

    with open(id_list_path, 'a') as id_list:
        for i in range(start, end + 1):
            with open(page.format(i), 'r') as resume_request:
                soup = BeautifulSoup(resume_request, features="lxml")
            for link in soup.find_all('a', {'class': "resume-search-item__name"}):
                match = re.search(pattern, link.get('href', ''))
                if match is None:
                    continue
                resume_id = match.group(1)
                if resume_id in known_ids:
                    continue
                known_ids.add(resume_id)
                id_list.write(resume_id + '\n')


# Extract resume ids from every saved search page (0..PAGES_COUNT inclusive).
write_links(
    page='data/resume_page_{0}.html',
    pattern=r'resume\/(.+)\?',
    start=0,
    end=PAGES_COUNT,
)
print('id_lists записан')

Скачивание уникальных страниц с резюме кандидатов реализуется через класс HHResumeDownloader.

In [None]:
class HHResumeDownloader:
    """Download individual resume pages listed in an id file.

    Each resume is written to ``<data_path>/resume_page_<resume_id>.html``.
    A resume whose file already exists is not requested again.
    """

    # Seconds to wait for the server before a single request is abandoned.
    # Without it `requests.get` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, page_url: str, data_path: str, number: int, timeout=10):
        """Store the download settings.

        Args:
            page_url: resume URL template with a ``{0}`` placeholder for the id.
            data_path: directory the downloaded resumes are written to.
            number: stop after this many resumes have been downloaded in one
                call of ``download_pages`` (skipped files are not counted).
            timeout: pause, in seconds, made after every downloaded resume
                (it is passed to ``time.sleep``, not to the HTTP request).
        """
        self.page_url = page_url
        self.headers = {'User-Agent':
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        self.timeout = timeout
        self.data_path = data_path
        self.number = number
        # NOTE(review): only the 'http' scheme is mapped, so requests will not
        # route https:// URLs through this proxy - confirm this is intended.
        self.proxy = {'http': 'http://161.202.226.195:8123'}

    def _page_file_name(self, resume_id: str) -> str:
        """Return the local file path used for resume *resume_id*."""
        return os.path.join(self.data_path, 'resume_page_{0}.html'.format(resume_id))

    def check_if_exists(self, resume_id: str):
        """Return True when the resume has already been saved to disk."""
        return os.path.exists(self._page_file_name(resume_id))

    def get_page_url(self, resume_id: str):
        """Build the resume URL by inserting *resume_id* into the template."""
        return self.page_url.format(resume_id)

    def download_pages(self, id_list_path: str = 'data/id_list/id_list.txt'):
        """Download the resumes whose ids are listed in *id_list_path*.

        The file is read line by line; blank lines and ids that are already
        on disk are skipped. The loop stops once the number of resumes
        downloaded in this call equals ``self.number``. Network errors raised
        by ``requests`` propagate to the caller; re-running the method resumes
        with the ids that are still missing.
        """
        with open(id_list_path, 'r') as id_list:
            rep = 0
            for resume_id in id_list:
                resume_id = resume_id.strip()
                if not resume_id:
                    # A blank line would otherwise request ".../resume/" and
                    # save it as "resume_page_.html".
                    continue
                if self.check_if_exists(resume_id):
                    print(f"File resume_page_{resume_id}.html already exists. Skip")
                    continue
                print(f"Start downloading {resume_id} page")
                page = self.download_page(self.get_page_url(resume_id))
                # Report the download only after the request has really finished.
                print(f"Downloaded {resume_id} page")
                self.save_page(page, resume_id)
                print(f"Saved {resume_id} page")
                print("*" * 20)
                time.sleep(self.timeout)
                rep += 1
                if self.number == rep:
                    break

    def download_page(self, url: str):
        """Fetch *url* and return the ``requests`` response object.

        NOTE(review): the HTTP status is not checked, so an error page is
        returned (and later saved) like a normal one.
        """
        return requests.get(url, headers=self.headers, proxies=self.proxy,
                            timeout=self.REQUEST_TIMEOUT)

    def save_page(self, page, resume_id: str):
        """Write ``page.text`` to the resume's file under ``data_path``.

        The target directory is created when it does not exist yet.
        """
        if self.data_path:
            os.makedirs(self.data_path, exist_ok=True)
        # NOTE(review): the file is written with the platform default
        # encoding, which is also what the reading code in this file uses.
        with open(self._page_file_name(resume_id), 'w') as resume_request:
            resume_request.write(page.text)


if __name__ == "__main__":
    # Fetch at most HHResumeDownloader_count resumes from the saved id list.
    hh_resume_downloader = HHResumeDownloader(
        page_url="https://hh.ru/resume/{0}",
        data_path="data/saved_resumes/",
        number=HHResumeDownloader_count,
        timeout=HHResumeDownloader_timeout,
    )
    hh_resume_downloader.download_pages()


Парсим данные из скачанных html-страниц с помощью BeautifulSoup и классов Resume и ResumeGetter.

In [None]:
class Resume:
    """Fields extracted from one parsed resume page.

    All fields are read from ``soup`` in ``__init__`` and are also available
    as a plain dictionary in ``dict_resume`` / ``resume_dict_maker()``.
    A field that cannot be found is reported as ``"No information"`` (text
    fields) or ``0`` (numeric fields).
    """

    # Placeholder returned when the page does not contain the requested element.
    NO_INFORMATION = "No information"

    def __init__(self, soup, resume_id):
        """Extract every field from *soup*.

        Args:
            soup: parsed page; only its ``find_all(tag, attrs)`` and
                ``find(tag, attrs)`` methods are used.
            resume_id: identifier stored under the ``'id'`` key.
        """
        self.soup = soup
        self.resume_id = resume_id
        self.resume_title = self.extract_title()
        self.city = self.extract_information('span', {'data-qa': "resume-personal-address"})
        self.age = self.extract_age()
        self.gender = self.extract_gender  # property, hence no call
        self.area = self.extract_area()
        self.desired_wage = self.extract_wage()
        self.experience_description = self.extract_information('div', {'data-qa': "resume-block-experience-description"})
        self.work_exp = self.extract_work_experience()
        self.education = self.extract_nested_information('div', {'data-qa': "resume-block-education"})
        self.language_prof = self.extract_languages()
        self.skills = self.extract_information('span', {'class': "bloko-tag__section bloko-tag__section_text"})
        self.dict_resume = self.resume_dict_maker()

    def __repr__(self):
        return f'{self.dict_resume}'

    def _first_text(self, tag: str, attributes: dict) -> str:
        """Return the text of the first matching element, or the placeholder.

        ``extract_information`` returns a list when several elements match;
        single-valued fields only need the first one.
        """
        info = self.extract_information(tag, attributes)
        if isinstance(info, list):
            return info[0]
        return info

    def extract_title(self):
        """Return the part of the position title that precedes the first comma."""
        title = self._first_text('span', {'data-qa': "resume-block-title-position"})
        return title.split(',')[0]

    def extract_age(self):
        """Return the first number of the age element as ``int``; 0 if absent."""
        raw_age = self._first_text('span', {'data-qa': "resume-personal-age"})
        match = re.search(r'\d+', raw_age)
        return int(match.group()) if match else 0

    def extract_work_experience(self):
        """Return the total work experience as a number of years.

        The numbers found in the experience sub-title are interpreted as
        ``<years> <months>`` when there are two of them.
        NOTE(review): the units are an assumption based on how hh.ru words
        this line ("N years M months") - confirm against real pages.

        Returns:
            ``years + months / 12`` rounded to 2 decimals; a single number is
            treated as months when the text mentions months and as years
            otherwise; ``0`` when nothing usable is found.
        """
        summary = self.extract_nested_information('div', {'data-qa': "resume-block-experience"})
        if summary == self.NO_INFORMATION:
            return 0
        numbers = [int(number) for number in re.findall(r'\d+', summary)]
        if len(numbers) == 2:
            years, months = numbers
            # Months must become a fraction of a year: writing them after a
            # decimal point would map both 1 and 10 months to ".1".
            return round(years + months / 12, 2)
        if len(numbers) == 1:
            if re.search(r'month|мес', summary, re.IGNORECASE):
                return round(numbers[0] / 12, 2)
            return float(numbers[0])
        return 0

    def extract_area(self):
        """Return the part of the specialization category before the first comma."""
        area = self._first_text('span', {'data-qa': "resume-block-specialization-category"})
        return area.split(',')[0]

    def extract_wage(self):
        """Return all digits of the salary element joined into one ``int``; 0 if none."""
        wage = self._first_text('span', {'data-qa': "resume-block-salary"})
        digits = ''.join(re.findall(r'\d', wage))
        return int(digits) if digits else 0

    def extract_languages(self):
        """Return ``{language: level}``; ``{"Russian": "Native"}`` when none are listed."""
        languages = self.extract_information('p', {'data-qa': "resume-block-language-item"}, to_list=False)
        if not isinstance(languages, dict) or not languages:
            return {"Russian": "Native"}
        return languages

    @property
    def extract_gender(self):
        """Gender text with 'The man' mapped to 'Male' and 'Woman' to 'Female'."""
        gender = self._first_text('span', {'data-qa': "resume-personal-gender"})
        return {'The man': 'Male', 'Woman': 'Female'}.get(gender, gender)

    def extract_information(self, tag: str, attributes: dict, to_list=True):
        """Return the text of every element matching *tag* / *attributes*.

        Args:
            tag: element name passed to ``soup.find_all``.
            attributes: attribute filter passed to ``soup.find_all``.
            to_list: when true (default) a single match is returned as a
                string and several matches as a list of strings. When false,
                a dict is returned whose key is the first whitespace-separated
                token of each text and whose value is the third token
                (``"No information"`` if there is no third token).

        Returns:
            ``"No information"`` when nothing matches, otherwise as described
            for *to_list*.
        """
        matches = self.soup.find_all(tag, attributes)
        if not matches:
            return self.NO_INFORMATION
        texts = [match.get_text() for match in matches]
        if not to_list:
            pairs = {}
            for text in texts:
                tokens = text.split()
                if not tokens:
                    continue
                pairs[tokens[0]] = tokens[2] if len(tokens) > 2 else self.NO_INFORMATION
            return pairs
        if len(texts) == 1:
            return texts[0]
        return texts

    def extract_nested_information(self, tag: str, attributes: dict, to_list=True):
        """Return the sub-title text of the first block matching *tag* / *attributes*.

        The sub-title is the ``<span>`` with class
        ``resume-block__title-text resume-block__title-text_sub`` inside the
        block. ``"No information"`` is returned when either the block or the
        span is missing. *to_list* is accepted for signature compatibility
        and is not used.
        """
        block = self.soup.find(tag, attributes)
        if block is None:
            return self.NO_INFORMATION
        sub_title = block.find('span', {'class': "resume-block__title-text resume-block__title-text_sub"})
        if sub_title is None:
            return self.NO_INFORMATION
        return sub_title.get_text()

    def resume_dict_maker(self):
        """Return all extracted fields as a dictionary."""
        resume_dict = {'id': self.resume_id,
                       'title': self.resume_title,
                       'city': self.city,
                       'age': self.age,
                       'gender': self.gender,
                       'area': self.area,
                       'desired_wage': self.desired_wage,
                       'work_experience': self.work_exp,
                       'experience_description': self.experience_description,
                       'education_level': self.education,
                       'languages': self.language_prof,
                       'skills': self.skills
                       }
        return resume_dict


class ResumeGetter:
    """Parse downloaded resume pages and store each one as a JSON file.

    The ids are read from ``./data/id_list/id_list.txt``; the HTML pages are
    expected in ``./data/saved_resumes/`` and the JSON files are written to
    ``data/new_json_resumes/``.
    """

    def __init__(self):
        """Read the id list and derive the HTML file name of every resume."""
        self.dir_path = "./data/saved_resumes/"
        self.resume_dict_storage = []
        with open('./data/id_list/id_list.txt', 'r') as f:
            resume_ids = [line.strip() for line in f]
        # dict.fromkeys drops repeated ids while keeping first-seen order;
        # blank lines are ignored.
        self.resume_storage = ['resume_page_' + resume_id + '.html'
                               for resume_id in dict.fromkeys(resume_ids) if resume_id]

    def get_resume(self):
        """Convert every listed resume page into a dictionary and a JSON file.

        For each id, in list order:
        * if its JSON file already exists, the page is not parsed again and
          the stored dictionary is loaded into ``resume_dict_storage``;
        * if its HTML page was never downloaded, the id is skipped;
        * otherwise the page is parsed with ``Resume`` and saved as JSON.
        """
        with tqdm(total=len(self.resume_storage)) as pbar:
            for resume in self.resume_storage:
                resume_id = str(re.search(r"page_(.+)\.html", resume).group(1))
                resume_path = f'{self.dir_path}{resume}'
                if self.check_if_exists(resume_id):
                    print(f"File {resume_id}.json already exists. Skip")
                    json_path = os.path.join('data/new_json_resumes/', f'{resume_id}.json')
                    with open(json_path, 'r') as json_file:
                        self.resume_dict_storage.append(json.load(json_file))
                elif not os.path.exists(resume_path):
                    # The id list can be longer than the set of downloaded pages.
                    print(f"File {resume} is not downloaded. Skip")
                else:
                    with open(resume_path, "r") as resume_page:
                        resume_text = BeautifulSoup(resume_page, features="lxml")
                    dict_format = Resume(resume_text, resume_id).resume_dict_maker()
                    self.resume_dict_storage.append(dict_format)
                    self.get_json_resume(dict_format, resume_id)
                pbar.update()
        return

    def check_if_exists(self, res_id: str):
        """Return True when the JSON file of resume *res_id* already exists."""
        return os.path.exists(os.path.join('data/new_json_resumes/', f'{res_id}.json'))

    def get_json_resume(self, resume_dict: dict, r_id: str):
        """Write *resume_dict* to ``data/new_json_resumes/<r_id>.json``.

        The output directory is created when it does not exist yet.
        """
        os.makedirs('data/new_json_resumes/', exist_ok=True)
        with open(f'data/new_json_resumes/{r_id}.json', 'w') as json_file:
            json.dump(resume_dict, json_file, indent=4)


if __name__ == '__main__':
    # Build the getter (reads the id list), parse every listed resume page,
    # then show the collected dictionaries.
    data = ResumeGetter()
    data.get_resume()
    print(data.resume_dict_storage)

Аккумулируем всю информацию в один json файл, а затем конвертируем его в csv.

In [None]:
# Collect every per-resume JSON file into one list.
result = []
# glob returns paths in arbitrary order; sorting keeps the merged file (and
# the CSV row order derived from it) reproducible between runs.
for f in sorted(glob.glob("data/new_json_resumes/*.json")):
    with open(f, "r") as infile:
        result.append(json.load(infile))

# The output directory may not exist on a fresh checkout.
os.makedirs("data/merged", exist_ok=True)
with open("data/merged/merged_file.json", "w") as outfile:
    json.dump(result, outfile)

In [None]:
# Load the merged JSON list into a DataFrame and export it as CSV
# (to_csv is called with its defaults, so the index column is written too).
df = pd.read_json('data/merged/merged_file.json')
df.to_csv('data/merged/merged_file.csv')
print('Merged file saved')

На выходе имеем csv файл, содержащий в себе информацию из всех скачанных резюме.