Скраппинг

In [1]:
import requests      # Для запросов по API
import json          # Для обработки полученных результатов
import time          # Для задержки между запросами
import datetime
import numpy as np
import pathlib

In [3]:
def top_k_industries(k):
    url_industry = 'https://api.hh.ru/industries'
    url_vacancies = 'https://api.hh.ru/vacancies'
    params = {
    }
    headers = {
        'HH-User-Agent': 'my-app/0.0.1'
    }
    industries_response = requests.get(url_industry, params=params, headers=headers)
    industries_response.raise_for_status()
    industries_response = industries_response.json()
    industries = [(element["id"], element["name"]) for element in industries_response]
    start_date = datetime.datetime(2023, 1, 1)
    top_tier_industries = []
    for industry_id, name in industries:
        params_for_industry = {
            'industry_id': industry_id,
            'date_from': start_date.strftime('%Y-%m-%d'),
            'area': 1,
            'only_with_salary': True,
            'currency': "RUR"
        }
        header_for_industry = {
            'HH-User-Agent': 'my-app/0.0.1'
        }
        response_industry = requests.get(url_vacancies, params=params_for_industry, headers=header_for_industry)
        response_industry.raise_for_status()
        if response_industry.status_code == 200:
            
            response_industry = response_industry.json()
            if len(top_tier_industries) < k:
                top_tier_industries.append((response_industry["found"], id, name))
            else:
                found = [i[0] for i in top_tier_industries]
                idx_min = np.argmin(found)
                if top_tier_industries[idx_min][0] < response_industry["found"]:
                    top_tier_industries[idx_min] = (response_industry["found"], id, name)
        time.sleep(np.random.uniform(0.5, 2.0))
    return top_tier_industries

In [4]:
def get_vacancies(city_id, number_of_vacancies, industry_id):
    url = 'https://api.hh.ru/vacancies'
    headers = {
        'HH-User-Agent': 'my-app/0.0.1'
    }
    vacancies = None
    start_date = datetime.datetime(2023, 1, 1)
    for i in range(number_of_vacancies//100):
        params = {
            'area': city_id,
            'per_page': 100,
            'page': i,
            'date_from': start_date.strftime('%Y-%m-%d'),
            'industry': industry_id,
            'only_with_salary': True, 
            'currency': "RUR"
        }
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        if vacancies is None:
            vacancies = response.json()
        else:
            vacancies['items'].extend(response.json()['items'])
        time.sleep(np.random.uniform(0.5, 2.0))
        
    for vac in vacancies['items']:  
        for key in ("area", "type", "response_url", "sort_point_distance", "published_at", "created_at", "archived", "apply_alternate_url", "brand_snippet",
        "branding", "show_logo_in_search", "insider_interview", "url", "alternate_url", "relations", "contacts", "adv_context", "adv_response_url"):
            if key in vac.keys():
                vac.pop(key)
        vac["employer"] = {"trusted":vac["employer"]["trusted"]}
        
            
    return vacancies["items"]

In [6]:
def metro_stations_in_city(city_id):
    url_stations = f'https://api.hh.ru/metro/{city_id}'
    params = {
    }
    headers = {
        'HH-User-Agent': 'my-app/0.0.1'
    }
    stations_response = requests.get(url_stations, params=params, headers=headers)
    stations_response.raise_for_status()
    stations_response = stations_response.json()
    np_stations = np.empty((0,2))
    for line in stations_response["lines"]:
        for station in line["stations"]:
            np_stations = np.append(np_stations, np.array([station["lat"], station["lng"]], ndmin=2), axis=0)
    return np_stations

In [5]:
top_industries = top_k_industries(3)

In [6]:
print(top_industries)

[(163363, '5', 'Перевозки, логистика, склад, ВЭД'), (163363, '11', 'СМИ, маркетинг, реклама, BTL, PR, дизайн, продюсирование'), (163363, '13', 'Строительство, недвижимость, эксплуатация, проектирование')]


In [7]:
import os

def clear_directory(directory):
    # Check if the directory exists
    if os.path.exists(directory):
        # List all files and subdirectories in the directory
        for file_name in os.listdir(directory):
            # Construct the full path to the file or subdirectory
            full_path = os.path.join(directory, file_name)
            # Check if it's a file
            if os.path.isfile(full_path):
                # Delete the file
                os.remove(full_path)
            # If it's a directory, recursively clear it
            elif os.path.isdir(full_path):
                clear_directory(full_path)
        # After deleting all files and subdirectories, remove the directory itself
        os.rmdir(directory)
        print(f"Directory '{directory}' cleared successfully.")
    else:
        print(f"Directory '{directory}' does not exist.")


In [8]:
directory_path = "datasets"
clear_directory(directory_path)

Directory 'datasets' cleared successfully.


In [9]:
pathlib.Path('datasets').mkdir(parents=True, exist_ok=True)

In [10]:
for ind in top_industries:
    with open(f"datasets/industry({ind[2]}).json", "w", encoding='utf-8') as outfile:
        json.dump(get_vacancies(1, 2000, ind[1]), outfile, ensure_ascii=False, indent=4)

In [8]:
np.save("src_files/stations.npy", metro_stations_in_city(1))