План

1. Менюшка:
    1. Ввод названия профессии
    2. город или города
    3. количество последних дней для поиска (0 — за последний год)
    4. количество страниц для парса
    5. вывод статистики по вакансии
    6. вывод всех спаршенных вакансий в виде таблички
    7. вывод определённой вакансии по id с описанием

2. функции статистики:
    1. средняя зп
    2. количество вакансий
    3. зарплата по городам
    4. количественный график по зп
    5. 
    
3. 
    

In [1]:
import requests
import time
from datetime import datetime, timedelta
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [ ]:
def get_exchange_courses():
    """Fetch the latest exchange-rate table with the ruble as base currency.

    Returns:
        dict: The ``rates`` object of the exchangerate-api.com response,
        keyed by currency code. The values are presumably "units of that
        currency per one ruble" (the request asks for base ``RUB``) --
        confirm against the API documentation.

    Raises:
        requests.RequestException: On connection failure, timeout or a
            non-2xx HTTP status.
        KeyError: If the response body has no ``rates`` field.
    """
    url = 'https://api.exchangerate-api.com/v4/latest/RUB'

    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would freeze the whole scraping run.
    response = requests.get(url, timeout=10)
    # Report HTTP errors as such instead of a confusing KeyError on 'rates'.
    response.raise_for_status()

    return response.json()['rates']


def calculate_average_salary(salary):
    """Collapse a vacancy salary range into one number expressed in rubles.

    Args:
        salary: ``None`` or a dict with the keys ``'from'``, ``'to'``,
            ``'gross'`` and ``'currency'`` (the bounds may be ``None``).

    Returns:
        The midpoint of the range (or the single known bound), reduced by
        13% when the salary is marked as gross and converted to rubles when
        the currency is not ``'RUR'``. A negative sentinel is returned when
        no amount can be produced:

        * ``-1`` -- ``salary`` is ``None``;
        * ``-2`` -- the currency is absent from the exchange-rate table
          (or its rate is zero);
        * ``-3`` -- both bounds are ``None``.
    """
    if salary is None:
        return -1

    lower, upper = salary['from'], salary['to']
    if lower is not None and upper is not None:
        avg = (lower + upper) / 2
    elif lower is not None:
        avg = lower
    elif upper is not None:
        avg = upper
    else:
        return -3

    if salary['gross']:
        # Gross -> net; 0.87 presumably reflects a 13% income tax.
        avg *= 0.87

    currency = salary['currency']
    if currency != 'RUR':
        # Codes that the rate table does not know used to raise KeyError and
        # abort the whole parse; report them through a sentinel instead.
        rate = get_exchange_courses().get(currency)
        if not rate:
            return -2
        avg /= rate

    return avg


def parse_vacancies(text, num_of_pages=20, daily=False, area=1):
    """Collect vacancies with a usable salary from the hh.ru search API.

    Args:
        text: Search query. It is passed through ``params`` so characters
            such as ``&``, ``#`` or spaces are URL-encoded correctly.
        num_of_pages: Maximum number of result pages (100 items each) to
            request per search window.
        daily: When false, a single search without date limits is made.
            When true, 365 consecutive one-day windows are searched, walking
            backwards from today.
        area: Value of the ``area`` query parameter. The default ``1`` is
            the value that used to be hard-coded (presumably Moscow in the
            hh.ru area dictionary -- confirm).

    Returns:
        list[dict]: One dict per unique vacancy whose computed salary is
        positive, with the keys ``id``, ``name``, ``salary``, ``employer``,
        ``responsibility`` and ``requirement``.
    """
    endpoint = 'https://api.hh.ru/vacancies'
    one_day = timedelta(days=1)
    window_end = datetime.now()
    num_of_days = 365 if daily else 1

    vacancies = []
    # A set keeps the duplicate check O(1); a list made the scan quadratic.
    seen_ids = set()

    for _ in range(num_of_days):
        params = {'text': text, 'per_page': 100, 'area': area}
        if daily:
            params['date_from'] = (window_end - one_day).strftime('%Y-%m-%d')
            params['date_to'] = window_end.strftime('%Y-%m-%d')

        for page in range(num_of_pages):
            params['page'] = page
            payload = requests.get(endpoint, params=params, timeout=30).json()

            # Error responses carry no 'items'; skip them as before.
            if not isinstance(payload, dict) or 'items' not in payload:
                continue

            for raw_vac in payload['items']:
                vac_id = raw_vac['id']
                if vac_id in seen_ids:
                    continue

                # Non-positive values are sentinels (or a zero salary) and
                # are dropped. The former error tally was never read and
                # crashed with TypeError when the salary was 0.0.
                average_salary = calculate_average_salary(raw_vac['salary'])
                if average_salary <= 0:
                    continue

                seen_ids.add(vac_id)
                snippet = raw_vac['snippet']
                has_snippet = snippet is not None
                vacancies.append({
                    'id': vac_id,
                    'name': raw_vac['name'],
                    'salary': average_salary,
                    'employer': raw_vac['employer']['name'],
                    'responsibility': snippet['responsibility'] if has_snippet else None,
                    'requirement': snippet['requirement'] if has_snippet else None,
                })

            # Stop once the last page reported by the API has been read
            # instead of issuing requests that cannot return anything.
            total_pages = payload.get('pages')
            if isinstance(total_pages, int) and page + 1 >= total_pages:
                break

        window_end -= one_day

    return vacancies


def parse_greedy(text, num_of_pages=20):
    """Combine the undated search with the day-by-day search.

    Runs ``parse_vacancies`` once without date limits and once in its
    per-day mode, then merges the two result lists without duplicates.

    Args:
        text: Search query forwarded to ``parse_vacancies``.
        num_of_pages: Page limit forwarded to ``parse_vacancies``.

    Returns:
        list[dict]: Vacancies found only by the undated search, followed by
        every vacancy from the per-day search.
    """
    not_daily = parse_vacancies(text, num_of_pages, daily=False)
    # Pause between the two passes (presumably to stay under the API's
    # request limits -- confirm).
    time.sleep(30)
    daily = parse_vacancies(text, num_of_pages, daily=True)

    # Set lookup keeps the merge linear; the list version was O(n * m).
    ids_daily = {vac['id'] for vac in daily}
    all_vacs = [vac for vac in not_daily if vac['id'] not in ids_daily]
    all_vacs.extend(daily)

    return all_vacs


def draw_boxplot(files, data_folder):
    """Show a salary box plot for every JSON dump listed in *files*.

    Args:
        files: File names (not paths) to consider; names that do not end
            with ``.json`` are ignored.
        data_folder: Directory the file names are relative to.

    Each JSON file is expected to hold a list of objects with a ``salary``
    key. One figure is shown per file, titled with the file name minus its
    ``.json`` suffix.
    """
    for file_name in files:
        if not file_name.endswith('.json'):
            continue

        dump_path = os.path.join(data_folder, file_name)
        with open(dump_path, 'r', encoding='utf-8') as dump:
            records = json.load(dump)

        salaries = []
        for record in records:
            salaries.append(record['salary'])

        plt.figure(figsize=(10, 6))
        sns.boxplot(x=salaries)
        plt.title(f'Распределение зарплат для специальности {file_name[:-5]}')
        plt.xlabel('Зарплата (рубли)')
        plt.show()
            
            
def draw_histogramm(files, data_folder):
    """Show a salary histogram (with KDE curve) for every JSON dump in *files*.

    Args:
        files: File names (not paths) to consider; names that do not end
            with ``.json`` are ignored.
        data_folder: Directory the file names are relative to.

    Each JSON file is expected to hold a list of objects with a ``salary``
    key. One figure is shown per file, titled with the file name minus its
    ``.json`` suffix.
    """
    for file_name in files:
        if not file_name.endswith('.json'):
            continue

        dump_path = os.path.join(data_folder, file_name)
        with open(dump_path, 'r', encoding='utf-8') as dump:
            records = json.load(dump)

        salaries = []
        for record in records:
            salaries.append(record['salary'])

        plt.figure(figsize=(10, 6))
        sns.histplot(salaries, kde=True)
        plt.title(f'Распределение зарплат для специальности {file_name[:-5]}')
        plt.xlabel('Зарплата (рубли)')
        plt.ylabel('Частота')
        plt.show()
            
            
def main():
    """Run the interactive console menu until the user quits.

    Modes:
        0 -- exit;
        1 -- query hh.ru repeatedly and save each result set to
             ``data/<query>.json`` (``*`` returns to the menu);
        2 -- plot the saved data as histograms or box plots;
        3 -- delete every ``.json`` file in ``data`` after two confirmations.

    Input that cannot be parsed as a number returns to the mode menu
    instead of aborting with ``ValueError``.
    """
    data_folder = 'data'
    # All three modes read or write this folder; create it up front so a
    # first run does not fail with FileNotFoundError.
    os.makedirs(data_folder, exist_ok=True)

    while True:
        try:
            mode = int(input('Ввести 0 для выхода, 1 для сбора инфы, 2 для анализа, 3 для удаления бд.\n'))
        except ValueError:
            # Covers '*' and any other non-numeric input: ask again.
            continue

        if mode == 0:
            break
        print('--> * для возврата к выбору режима.')

        if mode == 1:
            answer = input('0 - искать простым методом (чуть меньше вакансий, сильно быстрее), 1 - искать сложным методом (чуть больше, очень долго).')
            try:
                greedy = int(answer)
            except ValueError:
                # '*' (or a typo) here means "back to the mode menu".
                continue

            text = input('Запрос: ')
            while text != '*':
                vacancies = parse_greedy(text) if greedy else parse_vacancies(text)

                # Written as UTF-8 to match how the plotting functions read.
                target = os.path.join(data_folder, '{}.json'.format(text))
                with open(target, 'w', encoding='utf-8') as file:
                    json.dump(vacancies, file)
                print('Записано', len(vacancies), 'вакансий.')
                text = input('Запрос: ')

        elif mode == 2:
            files = os.listdir(data_folder)
            plot = input('1 - гистограмма, 2 - коробочка.')
            if plot == '*':
                continue
            try:
                choice = int(plot)
            except ValueError:
                continue

            if choice == 1:
                draw_histogramm(files, data_folder)
            elif choice == 2:
                draw_boxplot(files, data_folder)
            else:
                # Preserved behaviour: any other number ends the session.
                break

        elif mode == 3:
            # Two separate confirmations guard the destructive clean-up.
            if input('Вы уверены в своих действиях? [д - да, н - нет]').lower() != 'д':
                continue
            if input('Прям вот точно? [д - да, н - нет]').lower() != 'д':
                continue

            for file_name in os.listdir(data_folder):
                file_path = os.path.join(data_folder, file_name)
                if os.path.isfile(file_path) and file_name.endswith('.json'):
                    os.remove(file_path)
                
            
def parse_custom(text, per_page, area_name):
    """Fetch the first page of an hh.ru vacancy search as flat rows.

    Args:
        text: Search query (URL-encoded via ``params``).
        per_page: Number of vacancies to request.
        area_name: Value sent as the ``area`` query parameter.
            NOTE(review): despite the name, the API presumably expects an
            area id here -- confirm.

    Returns:
        list[list]: One row per vacancy with the columns, in order:
        id, name, salary from, salary to, salary currency, salary gross,
        snippet responsibility, snippet requirement, area id, area name,
        type id, type name, employer name, schedule id, schedule name,
        experience id, experience name, employment id, employment name.
        Columns of a section that is ``None`` or missing are ``None``.
        (Previously the rows were built and then discarded.)

    Raises:
        requests.RequestException: On connection failure, timeout or a
            non-2xx HTTP status.
    """
    def fields(section, *keys):
        """Return ``section[key]`` for each key, or Nones if section is None."""
        if section is None:
            return [None] * len(keys)
        return [section[key] for key in keys]

    response = requests.get(
        'https://api.hh.ru/vacancies',
        params={'text': text, 'per_page': per_page, 'area': area_name, 'page': 0},
        timeout=30,
    )
    response.raise_for_status()

    rows = []
    for item in response.json()['items']:
        rows.append(
            [item['id'], item['name']]
            + fields(item.get('salary'), 'from', 'to', 'currency', 'gross')
            # The old check `['snippet'] is not None` was always true, so a
            # null snippet raised TypeError; test the actual field instead.
            + fields(item.get('snippet'), 'responsibility', 'requirement')
            + fields(item.get('area'), 'id', 'name')
            + fields(item.get('type'), 'id', 'name')
            + fields(item.get('employer'), 'name')
            + fields(item.get('schedule'), 'id', 'name')
            + fields(item.get('experience'), 'id', 'name')
            + fields(item.get('employment'), 'id', 'name')
        )

    return rows

In [ ]:
# Start the interactive console menu defined above.
main()