# HH.ru parsing with Api

In [16]:
import requests
import pandas as pd
import xlsxwriter
import re
import openpyxl

In [7]:
URL = 'https://api.hh.ru/'
FILE = 'info.csv'
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
                         ' AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
CMD = 'vacancies'
REQUEST = 'исследователь математика'

In [8]:
def authorization(url):
    r = requests.get(url, headers=HEADERS)
    return r.status_code

In [9]:
def get_vacancy(cmd_url, parameters=None):
    cmd = URL + cmd_url
    r = requests.get(cmd, headers=HEADERS, params=parameters)
    return r.json()

In [10]:
def vacancy_info(vacancy_number):
    url = f'{URL}vacancies/{vacancy_number}'
    r = requests.get(url, headers=HEADERS)
    info = r.json()
    try:
        experience = info['experience']['name']
        skills = info['key_skills']
        key_skills = []
        for skill in skills:
            key_skills.append(skill['name'])
    except:
        pass
    return experience, key_skills

In [11]:
def collect_vacancies(json_text):
    list_of_vacancies = list()
    list_of_dicts = json_text['items']
    for dict in list_of_dicts:
        id = dict['id']
        link = dict['alternate_url']
        employee = dict['name']
        employer = dict['employer']['name']
        city = dict['area']['name']
        experience, key_skills = vacancy_info(id)
        requirements = dict['snippet']['requirement']
        responsibilities = dict['snippet']['responsibility']
        if responsibilities is not None:
            responsibilities = re.sub('<.*?highlighttext>', '', responsibilities)
        if requirements is not None:
            requirements = re.sub('<.*?highlighttext>', '', requirements)
        if len(key_skills) == 0:
            key_skills = None
        else:
            key_skills = ', '.join(key_skills)
        vacancy = {'link': link, 'employee': employee,
                   'employer': employer, 'city': city,
                   'experience': experience, 'requirements': requirements,
                   'responsibilities': responsibilities, 'key_skills': key_skills}
        list_of_vacancies.append(vacancy)
    return list_of_vacancies

In [12]:
def process_vacancies(key_words):
    full_list = []
    for i in range(19):
        print(f'Processing page {i}')
        params = {'text': key_words,
                  'area': 113, 'page': i, 'per_page': 100}
        text = get_vacancy(CMD, params)
        try:
            full_list += collect_vacancies(text)
        except:
            print('====Nothing found====')
        i += 1
    print(f'Number of vacancies: {len(full_list)}')
    df = pd.DataFrame(full_list)
    df.columns = ['Ссылка', 'Кого ищут', 'Кто ищет',
                  'Город', 'Опыт', 'Требования', 'Обязанности', 'Ключевые навыки']
    return df

In [13]:
def pandas_export(data_frame, sheet_name):
    writer = pd.ExcelWriter('vacancy.xlsx', engine='xlsxwriter')
    data_frame.to_excel(writer, sheet_name=sheet_name, index=False)
    workbook = writer.book
    worksheet = writer.sheets[sheet_name]
    format1 = workbook.add_format({'num_format': '#,##0.00'})
    format2 = workbook.add_format({'num_format': '0%'})
    worksheet.set_column('A:H', 30, format1)
    worksheet.set_column('C:C', 30, format2)
    writer.save()

In [14]:
def pandas_add(data, sheet_name):
    path = 'vacancy.xlsx'
    book = openpyxl.load_workbook(path)
    writer = pd.ExcelWriter(path, engine='openpyxl')
    writer.book = book
    data.to_excel(writer, sheet_name=sheet_name, index=False)
    writer.save()
    writer.close()

In [15]:
if authorization(URL) == 200:
    print('====Connection set====')
    df = process_vacancies(REQUEST)
    pandas_export(df, REQUEST)
    again = -1
    while again != 0:
        REQUEST = input('New request: ')
        df = process_vacancies(REQUEST)
        name_of_sheet = input('Sheet name: ')
        pandas_add(df, name_of_sheet)
        again = int(input('0 to stop:'))
    print('====Finished====')
else:
    pass
    print('====Error====')

====Connection set====
Processing page 0
Processing page 1
Processing page 2
Processing page 3
Processing page 4
Processing page 5
Processing page 6
Processing page 7
Processing page 8
Processing page 9
Processing page 10
Processing page 11
Processing page 12
Processing page 13
Processing page 14
Processing page 15
Processing page 16
Processing page 17
Processing page 18
Number of vacancies: 104


KeyboardInterrupt: Interrupted by user