In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import undetected_chromedriver as uc
from stem.control import Controller
from stem import Signal

# Chrome options shared by every webdriver.Chrome instance created in this notebook.
options = webdriver.ChromeOptions()
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
options.add_argument('--headless')
options.add_argument('--disable-javascript')
# Path handed to webdriver.Chrome as executable_path.
driver_path = '/home/natasha/DiplomaWork/Natasha/chromedriver'
# driver_path = '/srv/data/shared_data_folder/Utilities/chromedriver'
# Only referenced by the commented-out `__main__(file_path)` call further down.
file_path = 'resumes_2.csv'
# NOTE(review): hard-coded credential that no visible code reads; presumably the
# Tor control password for the imported `stem` Controller — confirm, and load it
# from the environment instead of keeping it in the notebook.
tor_pwd = 'vladik01'

In [2]:
def soup_get(driver):
    """Parse the page currently loaded in *driver* into a BeautifulSoup tree.

    The parser is named explicitly so that the resulting tree does not depend
    on which optional parsers happen to be installed on the machine, and so
    that bs4 does not emit its "no parser was explicitly specified" warning.

    Args:
        driver: object exposing the loaded document as ``page_source``.

    Returns:
        BeautifulSoup object built from ``driver.page_source``.
    """
    return BeautifulSoup(driver.page_source, 'html.parser')

def get_title(soup):
    """Return the text of the title wrapper inside the first ``resume-block`` div.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        The title text, or None when the block or the title wrapper is missing.
    """
    try:
        resume_block = soup.find('div', {'class': 'resume-block'})
        return resume_block.find('div', {'class': 'resume-block__title-text-wrapper'}).text
    except AttributeError:
        # find() yields None for a missing element; only that case is treated
        # as "no data" so Ctrl-C and real errors are no longer swallowed.
        return None

def get_money(soup):
    """Return the first integer found in the salary span.

    Thin spaces (U+2009) are removed and non-breaking spaces are turned into
    regular spaces before the first run of digits is extracted.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        The salary as int, or None when the span is missing or has no digits.
    """
    try:
        raw = soup.find('span', {'class': 'resume-block__salary resume-block__title-text_salary'}).text
        cleaned = raw.replace('\u2009', '').replace('\xa0', ' ')
        return int(re.findall(r'\d+', cleaned)[0])
    except (AttributeError, IndexError):
        # AttributeError: span not found; IndexError: no digits in the text.
        return None
    
def get_exp_period(soup):
    """Return the total work experience, in months, from the experience header.

    The header span holds one or two inner spans. With two, the first is read
    as years and the second as months. With one, the unit is decided by the
    word it contains ('лет'/'год' -> years, 'месяц' -> months).

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        Number of months as int, or None when the header is missing, contains
        no number, or uses an unrecognised unit.
    """
    try:
        exp_block = soup.find('div', {'data-qa': 'resume-block-experience'})
        raw_years = exp_block.find('span', {'class': 'resume-block__title-text resume-block__title-text_sub'})
        # Work on the span *texts*: a membership test against the element
        # itself compares with its child nodes instead of searching its text,
        # so forms such as "2 года" were never recognised.
        parts = [span.text for span in raw_years.find_all('span')]
    except AttributeError:
        return None

    try:
        if len(parts) > 1:
            years = int(re.findall(r'\d+', parts[0])[0])
            months = int(re.findall(r'\d+', parts[1])[0])
            return years * 12 + months
        if len(parts) == 1:
            value = int(re.findall(r'\d+', parts[0])[0])
            if 'лет' in parts[0] or 'год' in parts[0]:
                return value * 12
            if 'месяц' in parts[0]:  # also matches 'месяца' / 'месяцев'
                return value
    except IndexError:
        # A span without any digits.
        return None
    # No inner spans, or a unit we do not know how to convert.
    return None
    
def get_gender(soup):
    """Return the text of the gender span inside the resume header.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        The gender text, or None when the header or the span is missing.
    """
    try:
        header = soup.find('div', {'class': 'resume-header-main'})
        return header.find('span', {'data-qa': 'resume-personal-gender'}).text
    except AttributeError:
        # A missing element makes find() return None.
        return None
        
        
def get_age(soup):
    """Return the first integer found in the age span of the resume header.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        The age as int, or None when the span is missing or has no digits.
    """
    try:
        header = soup.find('div', {'class': 'resume-header-main'})
        age_text = header.find('span', {'data-qa': 'resume-personal-age'}).text
        return int(re.findall(r'\d+', age_text)[0])
    except (AttributeError, IndexError):
        # AttributeError: element not found; IndexError: no digits in the text.
        return None
    
def get_city(soup):
    """Return the text of the address span inside the resume header.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        The address text, or None when the header or the span is missing.
    """
    try:
        header = soup.find('div', {'class': 'resume-header-main'})
        return header.find('span', {'data-qa': 'resume-personal-address'}).text
    except AttributeError:
        # A missing element makes find() return None.
        return None
    
def get_key_sights(soup):
    """Return the texts of all ``bloko-tag__text`` spans joined with '|'.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find_all``).

    Returns:
        A '|'-separated string (empty when the page has no such spans), or
        None when *soup* itself is unusable.
    """
    try:
        return '|'.join(tag.text for tag in soup.find_all('span', {'data-qa': 'bloko-tag__text'}))
    except AttributeError:
        return None
    
def get_self_description(soup):
    """Return the text of the ``resume-block-skills-content`` div.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        The block text, or None when the div is missing.
    """
    try:
        return soup.find('div', {'data-qa': 'resume-block-skills-content'}).text
    except AttributeError:
        # A missing element makes find() return None.
        return None
    
def get_places_of_work(soup):
    """Count the narrow date-column divs inside the second ``resume-block``.

    NOTE(review): this relies on the second ``resume-block`` on the page being
    the work-experience section — confirm against the page layout.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find_all``).

    Returns:
        The number of matching column divs, or None when the page has fewer
        than two ``resume-block`` divs.
    """
    try:
        blocks = soup.find_all('div', {'class': 'resume-block'})
        date_columns = blocks[1].find_all(
            'div',
            {'class': 'bloko-column bloko-column_xs-4 bloko-column_s-2 bloko-column_m-2 bloko-column_l-2'},
        )
        return len(date_columns)
    except (AttributeError, IndexError):
        # IndexError: fewer than two resume blocks on the page.
        return None
    
def get_languages(soup):
    """Return the number of listed languages and their names joined with '|'.

    Each language item is expected to look like ``"<name> — <level>"``; only
    the part before `` — `` is kept.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        ``(count, names)``, or ``(None, None)`` when the languages block is
        missing.
    """
    try:
        lang_block = soup.find('div', {'data-qa': 'resume-block-languages'})
        # Query the items once and derive both outputs from the same list.
        items = lang_block.find_all('p', {'data-qa': 'resume-block-language-item'})
        langs = [item.text.split(' — ')[0] for item in items]
    except AttributeError:
        return None, None
    return len(langs), '|'.join(langs)
    
def get_last_places(soup):
    """Return the names of the first two employers listed in the experience block.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        ``(first, second)``; a missing entry is None. Always a 2-tuple, so the
        caller can unpack it unconditionally. Previously an experience block
        with no entries fell through and returned a bare None, which made the
        caller's unpacking raise.
    """
    try:
        exp_block = soup.find('div', {'data-qa': 'resume-block-experience'})
        exp_texts = exp_block.find_all(
            'div',
            {'class': 'bloko-column bloko-column_xs-4 bloko-column_s-6 bloko-column_m-7 bloko-column_l-10'},
        )
        exp_places = [t.find('div', {'class': 'bloko-text bloko-text_strong'}).text for t in exp_texts]
    except AttributeError:
        # Missing block, or an entry without an employer-name element.
        return None, None

    first = exp_places[0] if exp_places else None
    second = exp_places[1] if len(exp_places) > 1 else None
    return first, second
    
def get_last_positions(soup):
    """Return the first two position titles listed in the experience block.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        ``(first, second)``; a missing entry is None, and ``(None, None)`` is
        returned when the experience block itself is missing.
    """
    try:
        exp_block = soup.find('div', {'data-qa': 'resume-block-experience'})
        nodes = exp_block.find_all('div', {'data-qa': 'resume-block-experience-position'})
        titles = [node.text for node in nodes]
    except AttributeError:
        return None, None

    # Pad with None so that exactly two values are always available.
    first, second = (titles + [None, None])[:2]
    return first, second
    
def get_citizenship(soup):
    """Parse the "key: value" paragraphs of the additional-info block.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        A dict that always contains the keys 'Гражданство',
        'Разрешение на работу' and 'Желательное время в пути до работы'
        (None when absent), plus any other "key: value" paragraph found in
        the block.
    """
    additional_info = {'Гражданство': None, 'Разрешение на работу': None, 'Желательное время в пути до работы': None}
    try:
        additional_block = soup.find('div', {'data-qa': 'resume-block-additional'})
        additional_block = additional_block.find('div', {'class': 'resume-block-item-gap'})
        lines = [paragraph.text for paragraph in additional_block.find_all('p')]
    except AttributeError:
        return additional_info

    for line in lines:
        # Split on the first ': ' only, so a value that itself contains ': '
        # is kept whole; a paragraph without the separator is skipped instead
        # of discarding everything parsed so far.
        key, separator, value = line.partition(': ')
        if separator:
            additional_info[key] = value
    return additional_info

def get_education(soup):
    """Extract education details from the ``resume-block-education`` div.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        ``(last_edu_year, n_places, unis, edu_type)`` where ``unis`` is a
        two-item list holding the first two institution names (padded with
        None) and ``n_places`` is the total number of institution links.
        When the block, its sub-title, or an integer year cannot be read, the
        whole result is ``(None, None, [None, None], None)``.
    """
    try:
        education_block = soup.find('div', {'data-qa': 'resume-block-education'})
        edu_type = education_block.find('span', {'class': 'resume-block__title-text resume-block__title-text_sub'}).text
        links = education_block.find_all('a', {'class': 'bloko-link bloko-link_kind-tertiary'})
        names = [link.text for link in links]
        year_cell = education_block.find(
            'div',
            {'class': 'bloko-column bloko-column_xs-4 bloko-column_s-2 bloko-column_m-2 bloko-column_l-2'},
        )
        last_edu_year = int(year_cell.text)
    except (AttributeError, ValueError):
        # AttributeError: an element is missing; ValueError: the year cell
        # does not hold a plain integer.
        return None, None, [None, None], None

    n_places = len(names)
    unis = (names + [None, None])[:2]
    return last_edu_year, n_places, unis, edu_type
    
def get_specialisation(soup):
    """Return the text of the ``resume-block-specialization-category`` span.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        The span text, or None when the span is missing.
    """
    try:
        return soup.find('span', {'data-qa': 'resume-block-specialization-category'}).text
    except AttributeError:
        # A missing element makes find() return None.
        return None
    
def get_work_type(soup):
    """Parse the "key: value" paragraphs of the ``resume-block-position`` div.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        A dict that always contains 'Занятость' and 'График работы' (None
        when absent), plus any other "key: value" paragraph found. The two
        keys are pre-seeded because the caller indexes them directly; before,
        a page listing only one of them produced a dict without the other and
        the caller failed with KeyError.
    """
    info_dict = {'Занятость': None, 'График работы': None}
    try:
        paragraphs = soup.find('div', {'data-qa': 'resume-block-position'}).find_all('p')
        lines = [paragraph.text for paragraph in paragraphs]
    except AttributeError:
        return info_dict

    for line in lines:
        # Split on the first ': ' only; skip paragraphs without a separator.
        key, separator, value = line.partition(': ')
        if separator:
            info_dict[key] = value
    return info_dict
    
def get_movements(soup):
    """Read relocation and business-trip readiness from the resume header.

    For each of the two phrases, the negated form ('не готов к ...') maps to
    0, the plain form to 1, and absence of both to None. The negated form is
    tested first because it contains the plain form as a substring.

    NOTE(review): only the exact forms 'готов к ...' are matched; other
    wordings of the same statement would yield None — confirm against real
    pages.

    Args:
        soup: parsed page (anything exposing a BeautifulSoup-like ``find``).

    Returns:
        ``{'ready_to_move': 0|1|None, 'ready_4_business_trip': 0|1|None}``;
        both values are None when the header text cannot be located.
    """
    phrases = {
        'ready_to_move': 'готов к переезду',
        'ready_4_business_trip': 'готов к командировкам',
    }
    try:
        header = soup.find('div', {'class': 'resume-header-title'})
        movements = header.find('div', {'class': 'bloko-translate-guard'}).text
    except AttributeError:
        return {key: None for key in phrases}

    movements_dict = {}
    for key, phrase in phrases.items():
        if 'не ' + phrase in movements:
            movements_dict[key] = 0
        elif phrase in movements:
            movements_dict[key] = 1
        else:
            movements_dict[key] = None
    return movements_dict
        
    

In [3]:
def _resume_row(soup, url):
    """Extract every output field of one parsed resume page into a single dict."""
    # `or (None, None)` and `.get` keep the row well-formed even if an
    # extractor returns None or a dict without the expected key.
    place_1, place_2 = get_last_places(soup) or (None, None)
    pos_1, pos_2 = get_last_positions(soup)
    n_langs, langs = get_languages(soup)
    citizenship = get_citizenship(soup)
    last_edu_year, n_unis, unis, edu_type = get_education(soup)
    work_type = get_work_type(soup)
    movements = get_movements(soup)
    return {
        'Title': get_title(soup),
        'SpecCat': get_specialisation(soup),
        'ExpPeriod': get_exp_period(soup),
        'Salary': get_money(soup),
        'Age': get_age(soup),
        'Gender': get_gender(soup),
        'City': get_city(soup),
        'WorkType': work_type.get('Занятость'),
        'WorkSchedule': work_type.get('График работы'),
        'N_places': get_places_of_work(soup),
        'LastPlace': place_1,
        'LastPlace_2': place_2,
        'LastPos': pos_1,
        'LastPos_2': pos_2,
        'N_langs': n_langs,
        'Langs': langs,
        'Citizenship': citizenship.get('Гражданство'),
        'Time2Work': citizenship.get('Желательное время в пути до работы'),
        'WorkRights': citizenship.get('Разрешение на работу'),
        'Ready_2_move': movements.get('ready_to_move'),
        'Ready_4_business_trip': movements.get('ready_4_business_trip'),
        'Description': get_self_description(soup),
        'Tags': get_key_sights(soup),
        'Graduation': last_edu_year,
        'EduLevel': edu_type,
        'LastUni': unis[0],
        'LastUni_2': unis[1],
        'N_Unis': n_unis,
        'URL': url,
    }


def __main__(path_to_file, start=2000, stop=2010):
    """Scrape a slice of the resume URLs listed in a CSV file.

    Each row is assembled completely before it is stored, so a failure on one
    page drops that page only. Before, fields were appended one by one inside
    a ``try/except: continue``; a mid-row failure left the per-column lists
    with different lengths and shifted later rows against each other.

    Args:
        path_to_file: CSV file with a 'Url' column of paths relative to
            https://hh.ru. (Previously ignored in favour of a hard-coded
            'resumes_2.csv'.)
        start: index of the first URL to scrape (default keeps the old slice).
        stop: index one past the last URL to scrape.

    Returns:
        DataFrame with one row per scraped resume that has a title, columns
        in the fixed order produced by ``_resume_row``.
    """
    columns = ['Title', 'SpecCat', 'ExpPeriod', 'Salary', 'Age', 'Gender', 'City',
               'WorkType', 'WorkSchedule', 'N_places', 'LastPlace', 'LastPlace_2',
               'LastPos', 'LastPos_2', 'N_langs', 'Langs', 'Citizenship', 'Time2Work',
               'WorkRights', 'Ready_2_move', 'Ready_4_business_trip', 'Description',
               'Tags', 'Graduation', 'EduLevel', 'LastUni', 'LastUni_2', 'N_Unis', 'URL']
    df = pd.read_csv(path_to_file)
    base_link = 'https://hh.ru'
    urls = df['Url'].values

    rows = []
    # NOTE(review): `executable_path` is kept as in the original call; newer
    # Selenium releases expect a Service object instead — confirm the version.
    driver = webdriver.Chrome(options=options, executable_path=driver_path)
    try:
        for cnt, url in enumerate(tqdm(urls[start:stop]), start=1):
            driver.get(base_link + url)  # open web_page
            soup = soup_get(driver)  # get soup
            try:
                rows.append(_resume_row(soup, url))
            except Exception as exc:
                # Best-effort scrape: report and skip the page, keep going.
                tqdm.write(f'skipped {url}: {exc!r}')
            else:
                # Random 1-2 s pause between pages.
                time.sleep((np.random.sample(1) + 1)[0])

            if cnt % 3 == 0:
                # quit() (not close()) also ends the chromedriver process, so
                # restarting every third page does not pile up processes.
                driver.quit()
                driver = webdriver.Chrome(options=options, executable_path=driver_path)
    finally:
        driver.quit()

    # dtype=object keeps the values as scraped (no int -> float coercion).
    res = pd.DataFrame(rows, columns=columns, dtype=object)
    res = res.dropna(subset=['Title']).reset_index(drop=True)
    return res

In [4]:
# d = __main__(file_path)
# d

In [5]:
# d.columns

In [None]:
# Scrape resumes 113000..122999 from resumes_2.csv into `data_dict`, then build `res`.
df = pd.read_csv('resumes_2.csv')
base_link = 'https://hh.ru'
driver = webdriver.Chrome(options=options, executable_path=driver_path)
urls = df['Url'].values

# One list per output column; every list receives exactly one value per stored page.
data_dict = {name: [] for name in [
    'Title', 'SpecCat', 'ExpPeriod', 'Salary', 'Age', 'Gender', 'City',
    'WorkType', 'WorkSchedule', 'N_places', 'LastPlace', 'LastPlace_2',
    'LastPos', 'LastPos_2', 'N_langs', 'Langs', 'Citizenship', 'Time2Work',
    'WorkRights', 'Ready_2_move', 'Ready_4_business_trip', 'Description',
    'Tags', 'Graduation', 'EduLevel', 'LastUni', 'LastUni_2', 'N_Unis', 'URL',
]}
cnt = 0
try:
    for url in tqdm(urls[113000:123000]):
        cnt += 1
        driver.get(base_link + url)  # open web_page
        soup = soup_get(driver)  # get soup
        try:
            # Build the whole row first. Appending field by field inside the
            # try left the column lists with different lengths whenever one
            # extractor failed mid-row, shifting all later rows.
            place_1, place_2 = get_last_places(soup) or (None, None)
            pos_1, pos_2 = get_last_positions(soup)
            n_langs, langs = get_languages(soup)
            citizenship = get_citizenship(soup)
            last_edu_year, n_places, unis, edu_type = get_education(soup)
            work_type = get_work_type(soup)
            movements = get_movements(soup)
            row = {
                'URL': url,
                'Title': get_title(soup),
                'Salary': get_money(soup),
                'SpecCat': get_specialisation(soup),
                'ExpPeriod': get_exp_period(soup),
                'Gender': get_gender(soup),
                'Age': get_age(soup),
                'City': get_city(soup),
                'Tags': get_key_sights(soup),
                'Description': get_self_description(soup),
                'N_places': get_places_of_work(soup),
                'LastPlace': place_1,
                'LastPlace_2': place_2,
                'LastPos': pos_1,
                'LastPos_2': pos_2,
                'N_langs': n_langs,
                'Langs': langs,
                'Citizenship': citizenship.get('Гражданство'),
                'WorkRights': citizenship.get('Разрешение на работу'),
                'Time2Work': citizenship.get('Желательное время в пути до работы'),
                'Graduation': last_edu_year,
                'EduLevel': edu_type,
                'N_Unis': n_places,
                'LastUni': unis[0],
                'LastUni_2': unis[1],
                'WorkType': work_type.get('Занятость'),
                'WorkSchedule': work_type.get('График работы'),
                'Ready_2_move': movements.get('ready_to_move'),
                'Ready_4_business_trip': movements.get('ready_4_business_trip'),
            }
        except Exception as exc:
            # Best-effort scrape: report and skip the page, keep going.
            tqdm.write(f'skipped {url}: {exc!r}')
        else:
            for column, value in row.items():
                data_dict[column].append(value)
            # Random 1-2 s pause between pages.
            time.sleep((np.random.sample(1) + 1)[0])

        if cnt % 3 == 0:
            # quit() (not close()) also ends the chromedriver process.
            driver.quit()
            driver = webdriver.Chrome(options=options, executable_path=driver_path)
finally:
    # Release the browser even when the run is interrupted; `data_dict`
    # keeps whatever was collected so far.
    driver.quit()
res = pd.DataFrame.from_dict(data_dict, orient='index').T
res = res.dropna(subset=['Title']).reset_index(drop=True)

 17%|█▋        | 1714/10000 [2:13:48<12:44:11,  5.53s/it]

In [None]:
# Display the DataFrame of scraped resumes.
res

In [None]:
# Write the scraped rows to res_15.csv, omitting the index column.
res.to_csv('res_15.csv', index = False)

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load resume_128k.csv, treating only '\n' as the row terminator, and display it.
df = pd.read_csv('/Users/natalyakrauze/Desktop/диплом/parser/resume_128k.csv', lineterminator='\n')
df

Unnamed: 0,Title,ExpPeriod,Salary,Age,Gender,City,WorkType,WorkSchedule,N_places,LastPlace,...,Ready_4_business_trip,Description,Tags,Graduation,EduLevel,LastUni,LastUni_2,N_Unis,URL,Search
0,Аналитик,79.0,200000.0,29.0,Мужчина,Санкт-Петербург,полная занятость,"удаленная работа, гибкий график, полный день",6,A&D Mortgage,...,1.0,Имею экономическое образование. Продвинутый по...,Экономический анализ|Экономическое моделирован...,2015.0,Высшее образование (Бакалавр),Санкт-Петербургский государственный политехнич...,,1.0,/resume/bebf6a630008069d8c0039ed1f62654e484478...,Аналитик BI
1,Аналитик,8.0,80000.0,22.0,Мужчина,Москва,"проектная работа, стажировка, частичная занято...","удаленная работа, гибкий график, полный день, ...",1,MASTERDATA,...,1.0,"Имею гибкие организаторские навыки, являюсь чл...",Python|SQL|Математическая статистика|Аналитиче...,2022.0,Высшее образование,"Национальный исследовательский университет ""Вы...",,1.0,/resume/41f4c1d600073f62b10039ed1f516561794855...,Аналитик BI
2,Аналитик BI,21.0,180000.0,21.0,Мужчина,Москва,полная занятость,"удаленная работа, гибкий график, полный день",6,"ДИКСИ, группа компаний",...,0.0,Навыки:-Разработка и поддержка отчетности на б...,MS SQL|Business Intelligence Systems|Oracle Pl...,2022.0,Неоконченное высшее образование,Финансовый университет при Правительстве Росси...,,1.0,/resume/5ab4a3300007866f000039ed1f754a31314344...,Аналитик BI
3,Разработчик BI,27.0,,43.0,Мужчина,Москва,"проектная работа, частичная занятость, полная ...","удаленная работа, гибкий график, полный день, ...",3,Проект,...,,Ищу работу в штат компании на постоянной основ...,Управленческая отчетность|Долгосрочное бюджети...,2012.0,Высшее образование (Магистр),Дальневосточный государственный технический ры...,,1.0,/resume/75aa2b2100053492d70039ed1f6e5748375a75...,Аналитик BI
4,Аналитик BI (junior),157.0,40000.0,35.0,Женщина,Москва,"проектная работа, частичная занятость, полная ...","удаленная работа, гибкий график, полный день, ...",3,Школа иностранных языков Алибра,...,,Почти 12 лет я работала в туристической компан...,Python|PostgreSQL|1С: Предприятие 7|MS Excel|G...,2021.0,Высшее образование,,,0.0,/resume/02c4130b00050d82080039ed1f7a3634376a75...,Аналитик BI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128093,Python разработчик,75.0,,29.0,Мужчина,Нижний Новгород,,,4,Студия ТГ,...,,Ссылка на мой GitHub: https://github.com/eagurin,Python|Docker|Git|Django Framework|FastAPI|Pos...,2016.0,Неоконченное высшее образование,Нижегородский государственный архитектурно-стр...,,1.0,/resume/b32dfa7400084135f60039ed1f4558787a4f6b...,Базы данных
128094,Оператор ЭВМ,103.0,40000.0,46.0,Женщина,Москва,,,4,Поликлиника № 1 РАН,...,,"веду активный образ жизни , легко вступаю в ко...",Работа с базами данных|Умение работать в коман...,1992.0,Среднее специальное образование,,,0.0,/resume/34353d720004417d790039ed1f6f34377a706f...,Базы данных
128095,"Экономист, менеджер по работе с клиентами, опе...",128.0,45000.0,42.0,Женщина,Истра,,,1,"""Тушино-пиво""",...,,"уверенный пользователь ПК, 1с\rкоммуникабельн...",Уверенный пользователь ПК|Опыт работы с больши...,2003.0,Высшее образование,Московский автомобильно-дорожный государственн...,,1.0,/resume/3cbe08140001f43fbf0039ed1f4a5272484942...,Базы данных
128096,Специалист,98.0,,37.0,Женщина,Протвино,,,3,"ООО "" СИСТЕЛ АВТОМАТИЗАЦИЯ""",...,,"Ответственная, коммуникабельная, пунктуальная",MS Excel|MS Word|MS Access|MS PowerPoint|MS Pa...,2006.0,Высшее образование,Филиал Московского государственного техническо...,,1.0,/resume/94e033ac0002480dfd0039ed1f3977494b4b52...,Базы данных


In [6]:
# Export the loaded frame to an Excel workbook, omitting the index column.
df.to_excel('resume_128k.xlsx', index = False)

  (force_unicode(url), max_url))
  (force_unicode(url), max_url))
  (force_unicode(url), max_url))


  (force_unicode(url), max_url))
  (force_unicode(url), max_url))
