# Part 1

This part of the program collects unique resume identification numbers from the online job portal HeadHunter.ru (hh.ru).  

## Initialize log file (only once)

In [20]:
def init_log_writer(log_path, col_names):
    '''Create (or truncate) the log file and write its header line.

    Arguments:
        log_path (str) -- Relative or absolute path to log file.
        col_names (list) -- Column names; written as one tab-separated line.

    Returns: the binary file object the header was written through. It has
        already been closed by the ``with`` block when it is returned, so it
        cannot be used for further writes.
    '''
    with open(log_path, 'wb') as log_writer:
        # Header is encoded as UTF-8 and terminated with a single newline.
        log_writer.write("\t".join(col_names).encode('utf-8') + b'\n')
    return log_writer

# Initialize log file before running the main program.
# Once log file is initialized, no need to repeat this code:
# running it again truncates the existing log back to just the header line.
import os

# Defined in this cell (same value as in the scraping cell) so that it also
# runs on a fresh kernel, where the scraping cell has not been executed yet.
resumes_path = "hh_data/hh_resumes/"
# open(..., 'wb') does not create missing parent folders, so make sure the
# output folder exists before the log file is created in it.
os.makedirs(resumes_path, exist_ok=True)

log_path = resumes_path + "resumes_log.txt"
col_names = ["Request time", "Unexpected error", "Applicants added"]
log_file = init_log_writer(log_path, col_names) 

## Creating a new directory for a given day

Every directory contains the csv files that were created on the same day. The name of a directory is the date on which its files were created.  

In [26]:
import os
import datetime as dt

def directory(resumes_directory):
    '''Make sure the directory for a given day exists.

    Argument:
        resumes_directory (str) -- Relative or absolute path to the directory.

    Returns: the same path that was passed in. Missing intermediate
        directories are created as well; an existing path is left untouched.
    '''
    if os.path.exists(resumes_directory):
        # Nothing to create: the path is already present.
        return resumes_directory
    os.makedirs(resumes_directory)
    return resumes_directory

## Write information to log file

In [27]:
def log_writer(log_path, log_row):
    '''Append one tab-separated row to the log file.

    Arguments:
        log_path (str) -- Relative or absolute path to log file.
        log_row (list) -- Values for one log line. Items that are not strings
            (numbers, exception classes, ...) are converted with ``str()``.

    Returns: the binary file object the row was written through. It has
        already been closed by the ``with`` block when it is returned.
    '''
    with open(log_path, 'ab') as log_file:
        # str() each cell: str.join() raises TypeError on any non-string item,
        # which would otherwise abort the caller just when it logs an error.
        row = "\t".join(str(value) for value in log_row) + '\n'
        log_file.write(row.encode('utf-8'))
    return log_file

## Write data to file

In [28]:
def data_writer(data_path, data_row):
    '''Write rows of data to a file, one comma-separated line per row.

    Arguments:
        data_path (str) -- Relative or absolute path to data file. The file is
            created, or overwritten if it already exists.
        data_row (list) -- Rows to write; each row is an iterable of strings.

    Returns: the binary file object the rows were written through. It has
        already been closed by the ``with`` block when it is returned.
    '''
    with open(data_path, "wb") as toWrite:
        # Rows are encoded lazily, so they reach the file in order, one by one.
        toWrite.writelines(
            (",".join(fields) + '\n').encode('utf-8') for fields in data_row
        )
    return toWrite

## Scraping data from the website hh.ru

The main goal of this program is to collect unique resume numbers for all the job seekers who have recently posted their resumes on the website. Every applicant may post more than one resume. All resume numbers from the same person are written on the same row in the csv data file.   

In [33]:
import requests
from bs4 import BeautifulSoup
from time import sleep
import datetime as dt
import sys
import csv

# Scrape the hh.ru resume search every two hours for one week. Each cycle
# writes one csv file with the collected resume ids and appends one line
# (request time, error status, number of applicants) to the log file.

resumes_path = "hh_data/hh_resumes/"
log_path = resumes_path + "resumes_log.txt"

# URL to search resumes with specific parameters:
# age: from 16 to 60;
# area: Russia; citizenship: Russian;
# salary: find resumes that state desired salary;
# type of employment: full-time.
# The URL is built from adjacent string literals inside parentheses. A
# backslash continuation inside a single literal would keep the trailing space
# and the next line's indentation as part of the query string.
url = ('https://hh.ru/search/resume?age_to=60&items_on_page=100&order_by=publication_time&'
       'citizenship=113&area=113&text=&pos=full_text&label=only_with_salary&label=only_with_age&'
       'exp_period=all_time&logic=normal&clusters=true&age_from=16&employment=full&page=')
headers = {'User-Agent': 'career-success (olga.boldareva@gmail.com)'}
last_page = 50  # checking resumes on 50 pages
request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever

now = dt.datetime.today()
final_time = now + dt.timedelta(days=7)  # time to stop scraping

while now < final_time:  # scrape until final_time

    # One timestamp per cycle, so the file name and the day directory can
    # never disagree when a cycle starts right at midnight.
    started = dt.datetime.now()
    date_prefix = started.strftime('%Y-%m-%d--%H-%M-%S')  # exact time of scraping
    log_row = [date_prefix]  # the first element to be added to a log file

    # A new directory is created for every day of scraping:
    resumes_directory = resumes_path + started.strftime('%Y-%m-%d')
    directory(resumes_directory)  # create a directory if it did not exist before

    resumes_list = []  # one sorted list of resume ids per applicant
    request_status = 'No errors'  # replaced by the error name if a request fails

    # Collect resume numbers for every applicant using specific search url:
    for i in range(last_page):
        url_page = url + str(i)  # url for every page
        try:
            response = requests.get(url_page, headers=headers, timeout=request_timeout)
        except Exception as error:
            # Best effort: give up on this cycle but keep what was collected.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
            print("Unexpected error:", type(error))
            # Logged as text, because the log line is built by joining strings.
            request_status = type(error).__name__
            break

        if response.status_code == 200:  # 200 sends the right response
            soup = BeautifulSoup(response.text, 'html.parser')
            # Every cell of this class identifies a unique applicant with one
            # or more resumes available.
            resumes_summary = soup.find_all('td', class_='output__main-cell')
            for item in resumes_summary:
                # href=True skips anchors without a link instead of raising KeyError.
                # [8:] drops the first 8 characters of the link -- presumably
                # the '/resume/' prefix; TODO confirm against the live markup.
                res_one_appl = [resume['href'][8:]
                                for resume in item.find_all('a', href=True)]
                # Adding all resumes (sorted by name) from one applicant:
                resumes_list.append(sorted(res_one_appl))
        else:
            print(response.status_code)

    log_row.append(request_status)
    log_row.append(str(len(resumes_list)))  # number of applicants collected in this cycle

    data_path = resumes_directory + "/resumes_" + date_prefix + ".csv"
    data_writer(data_path, resumes_list)  # write data to csv file

    log_writer(log_path, log_row)  # save information to log file

    sleep(7200)  # 2 hours to wait until the next request
    now = dt.datetime.today()

else:
    print("Scraping is finished!")

Scraping is finished!
