# Part 2

This part of the program takes the unique resume identification numbers collected in Part 1 and sends API requests to get the necessary information about every job seeker and about the job postings similar to his/her resume.

## Working with data

### Initialize data writer

In [95]:
def init_data_writer(data_path, col_names):
    ''' Create (or truncate) the data file and write its header line.

    Arguments:
        data_path (str) -- Relative or absolute path to data file.
        col_names (list) -- Column names (str), written tab-separated
            as the first line of the file.

    Returns: the binary file object the header was written through.
    The file is closed again before this function returns, so the
    returned object can no longer be written to.
    '''
    with open(data_path, 'wb') as header_file:
        # The file is binary, so the header is encoded explicitly as UTF-8.
        header_line = "%s\n" % "\t".join(col_names)
        header_file.write(header_line.encode('utf-8'))
    return header_file

### Write data to data writer

In [96]:
def data_writer(data_path, data_row):
    '''Append rows to the tab-separated data file.

    Arguments:
        data_path (str) -- Relative or absolute path to data file.
        data_row (list) -- Rows to be added to data file; every row is
            itself a list of fields. Fields that are not strings
            (numbers, None) are converted with str() before writing.

    Returns: the binary file object the rows were appended through.
    The file is closed again before this function returns.
    '''
    with open(data_path, "ab") as toWrite:
        for item in data_row:
            # str() keeps a single non-string field (e.g. a null value
            # from the API) from aborting the whole row with a TypeError.
            row = "\t".join(str(field) for field in item) + '\n'
            toWrite.write(row.encode('utf-8'))
    return toWrite

### Combine several csv files into one big csv file

In [97]:
from glob import glob
import csv

def combine_files(data_file_path, csv_path, date_prefix_dir):
    ''' Append the contents of all matching csv files to one csv file.

    Arguments:
        data_file_path (str) -- Relative or absolute path to the combined
            (output) csv file; it is opened in append mode.
        csv_path (str) -- Glob pattern matching the csv files that need
            to be combined.
        date_prefix_dir (str) -- Date identifier; a file named
            "main_<date_prefix_dir>.csv" is never used as input.

    Returns: data_file_path (str), unchanged.
    '''
    import os  # local import so the block does not rely on another cell

    main_name = "main_" + date_prefix_dir + ".csv"
    output_real_path = os.path.realpath(data_file_path)

    with open(data_file_path, 'a') as singleFile:
        # Sorted so that the row order of the combined file is reproducible.
        for source_path in sorted(glob(csv_path)):
            # glob returns paths including the directory part, so the
            # file name has to be extracted before comparing.
            if os.path.basename(source_path) == main_name:
                continue
            # Never read the file that is being appended to.
            if os.path.realpath(source_path) == output_real_path:
                continue
            with open(source_path, 'r') as source:
                for line in source:
                    # A final line without a newline would otherwise be
                    # merged with the first line of the next file.
                    if not line.endswith('\n'):
                        line += '\n'
                    singleFile.write(line)
    return data_file_path

### Finding relevant vacancies for every applicant from API search

In [98]:
import requests
import json

def Vacancies_search(url, area, specialization_1, specialization_2, specialization_3):
    ''' Collect salary statistics for vacancies (job postings) matching a resume.

    Arguments:
        url (str) -- URL to search job postings with specific parameters.
        area (str) -- code of the city where to search for job postings.
        specialization_1 (str) -- specialization 1 mentioned in a job posting.
        specialization_2 (str) -- specialization 2 mentioned in a job posting;
            '0' means "not set".
        specialization_3 (str) -- specialization 3 mentioned in a job posting;
            '0' means "not set". It is only used when specialization_2 is set.

    Returns: list of str. The first element is the number of job postings
    found that state a salary in RUR. When that number is greater than zero
    it is followed by the mean (rounded to 2 decimals), minimum, and maximum
    of the per-posting salary levels.
    '''
    specializations = [specialization_1]
    if specialization_2 != '0':
        specializations.append(specialization_2)
        if specialization_3 != '0':
            specializations.append(specialization_3)

    url_search = url + '&area=' + str(area) + \
        ''.join('&specialization=' + str(code) for code in specializations) + \
        '&employment=full&only_with_salary=true&order_by=relevance&per_page=500&page=0'

    headers = {'User-Agent': 'career-success (olga.boldareva@gmail.com)'}
    # A timeout keeps one stalled connection from blocking the whole run.
    response = requests.get(url_search, headers=headers, timeout=30)
    search_results = json.loads(response.text)

    salary_av_list = []
    for vacancy in search_results['items']:
        salary = vacancy['salary']
        if salary is None or salary['currency'] != 'RUR':
            continue
        salary_from = salary['from']
        salary_to = salary['to']
        if salary_from is None and salary_to is None:
            # Nothing to average; a None entry would break sum() below.
            continue
        if salary_to is None:
            salary_av = salary_from
        elif salary_from is None:
            salary_av = salary_to
        else:
            salary_av = (salary_to + salary_from) / 2.0
        salary_av_list.append(salary_av)

    salary_list = [str(len(salary_av_list))]
    if salary_av_list:
        salary_mean = round(sum(salary_av_list) / float(len(salary_av_list)), 2)
        salary_list.append(str(salary_mean))
        salary_list.append(str(min(salary_av_list)))
        salary_list.append(str(max(salary_av_list)))
    return salary_list

### Getting additional information about applicants from API requests:

In [99]:
import requests
import json

def _resume_field(resume, *path, default='None'):
    '''Return the value found by following path through resume, as str.

    default is returned when any step of the path is missing or null.
    '''
    value = resume
    try:
        for key in path:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        # TypeError covers subscripting a null intermediate value.
        return default
    return default if value is None else str(value)


def API_resume_information(url_resume):
    '''Collect all the necessary information about an applicant using his/her resume
    and job postings similar to this resume.

    Arguments:
        url_resume (str) -- URL to access all the information available about a job-seeker.

    Returns: list of str. The first element is the HTTP status code. For a
    200 response it is followed by birth date, gender, area, title, three
    specialization codes (padded with '0'), salary amount, salary currency,
    employment, education level, total experience in months and citizenship;
    missing values are written as 'None' ('0' for experience). When the
    resume has at least one specialization and an area, the list returned
    by Vacancies_search() is appended; if that search fails nothing is
    appended.
    '''
    headers = {'User-Agent': 'career-success (olga.boldareva@gmail.com)'}
    # A timeout keeps one stalled connection from blocking the whole run.
    response = requests.get(url_resume, headers=headers, timeout=30)
    resume_inf = [str(response.status_code)]
    if response.status_code != 200:
        # The body is only parsed for successful responses, so an error
        # page that is not JSON cannot hide the status code.
        return resume_inf

    resume = json.loads(response.text)

    resume_inf.append(_resume_field(resume, 'birth_date'))
    resume_inf.append(_resume_field(resume, 'gender', 'id'))
    area = _resume_field(resume, 'area', 'id', default=None)
    resume_inf.append('None' if area is None else area)
    resume_inf.append(_resume_field(resume, 'title'))

    # At most three specializations are kept; unused slots are '0'.
    listed = resume.get('specialization') or []
    specializations = [str(item['id']) for item in listed[:3]]
    specializations += ['0'] * (3 - len(specializations))
    resume_inf.extend(specializations)
    specialization_1, specialization_2, specialization_3 = specializations

    resume_inf.append(_resume_field(resume, 'salary', 'amount'))
    resume_inf.append(_resume_field(resume, 'salary', 'currency'))
    resume_inf.append(_resume_field(resume, 'employment', 'id'))
    resume_inf.append(_resume_field(resume, 'education', 'level', 'id'))
    resume_inf.append(_resume_field(resume, 'total_experience', 'months', default='0'))
    resume_inf.append(_resume_field(resume, 'citizenship', 0, 'id'))

    if specialization_1 != '0' and area is not None:
        url = 'https://api.hh.ru/vacancies?'
        try:
            salary_row = Vacancies_search(url, area, specialization_1, specialization_2, specialization_3)
        except Exception:
            # Best effort: the resume data is still returned when the
            # vacancy search fails.
            salary_row = []
        resume_inf = resume_inf + salary_row
    return resume_inf

### Sending API requests and saving information in txt file

In [106]:
import requests
import json
from time import sleep
import datetime as dt

def API_requests(date_prefix_dir, data, num_columns, num_rows, max_rows=10):
    '''Send requests to hh.ru API using unique resume identification numbers
    provided in a DataFrame.

    Arguments:
        date_prefix_dir (str) -- date identifier.
        data (DataFrame) -- DataFrame that contains unique id for every applicant
        (column "id") and all resume identification numbers he/she posted on the
        website (columns "resumeId_1" ... "resumeId_<num_columns>"). Rows are
        looked up by index labels 1 ... num_rows.
        num_columns (int) -- number of resumeId columns in a DataFrame provided.
        num_rows (int) -- number of rows in a DataFrame provided.
        max_rows (int or None) -- upper limit on the number of rows processed.
        The default of 10 keeps the previously hard-coded limit; pass None to
        process all num_rows rows.

    Returns: the file object returned by the last write to the txt file with
    complete information about an applicant and related job postings (the
    header write if no row was written). The file is already closed.
    '''
    url = 'https://api.hh.ru/resumes/'

    resumes_path = "hh_data/hh_resumes/"
    data_path = resumes_path + "final_datasets/final_" + date_prefix_dir + ".txt"
    col_names = ["id", "resumeId", "response_code", "birth_date", "gender", "area", "title", \
             "specialization_1", "specialization_2", "specialization_3", "salary_amount", \
             "salary_currency", "employment", "education_level", "total_experience", \
             "citizenship", "number_vac", "salary_av_vac", "salary_min_vac", "salary_max_vac"]
    # Start from the header write so the return value is defined even
    # when no resume row is written.
    last_write = init_data_writer(data_path, col_names)

    # Never go past the rows that exist in the DataFrame.
    last_row = num_rows if max_rows is None else min(num_rows, max_rows)
    for i in range(1, last_row + 1):
        record = data.loc[i]
        has_value = record.notnull()
        for j in range(1, num_columns + 1):
            column = "resumeId_" + str(j)
            if not has_value[column]:
                # Stop at the first empty resumeId column of this applicant.
                break
            data_res = [str(record['id']), record[column]]
            url_resume = url + data_res[1]
            try:
                data_res = data_res + API_resume_information(url_resume)
            except Exception as error:
                # Best effort: the id and resumeId are still written so the
                # failed request stays visible in the output file.
                print("Request for resume {} failed: {}".format(data_res[1], error))
            last_write = data_writer(data_path, [data_res])

            sleep(1)

    return last_write

### Combine all files together, delete duplicates, and convert csv file to dataframe

In [104]:
import datetime as dt
from time import sleep
import pandas as pd
import csv
import os

def data_file():
    '''Combine csv files, delete duplicates, convert csv to dataframe, and save main csv files.

    All csv files in today's directory under "hh_data/hh_resumes/" are
    combined into "main_<date>.csv". Duplicate rows are dropped and every
    remaining row gets an id built from the month and day plus a running
    number. The result is saved as "main_<date>_no_duplicates.csv" and
    passed on to API_requests().

    No arguments.

    Returns: the value returned by API_requests(), or None when the
    combined csv file contains no data.
    '''
    resumes_path = "hh_data/hh_resumes/"
    now = dt.datetime.now()
    date_prefix_dir = now.strftime('%Y-%m-%d')
    date_prefix_id = now.strftime('%m%d')
    resumes_directory = resumes_path + date_prefix_dir
    data_file_path = resumes_directory + "/main_" + date_prefix_dir + ".csv"
    data_file_path_no_dup = resumes_directory + "/main_" + date_prefix_dir + "_no_duplicates.csv"

    # Remove results of an earlier run on the same day so they are
    # neither appended to nor picked up as input.
    for stale_path in (data_file_path, data_file_path_no_dup):
        if os.path.exists(stale_path):
            os.remove(stale_path)

    csv_path = resumes_directory + "/*.csv"
    combine_files(data_file_path, csv_path, date_prefix_dir)

    # Rows have different lengths; the widest row fixes the column count.
    with open(data_file_path, "r") as csvfile:
        num_columns = max((len(row) for row in csv.reader(csvfile)), default=0)
    if num_columns == 0:
        # Nothing was collected, so there is nothing to request.
        return None

    names_columns = ['resumeId_' + str(i + 1) for i in range(num_columns)]

    df = pd.read_csv(data_file_path, sep = ',',
    header = None,
    usecols = range(num_columns),
    names = names_columns,
    encoding = 'utf-8',
    dtype = str,
    low_memory = False)

    # An explicit copy lets the 'id' column be added without a
    # chained-assignment warning.
    data = df.drop_duplicates(keep = 'first').copy() #drop all row duplicates

    # Index labels 1..n; a plain range avoids pd.Int64Index, which newer
    # pandas versions no longer provide.
    data.index = range(1, len(data) + 1)
    data['id'] = int(date_prefix_id)*100000 + data.index
    num_rows = data.shape[0]
    data.to_csv(data_file_path_no_dup, sep=',', index = False, encoding='utf-8')

    return API_requests(date_prefix_dir, data, num_columns, num_rows)

### Set timer
Resume numbers are collected throughout the day, so processing the data should start at the end of the day. This program starts at 11:55 pm and works with all the data gathered during that day.

In [105]:
import datetime as dt
from threading import Timer

def timer_fnc(delay):
    '''Schedule data_file() to run on a background timer thread.

    Argument:
        delay (float) -- number of seconds to wait before data_file() is called.

    Returns: the Timer thread, already started.
    '''
    scheduled = Timer(delay, data_file)
    scheduled.start()
    return scheduled


# Take the current time once so the comparison and the delay agree.
now = dt.datetime.today()

# Today's target run time.
# NOTE(review): the accompanying text says the program starts at 11.55 pm
# (hour=23, minute=55) while this value is 16:31 -- confirm which is intended.
nextTime = now.replace(hour=16, minute=31, second=0, microsecond=0)

# Nothing is scheduled when the target time has already passed today.
if nextTime > now:
    delay = (nextTime - now).total_seconds() + 1
    print("Delay is {}".format(delay))
    timer_fnc(delay)

Delay is 75.236577
data_file function is about to start!
                               resumeId_1 resumeId_2 resumeId_3 resumeId_4  \
1  1ecfcee700034fa06b0039ed1f5854754b5152        NaN        NaN        NaN   
2  6b3f75d00003b360c10039ed1f32756c4b4437        NaN        NaN        NaN   
3  1d42fee200023043a40039ed1f55334f315468        NaN        NaN        NaN   
4  b4b0d1b20003685d3b0039ed1f34534d4c6761        NaN        NaN        NaN   
5  a6f27d8800021c15510039ed1f757157485875        NaN        NaN        NaN   
6  2baeeab000004c4ae80039ed1f736563726574        NaN        NaN        NaN   

  resumeId_5 resumeId_6 resumeId_7 resumeId_8 resumeId_9 resumeId_10  \
1        NaN        NaN        NaN        NaN        NaN         NaN   
2        NaN        NaN        NaN        NaN        NaN         NaN   
3        NaN        NaN        NaN        NaN        NaN         NaN   
4        NaN        NaN        NaN        NaN        NaN         NaN   
5        NaN        NaN        NaN  