# AviaHack (UTAir, AviaJob)

Made by **Pelmeni** team

### Matching between single CV & job

In [6]:
# Initializing FireBase admin

import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import json

# Use the application default credentials
cred = credentials.Certificate("hrapp-b56d4-firebase-adminsdk-w16yw-8319963b70.json")
firebase_admin.initialize_app(cred)

db = firestore.client()

ValueError: The default Firebase app already exists. This means you called initialize_app() more than once without providing an app name as the second argument. In most cases you only need to call initialize_app() once. But if you do want to initialize multiple apps, pass a second argument to initialize_app() to give each app a unique name.

In [180]:
# Collecting data from FireBase

def all_employees_cvs():
    cvs = []
    users = db.collection(u'users').stream()
    for user in users:
        if (u'type' and u'cv') in user.to_dict() and user.get(u'type') == u'employee':
            cvs.append(json.loads(user.get(u'cv')))
    return cvs


def all_hrs_cvs():
    cvs = []
    users = db.collection(u'users').stream()
    for user in users:
        if (u'type' and u'cv') in user.to_dict() and user.get(u'type') == u'employer':
            cvs.append(json.loads(user.get(u'cv')))
    return cvs

all_hrs_cvs()

[{'name': 'ИП Аэрофлотов Икар Петрович | Кадрова Екатерина Подборовна',
  'title': 'Пилот воздушного судна',
  'city': 'Москва',
  'occupation': '5',
  'weekend': '2',
  'salary': '30000',
  'responsibilities': ['Управление воздушным судном', 'Пилотирование'],
  'requirements': {'vital': ['Ответственность', 'Умение управлять самолётом'],
   'extra': ['Владение английским языком']},
  'tags': ['пилот', 'самолёт']}]

In [188]:
# Loading data from JSONs

# Test
resume = "test_resume.txt"                # JSON
job_description = "test_description.txt"  # JSON

resume = open(resume, 'r').read()
job_description = open(job_description, 'r').read()

resume_data = json.loads(resume)
job_data = json.loads(job_description)

#for key, value in resume_data.items():
#    print(key, value)

# !Test

#resume_data = all_employees_cvs()[0]

In [189]:
text = [resume, job_description]

# Filtering

def is_valid(resume_data, job_data):
    if resume_data['city'] != job_data['city'] and resume_data['relocate'] == False:
        return False
    if resume_data['occupation'] != job_data['occupation'] and resume_data['weekend'] != job_data['weekend'] and resume_data['occupation'] != "-1":
        return False
    if job_data['salary'] < resume_data['salary']:
        return False
    if not list(set(resume_data['tags']) & set(job_data['tags'])):
        return False
    reqs_length = len(list(set(job_data['requirements']['vital'])))
    if len(list(set(resume_data['skills']) & set(job_data['requirements']['vital']))) != reqs_length:
        return False
    return True
    
if not is_valid(resume_data, job_data):
    print("Candidate is inapropriate for this job")

In [190]:
# Preprocessing

import Levenshtein
resume_title = resume_data['title']
job_title = job_data['title']
title_match_percentage = (1 - Levenshtein.distance(resume_title, job_title) / len(job_title)) * 100
print("Your resume title matches about " + str(title_match_percentage) + "% of the job title.")

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
count_matrix = cv.fit_transform(text)

from sklearn.metrics.pairwise import cosine_similarity
match_percentage = cosine_similarity(count_matrix)[0][1] * 100
print("Your resume generally matches about " + str(match_percentage) + "% of the job description.")

Your resume title matches about 77.27272727272727% of the job title.
Your resume generally matches about 43.26234269258105% of the job description.


In [191]:
# Hard & soft skills division

import re
import pandas as pd
import sys, os
import numpy as np
import nltk
import operator
import math


class Extractor():
        def __init__(self, resume, job):
                self.softskills=self.load_skills('softskills_ru.txt')
                self.hardskills=self.load_skills('hardskills_ru.txt')
                self.jb_distribution=self.build_ngram_distribution(job)
                self.cv_distribution=self.build_ngram_distribution(resume)
                self.table=[]

        def load_skills(self,filename):
                f=open(filename,'r')
                skills=[]
                for line in f:
                        skills.append(self.clean_phrase(line)) 
                f.close()
                return list(set(skills))


        def build_ngram_distribution(self,filename):
                n_s=[1,2,3]
                dist={}
                for n in n_s:
                        dist.update(self.parse_file(filename,n))
                return dist
                        

        def parse_file(self,filename,n):
                f=open(filename,'r')
                results={}
                for line in f:
                        words=self.clean_phrase(line).split(" ")
                        ngrams=self.ngrams(words,n)
                        for tup in ngrams:
                                phrase=" ".join(tup)
                                if phrase in results.keys():
                                        results[phrase]+=1
                                else:
                                        results[phrase]=1
                return results

        
        def clean_phrase(self,line):
                return re.sub(r'[^\w\s]','',line.replace('\n','').replace('\t','').lower())             


        def ngrams(self,input_list, n):
                return list(zip(*[input_list[i:] for i in range(n)]))

        def measure1(self,v1,v2):
                return v1-v2

        def measure2(self,v1,v2):
                return max(v1-v2,0)

        def measure3(self,v1,v2):
                sumxx, sumxy, sumyy = 0, 0, 0
                for i in range(len(v1)):
                        x = v1[i]; y = v2[i]
                        sumxx += x*x
                        sumyy += y*y
                        sumxy += x*y
                if math.sqrt(sumxx*sumyy) == 0:
                        return 0
                return sumxy/math.sqrt(sumxx*sumyy)


        def printMeasures(self, skills_type):
                n_rows=len(self.table)
                        
                for type in [skills_type]:
                        v1=[self.table[jb][2] for jb in range(n_rows) if self.table[jb][0]==type]
                        v2=[self.table[cv][3] for cv in range(n_rows) if self.table[cv][0]==type]
                        return self.measure3(v1,v2) * 100          


        def makeTable(self):            
                parts_of_speech=['CD','JJ','JJR','JJS','MD','NN','NNS','NNP','NNPS','RB','RBR','RBS','VB','VBD','VBG','VBN','VBP','VBZ']
                graylist=["you", "will"]
                tmp_table=[]
                
                for skill in self.hardskills:
                        if skill in self.jb_distribution:
                                count_jb=self.jb_distribution[skill]
                                if skill in self.cv_distribution:
                                        count_cv=self.cv_distribution[skill]
                                else:
                                        count_cv=0
                                m1=self.measure1(count_jb,count_cv)
                                m2=self.measure2(count_jb,count_cv)
                                tmp_table.append(['hard',skill,count_jb,count_cv,m1,m2])

                for skill in self.softskills:
                        if skill in self.jb_distribution:
                                count_jb=self.jb_distribution[skill]
                                if skill in self.cv_distribution:
                                        count_cv=self.cv_distribution[skill]
                                else:
                                        count_cv=0
                                m1=self.measure1(count_jb,count_cv)
                                m2=self.measure2(count_jb,count_cv)
                                tmp_table.append(['soft',skill,count_jb,count_cv,m1,m2])

                general_language = sorted(self.jb_distribution.items(), key=operator.itemgetter(1),reverse=True)
                for tuple in general_language:
                        skill = tuple[0]
                        if skill in self.hardskills or skill in self.softskills or skill in graylist:
                                continue
                        count_jb = tuple[1]
                        tokens=nltk.word_tokenize(skill)
                        parts=nltk.pos_tag(tokens)
                        if all([parts[i][1]in parts_of_speech for i in range(len(parts))]):
                                if skill in self.cv_distribution:
                                        count_cv=self.cv_distribution[skill]
                                else:
                                        count_cv=0
                                m1=self.measure1(count_jb,count_cv)
                                m2=self.measure2(count_jb,count_cv)
                                tmp_table.append(['general',skill,count_jb,count_cv,m1,m2])
                self.table=tmp_table


def calculate_skills():
        K=Extractor("test_resume.txt", "test_description.txt")
        K.makeTable()
        hard = K.printMeasures('hard')
        soft = K.printMeasures('soft')
        return [hard, soft]

calculate_skills()

[57.73502691896258, 100.0]

In [192]:
# Metrics calculation

import statistics

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize)
 
def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1] * 100

def calculate_dist(text1, text2):
    return (1 - Levenshtein.distance(text1, text2) / max(len(text1), len(text2))) * 100


objective_score = calculate_dist(resume_data['objective'], ' '.join(job_data['responsibilities']))
if objective_score < match_percentage and objective_score + match_percentage < 100:
    objective_score += match_percentage
print("Your objective matches by "+str(objective_score)+"%.")

#TODO: Отдавать предпочтение ВУЗам по специальности тематики вакансий

education_score = 100
for curr_req in job_data['requirements']['extra']:
    for tmp in curr_req:
        if "Образование" in tmp.split():
            education_score = 0
            for curr_ed in resume_data['education']:
                    if cosine_sim(curr_ed['education_score'], job_data['title']) > 0:
                        education_score += match_percentage
                    if curr_ed['certified']:
                        education += 10
education_score = min(100, education_score)
print("Your education matches by "+str(education_score)+"%.")

experience_score = 100
experience_count = 0
for curr_req in job_data['requirements']['extra']:
    for tmp in curr_req:
        if "Опыт" in tmp.split():
            experience_score = 0
            for curr_exp in resume_data['experience']:
                if (curr_exp['end_date'] > 2016):
                    experience_count += (curr_exp['end_date'] - curr_exp['start_date']) + match_percentage
                if curr_exp['certified']:
                    experience_count += 10
experience_count = min(100, experience_count)
print("Your experience matches by "+str(experience_score)+"%.")

skills_score = statistics.mean(calculate_skills())
print("Your skills matched by "+str(skills_score)+"%")

tags_score = len(list(set(resume_data['tags']) & set(job_data['tags']))) / max(len(resume_data['tags']), len(job_data['tags'])) * 100
print("Your tags matches by "+str(tags_score)+"%.")

Your objective matches by 62.66532776720791%.
Your education matches by 100%.
Your experience matches by 100%.
Your skills matched by 78.86751345948129%
Your tags matches by 50.0%.


In [193]:
# Linear regression for total matching calculation

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

df = pd.read_csv('dataset.csv')

X = df.drop('overall_score', axis=1)
y = df['overall_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
#print(regressor.score(X_train, y_train))

d = {"education_score": [education_score], "experience_score": [experience_score], "skills_score": [skills_score]}
X_input = pd.DataFrame(data=d)
overall_score = regressor.predict(X_input)[0]
print(min(abs(overall_score), 100))

87.7792486259756


In [187]:
# Sending back all data to FireBase


def send_compute_res(login, d):
    db.collection(u'users').document(u'{}'.format(login)).update({u'computation_results': json.dumps(d)})
    


### Matching between multiple CVs & jobs

In [None]:
# Make it on server

## Additional features

### CVs parsing from hh.ru

In [None]:
# TODO

### PDF resume to JSON parsing

In [None]:
import io
import json
import re

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

password = b""
pagenos = set()
maxpages = 0
caching = True
outtype = "text"


def _get_content(fname):
    rsrcmgr = PDFResourceManager(caching=caching)
    laparams = LAParams()
    laparams.line_margin = 1.0
    laparams.boxes_flow = 1.0
    imagewriter = None
    with io.BytesIO() as outfp:
        device = TextConverter(
            rsrcmgr, outfp, laparams=laparams, imagewriter=imagewriter
        )
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(fname, "rb") as f:
            for page in PDFPage.get_pages(
                    f,
                    pagenos,
                    maxpages=maxpages,
                    password=password,
                    caching=caching,
                    check_extractable=True,
            ):
                interpreter.process_page(page)
        return outfp.getvalue().decode("utf-8")


def _parse(content):
    """Parse the content of the Resume pdf and return the sections with detail"""
    # add NULL to prefix and suffix of the heading
    # to easily split the sections
    sections = (
        section.strip()
        for section in re.sub(r"(\w+.*\w+)\s+_{2,}", "\0\g<1>\0", content).split("\x00")
        if section.strip()
    )

    # iter_sections = iter(sections)
    detail = next(sections)  # this one will be the head contain name, phone and address

    # x = [(a,b) for a,b in zip(sections[1::2], sections[2::2])]
    x = [(heading, body) for heading, body in zip(sections, sections)]

    match = re.search(
        r"(?P<name>\w+\s*\w+)\s*(?P<phone>\(\w+\)\s*(\w+)\-(\w+))\W+(?P<email>.*@.[^ ]*)\W+(?P<address>.*)",
        detail,
    )
    details = None
    if match:
        details = match.groupdict()

    details = {k.strip(): v.strip() for k, v in details.items()}

    for k, v in x:
        details[k] = "".join(line.strip() for line in v.strip().split("\n"))

    return details


def pdf2json(pdf_name, json_file):
    """Dump details in JSON from resume pdf"""
    content = _get_content(pdf_name)
    data = _parse(content)
    open(json_file, "w").write(json.dumps(data, indent=4))


import os
from google.cloud import storage

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'GoogleCloudServerAccount.json'
my_bucket_name = 'hrapp-b56d4.appspot.com'
path = "data/"


def _download_blob(bucket_name, source_blob_name, destination_file_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)


def _download_user_cv(login, dest):
    _download_blob(my_bucket_name, path + login + "/cv1.pdf", dest)


def user_json_cv(login, json_path):
    tmp_path = "tmp.pdf"
    _download_user_cv(login, tmp_path)
    pdf2json(tmp_path, json_path)
    os.remove(tmp_path)

user_json_cv("user2", 'user2-cv.json')
