# Installing dependencies, importing libraries, and downloading the required models

In [1]:
!pip install stanza



In [2]:
import pandas as pd
import numpy as np
import stanza
import nltk
from nltk.corpus import stopwords
from string import punctuation
import json
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from bs4 import BeautifulSoup
import re
import json
import requests

# Download the NLTK stop-word corpus so that stopwords.words() below can read it.
nltk.download("stopwords")
# Russian stop words; del_stopwords() tests token lemmas against this collection.
russian_stopwords = stopwords.words("russian")
# Download the default Russian Stanza models before building the pipeline.
stanza.download('ru')
# Shared Stanza pipeline, used by the parsing and fact-extraction cells below.
# NOTE(review): the helpers visible in this notebook only read 'lemma', 'upos' and 'ner';
# 'depparse' may be needed only for the parse example — confirm before removing it.
nlp = stanza.Pipeline(lang='ru', processors='tokenize,pos,lemma,depparse,ner')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/olegmelnikov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json: 140kB [00:00, 7.58MB/s]                    
2021-09-06 14:42:52 INFO: Downloading default packages for language: ru (Russian)...
2021-09-06 14:42:53 INFO: File exists: /Users/olegmelnikov/stanza_resources/ru/default.zip.
2021-09-06 14:42:57 INFO: Finished downloading models and saved to /Users/olegmelnikov/stanza_resources.
2021-09-06 14:42:57 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |
| depparse  | syntagrus |
| ner       | wikiner   |

2021-09-06 14:42:57 INFO: Use device: cpu
2021-09-06 14:42:57 INFO: Loading: tokenize
2021-09-06 14:42:57 INFO: Loading: pos
2021-09-06 14:42:57 INFO: Loading: lemma

# Stanza parse example

In [3]:

# Run the pipeline on a sample requirement sentence and print the per-token annotations
# (lemma, upos, ner, ...) that the helper functions below rely on.
doc = nlp('Необходимы знания таких технологий как Python, Java, PostgreSQL')
print(doc)

[
  [
    {
      "id": 1,
      "text": "Необходимы",
      "lemma": "необходимый",
      "upos": "ADJ",
      "feats": "Degree=Pos|Number=Plur|Variant=Short",
      "head": 0,
      "deprel": "root",
      "start_char": 0,
      "end_char": 10,
      "ner": "O"
    },
    {
      "id": 2,
      "text": "знания",
      "lemma": "знание",
      "upos": "NOUN",
      "feats": "Animacy=Inan|Case=Nom|Gender=Neut|Number=Plur",
      "head": 1,
      "deprel": "nsubj",
      "start_char": 11,
      "end_char": 17,
      "ner": "O"
    },
    {
      "id": 3,
      "text": "таких",
      "lemma": "такой",
      "upos": "DET",
      "feats": "Case=Gen|Number=Plur",
      "head": 4,
      "deprel": "det",
      "start_char": 18,
      "end_char": 23,
      "ner": "O"
    },
    {
      "id": 4,
      "text": "технологий",
      "lemma": "технология",
      "upos": "NOUN",
      "feats": "Animacy=Inan|Case=Gen|Gender=Fem|Number=Plur",
      "head": 2,
      "deprel": "nmod",
      "start_char":

# Raw vacancy parsing & processing methods

In [4]:
def del_stopwords(doc):
    """Return a new list holding the tokens of ``doc`` that are not stop words or punctuation.

    A token is dropped when its ``'lemma'`` is listed in ``russian_stopwords`` or
    occurs inside ``string.punctuation`` (a substring test). ``doc`` itself is
    left untouched.

    Args:
        doc: list of token dicts, each with a ``'lemma'`` key.

    Returns:
        A new list with the surviving token dicts in their original order.
    """
    return [
        token
        for token in doc
        if not (token['lemma'] in russian_stopwords or token['lemma'] in punctuation)
    ]

def doc_to_list(doc):
    """Convert a parsed document into one flat list of token dicts.

    ``str(doc)`` is parsed as JSON and is expected to be a list of sentences,
    each sentence being a list of token dicts. Tokens of *all* sentences are
    returned in order, so multi-sentence texts are not truncated to their first
    sentence, and a document without sentences yields ``[]`` instead of raising
    ``IndexError``.

    Args:
        doc: object whose ``str()`` is the JSON dump described above
            (e.g. the result of calling the ``nlp`` pipeline).

    Returns:
        list of token dicts.
    """
    sentences = json.loads(str(doc))
    return [token for sentence in sentences for token in sentence]

def lemmatized_sentence(doc):
    """Join the ``'lemma'`` of every token in ``doc`` into one space-separated string."""
    return ' '.join(token['lemma'] for token in doc)

def process_doc(doc):
    """Turn a parsed document into a token list with stop words and punctuation removed.

    The document is first converted with ``doc_to_list`` and the resulting
    tokens are then filtered with ``del_stopwords``.
    """
    tokens = doc_to_list(doc)
    return del_stopwords(tokens)

def find_all_tech(doc):
    """Collect the distinct lemmas of tokens that look like technology names.

    A token qualifies when its ``'ner'`` tag ends with ``'MISC'`` and its
    ``'upos'`` tag is ``'PROPN'``.

    Args:
        doc: list of token dicts with ``'ner'``, ``'upos'`` and ``'lemma'`` keys.

    Returns:
        list of unique lemmas in order of first appearance. The order is
        deterministic, unlike a set-based de-duplication whose order depends on
        string hashing and can vary between runs.
    """
    matches = (
        token['lemma']
        for token in doc
        if token['ner'].endswith('MISC') and token['upos'] == 'PROPN'
    )
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(matches))

def get_lemms(doc):
    """Return the lower-cased ``'lemma'`` of every token in ``doc``, in order."""
    return [token['lemma'].lower() for token in doc]

def check_if_flexible(lemms):
    """Tell whether the lemmas mention a flexible working schedule.

    Returns ``True`` only when ``lemms`` contains at least one qualifier
    ('гибкий', 'удобный', 'свободный') and at least one schedule noun
    ('день', 'график', 'время').
    """
    qualifiers = ('гибкий', 'удобный', 'свободный')
    schedule_nouns = ('день', 'график', 'время')
    has_qualifier = any(word in lemms for word in qualifiers)
    return has_qualifier and any(word in lemms for word in schedule_nouns)

def get_specializations(lemms):
    """Count specialization keywords among the lemmas.

    Args:
        lemms: iterable of lemmas (callers pass the lower-cased output of ``get_lemms``).

    Returns:
        A four-element list ``[devops, data_science, testing, none]``: the
        first three are keyword hit counts for each group, and the last one is
        ``1`` when no keyword of any group was found, otherwise ``0``.
    """
    keyword_groups = (
        # DevOps
        ('docker', 'kubernetes', 'devops', 'развертывание', 'jenkins', 'ansible', 'ci', 'cd'),
        # Data Science ('computer' was previously misspelled and could never match)
        ('ml', 'dl', 'ds', 'machine', 'learning', 'data', 'science', 'tensorflow',
         'keras', 'pytorch', 'kaggle', 'computer', 'vision'),
        # Testing
        ('test', 'testing', 'тест', 'тестирование', 'тестировщик'),
    )
    counts = [0, 0, 0, 0]
    for lemma in lemms:
        for group_index, group in enumerate(keyword_groups):
            if lemma in group:
                counts[group_index] += 1
    # The last slot flags "no specialization keyword seen at all".
    if counts[0] + counts[1] + counts[2] == 0:
        counts[3] = 1
    return counts

def get_full_result(doc):
    """Extract all per-text facts from a parsed document.

    Returns:
        A tuple ``(techs, flexible, specs)`` built from ``find_all_tech`` on
        the filtered tokens, and ``check_if_flexible`` and
        ``get_specializations`` on their lower-cased lemmas.
    """
    tokens = process_doc(doc)
    lowered_lemmas = get_lemms(tokens)
    return (
        find_all_tech(tokens),
        check_if_flexible(lowered_lemmas),
        get_specializations(lowered_lemmas),
    )

def clear_string(s):
    """Normalize raw vacancy text before it is passed to the NLP pipeline.

    Steps:
      * remove quotes, brackets and sentence punctuation (``"«»;?!,()``);
      * replace slashes, dashes, hyphens and backslashes with a space;
      * collapse every run of whitespace (including line breaks) into a single
        space and trim the ends, so words on adjacent lines are not glued
        together and no double spaces remain;
      * lower-case Russian capital letters only; Latin letters keep their case.

    Args:
        s: raw text.

    Returns:
        The normalized string (possibly empty).
    """
    s = re.sub('["«»;?!,()]', '', s)
    s = re.sub(r'[/—\\-]', ' ', s)
    # \s+ also covers \n, \r, tabs and non-breaking spaces coming from HTML text.
    s = re.sub(r'\s+', ' ', s).strip()
    rus_upper = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
    return s.translate(str.maketrans(rus_upper, rus_upper.lower()))

def check_eng(html_doc):
    """Return ``True`` when ``html_doc`` contains no Cyrillic letter at all.

    The character class lists ``ё``/``Ё`` explicitly because they fall outside
    the ``а-я`` / ``А-Я`` ranges.
    """
    return re.search('[а-яА-ЯёЁ]', html_doc) is None

def get_lemms2(doc):
    """Return the ``'lemma'`` of every token in ``doc`` with its case left unchanged."""
    return [token['lemma'] for token in doc]

def select_spec(a):
    """Pick the vacancy sub-field from the keyword counters.

    Args:
        a: sequence whose items 0, 1 and 2 are the DevOps, Data Science and
            testing counters (as accumulated from ``get_specializations``).

    Returns:
        The first matching label, checked in this order: 'Data Science'
        (counter 1 above 1), 'Тестирование' (counter 2 above 1), 'DevOps'
        (counter 0 above 2); 'Разработка' when none matches.
    """
    # (counter index, threshold to exceed, label), evaluated in priority order
    rules = (
        (1, 1, 'Data Science'),
        (2, 1, 'Тестирование'),
        (0, 2, 'DevOps'),
    )
    for index, threshold, label in rules:
        if a[index] > threshold:
            return label
    return 'Разработка'


def _lemma_prefix(text, max_words):
    """Lemmatize ``text`` with the shared pipeline and join at most ``max_words`` lemmas."""
    words = get_lemms2(process_doc(nlp(text)))
    return ' '.join(words[:max_words])


# main html processing func, parse vacancy on simple sections
def parse_html(html_doc):
    """Split a vacancy description (HTML) into ``[title, body]`` sections.

    Two kinds of sections are produced:

    * "informal" paragraphs: the siblings that precede the heading of the
      first ``<ul>`` (or precede the last ``<p>`` when there is no ``<ul>``),
      emitted in document order with an empty title;
    * one section per ``<li>`` of every ``<ul>``, titled with the text of the
      element right before that ``<ul>``.

    Titles are cut to 5 lemmas and bodies to 10 lemmas after stop-word removal.
    Processing is best-effort: an element that cannot be processed is skipped
    without discarding what was already collected.

    Args:
        html_doc: vacancy description as an HTML string.

    Returns:
        list of ``[title, body]`` string pairs; ``[]`` for a description
        without Cyrillic letters (a notice is printed in that case).
    """
    if check_eng(html_doc):
        print("This vacancy is English")
        return []
    MAX_TITLE_LEN = 5
    MAX_BODY_LEN = 10
    soup = BeautifulSoup(html_doc, 'html.parser')

    # Locate the element that closes the informal introduction: the heading of
    # the first list, or the last paragraph when the description has no list.
    first_ul = soup.ul
    if first_ul is None:
        paragraphs = soup.find_all('p')
        # Guard against descriptions with neither <ul> nor <p> (previously an IndexError).
        anchor = paragraphs[-1] if paragraphs else None
    else:
        anchor = first_ul.find_previous_sibling('p')
        if anchor is None:
            anchor = first_ul.find_previous_sibling('strong')

    # Walk backwards from the anchor and collect the preceding siblings.
    informal_part = []
    current = anchor
    while current is not None:
        try:
            previous = current.find_previous_sibling()
            if previous is None:
                break
            text = clear_string(previous.text)
            if len(text) == 0:
                break
            informal_part.append(_lemma_prefix(text, MAX_BODY_LEN))
        except Exception:
            # Best effort: stop at the first unprocessable element but keep
            # the paragraphs collected so far.
            break
        current = previous
    informal_part.reverse()  # restore document order
    result = [['', paragraph] for paragraph in informal_part]

    # Every <ul> becomes a group of sections titled by the element before it.
    for ul in soup.find_all('ul'):
        try:
            title = _lemma_prefix(clear_string(ul.find_previous_sibling().text), MAX_TITLE_LEN)
        except Exception:
            # No usable heading (e.g. the list has no previous sibling): skip the list.
            continue
        for li in ul.find_all('li'):
            try:
                body = _lemma_prefix(clear_string(li.text), MAX_BODY_LEN)
            except Exception:
                # Skip only the failing item instead of dropping the whole list.
                continue
            result.append([title, body])
    return result

# Section classifier

In [5]:
# Load the labelled training set; the first CSV column is used as the index.
# NOTE(review): the path is relative to the notebook's working directory — confirm the file ships with it.
df_train = pd.read_csv('train_dataset_201.csv', index_col=0)
df_train_numpy = df_train.to_numpy()
# First remaining column: input texts (presumably section titles, since classify_section()
# is called with a title — confirm). Second column: integer class labels.
x = df_train_numpy[:, 0]
y = df_train_numpy[:, 1].astype('int')

# Bag-of-words counts -> TF-IDF weighting -> distance-weighted 6-nearest-neighbours classifier.
pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', KNeighborsClassifier(n_neighbors = 6,weights = 'distance',algorithm = 'brute'))])
# Fitted pipeline used by classify_section().
model_knn = pipe.fit(x, y)

def classify_section(title):
    """Predict the class label of a single section title with the fitted ``model_knn``."""
    predictions = model_knn.predict([title])
    return predictions[0]

# Facts extraction

In [6]:
def extract_facts_from_vacancy_sections(sections, key_skills):
    """Aggregate the facts extracted from every parsed section of a vacancy.

    Args:
        sections: iterable of ``[title, text]`` pairs (as produced by ``parse_html``).
        key_skills: iterable of skill names that are always treated as required.

    Returns:
        dict with the keys 'Обязательные компетенции' (required skills),
        'Желательные компетенции' (desired skills), 'Гибкий график работы'
        (bool) and 'Подобласть' (label chosen by ``select_spec``). Skill lists
        are de-duplicated in order of first appearance, so the output is
        reproducible between runs.
    """
    required = []
    desired = []
    flexible = False
    spec_counts = [0, 0, 0, 0]
    for section in sections:
        title, text = section[0], section[1]
        # A blank body carries no facts; skipping it avoids feeding empty text
        # to the NLP pipeline.
        if not text.strip():
            continue
        target = classify_section(title)
        techs, is_flexible, counts = get_full_result(nlp(text))
        if target == 1:    # class 1 feeds the required-skills list
            required += techs
        if target == 2:    # class 2 feeds the desired-skills list
            desired += techs
        if is_flexible:
            flexible = True
        for j in range(4):
            spec_counts[j] += counts[j]

    # dict.fromkeys de-duplicates while keeping insertion order.
    required = list(dict.fromkeys(required + list(key_skills or [])))
    desired = list(dict.fromkeys(desired))
    return {
        'Обязательные компетенции': required,
        'Желательные компетенции': desired,
        'Гибкий график работы': flexible,
        'Подобласть': select_spec(spec_counts),
    }

In [7]:
# main function to call
def extract_facts_from_vacancy(vacancy_id):
    """Download a vacancy from the hh.ru API and extract facts from it.

    Args:
        vacancy_id: identifier of the vacancy on hh.ru.

    Returns:
        The dict produced by ``extract_facts_from_vacancy_sections``.

    Raises:
        requests.HTTPError: when the API answers with an error status
            (e.g. unknown vacancy id).
        requests.Timeout: when the API does not answer within 10 seconds.
    """
    url = f'https://api.hh.ru/vacancies/{vacancy_id}'
    # A timeout keeps the notebook from hanging on a stalled connection.
    response = requests.get(url, timeout=10)
    # Fail with a clear HTTP error instead of a KeyError on an error payload.
    response.raise_for_status()
    vacancy = response.json()
    sections = parse_html(vacancy['description'])
    key_skills = [skill['name'] for skill in vacancy.get('key_skills') or []]
    return extract_facts_from_vacancy_sections(sections, key_skills)

# Demonstration of the model's results

In [8]:
# Put the ID of any vacancy published on hh.ru here
vacancy_id = 40612238

# Fetch the vacancy and print the facts extracted from it
print(extract_facts_from_vacancy(vacancy_id))

{'Обязательные компетенции': ['Mysql', 'PostgreSQL', 'Flask', 'Python', 'FastApi', 'Tornado'], 'Желательные компетенции': ['Go'], 'Гибкий график работы': False, 'Подобласть': 'Разработка'}
