## Requirements Deep Dive


In [1]:
import re
from collections import Counter
from itertools import combinations, product
import psycopg2
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs

import string
import nltk
from nltk import FreqDist, tokenize
from nltk.corpus import stopwords
from nltk.collocations import *

from nbstyler import DATA_STYLE as DS

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

%matplotlib notebook
%matplotlib inline

### Data preparation

In [2]:
# Load the full offer history into a DataFrame indexed by submission date.
data_querystr = """SELECT * FROM data_offers.do_full_offer_history"""
conn = psycopg2.connect('dbname=jobsbg')
try:
    data_df = pd.read_sql_query(data_querystr, conn, index_col='subm_date')
finally:
    # Release the connection even if the query or the frame construction fails.
    conn.close()

In [3]:
# Preview the first five rows (the default for `head`).
data_df.head()

Unnamed: 0_level_0,subm_type,job_id,company_id,norm_salary,job_title,company_name,text_salary,job_location,job_contents
subm_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-07-05,resubmission,4416332,179347,,Data Analyst,Технементалс Технолоджис (България) ЕАД,,София,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2019-09-12,submission,5045017,67058,,Experienced ETL Developer,Adastra Bulgaria Ltd.,,София,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2018-04-27,submission,4296771,116269,,REPORTING ANALYST (EXCEL/ SQL),Sales Scout/ Нова Лоджик Къмпани ООД,,София,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2019-09-13,submission,5046729,67058,,Experienced ETL Developer,Adastra Bulgaria Ltd.,,Пловдив,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2019-02-28,submission,4739592,52527,,Data Warehouse Developer,Менпауър България ООД,,София,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."


In [4]:
def check_archive_content(job_id):
    """Return the archived contents stored for a single job offer.

    Runs the database function ``f_get_offer_contents_by_job_id`` against the
    ``jobsbg`` database and returns the first column of the first row.

    Args:
        job_id: Identifier of the offer; passed to the database as a bound
            query parameter rather than being formatted into the SQL text.

    Returns:
        The first column of the first result row, or ``None`` when the
        function returns no rows.
    """
    conn = psycopg2.connect('dbname=jobsbg')
    try:
        # The cursor context manager closes the cursor on exit, also on error.
        with conn.cursor() as cur:
            cur.execute(
                'SELECT * FROM f_get_offer_contents_by_job_id(%s)',
                (job_id,),
            )
            result = cur.fetchone()
    finally:
        # Always release the connection, including when the query fails.
        conn.close()
    # `fetchone()` yields None when there is no row; avoid indexing into it.
    return result[0] if result is not None else None


def _extract_job_text(job_contents):
    """Return the lower-cased text of the job contents table in an offer page.

    The table is located by finding the first ``td`` with class
    ``explainGray`` and walking five levels up the tree. All ``<style>`` tags
    inside it are removed before the text is extracted, so injected CSS does
    not end up in the word counts.

    Args:
        job_contents: HTML source of the offer page.

    Returns:
        The text of the job contents table, lower-cased.

    Raises:
        AttributeError: If the page has no ``td.explainGray`` element
            (``find`` returns ``None`` in that case).
    """
    contents_soup = bs(job_contents, 'html.parser')
    job_container = contents_soup.find('td', {'class': 'explainGray'}).parent.parent.parent.parent.parent
    # Remove every <style> tag, not just the first one.
    for style_tag in job_container.find_all('style'):
        style_tag.decompose()
    # NOTE(review): the token counts further down contain JS-looking fragments
    # (e.g. 'csrf_token', '.is checked'); <script> tags may need the same
    # treatment - confirm before changing, as it alters the results.
    return job_container.get_text().lower()


# Superseded by `build_nltk_tokenlist()`, which feeds the bigram and trigram counts.
def build_counter(job_contents):
    """Count individual word occurrences in one offer.

    Args:
        job_contents: HTML source of the offer page.

    Returns:
        A ``collections.Counter`` mapping each word (runs of word characters
        and apostrophes) to the number of times it occurs.
    """
    job_wordlist = re.findall(r"['\w']+", _extract_job_text(job_contents))
    return Counter(job_wordlist)


def build_nltk_tokenlist(job_contents, stopwords):
    """Tokenize one offer with NLTK and drop stop words and pure numbers.

    Args:
        job_contents: HTML source of the offer page.
        stopwords: Container of lower-case tokens to exclude. Inside this
            function the name shadows the ``nltk.corpus.stopwords`` import;
            the parameter name is kept for compatibility with callers.

    Returns:
        A list of lower-case tokens in document order, excluding anything in
        ``stopwords`` and tokens made up only of digits.
    """
    job_text = _extract_job_text(job_contents)
    # The text is already lower-cased, so tokens need no further lowering.
    # Filter with the `stopwords` argument instead of a notebook global that
    # may not be defined yet when this function is called.
    return [
        word
        for word in tokenize.word_tokenize(job_text)
        if word not in stopwords and not word.isdigit()
    ]


def calc_nltk_bigrams_fdist(wlist):
    """Build a frequency distribution of the adjacent token pairs in `wlist`.

    Args:
        wlist: Sequence of tokens.

    Returns:
        An ``nltk.FreqDist`` over the bigrams produced by ``nltk.bigrams``.
    """
    return nltk.FreqDist(nltk.bigrams(wlist))


def calc_nltk_trigrams_fdist(wlist):
    """Build a frequency distribution of the adjacent token triples in `wlist`.

    Args:
        wlist: Sequence of tokens.

    Returns:
        An ``nltk.FreqDist`` over the trigrams produced by ``nltk.trigrams``.
    """
    return nltk.FreqDist(nltk.trigrams(wlist))

#### Counting individual word occurrences (deprecated)

1. Get soup
2. Get `.explainGray` and traverse up the tree 5 times. This is the job contents table.
3. Remove any injected styles from the job contents.

In [5]:
# Raw HTML of the sample offer at row position 10.
data_df['job_contents'].iloc[10]

' <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<title>Jobs.bg - BI Test Analyst, София, обява за работа от EXPERIAN BULGARIA EAD</title>\n<base href="https://www.jobs.bg/"/>\n<meta content="default-src \'self\' \'unsafe-inline\' \'unsafe-eval\' http://*.jobs.bg https://*.jobs.bg wss://im.jobs.bg http://www.box.bg https://www.box.bg http://www.google-analytics.com https://www.google-analytics.com http://i.newsroom.bg https://i.newsroom.bg https://*.youtube.com http://*.youtube.com http://*.googleapis.com https://*.googleapis.com http://*.gstatic.com https://*.gstatic.com;" http-equiv="Content-Security-Policy"/>\n<meta content="https://www.jobs.bg/assets/logo/2016-12-14/b_0b8142fb651e48a1750b4898cd22cc6b.png" property="og:image"/>\n<meta content="1200" property="og:image:width"/>\n<meta content="572" property="og:image:height"/>\n<meta content="About Experi

In [6]:
# Parse the sample offer at row position 10 to inspect its structure.
instance5 = data_df['job_contents'].iloc[10:11].values
instance5_str = instance5[0]
instance5_soup = bs(instance5_str, 'html.parser')
# The job contents table sits five levels above the `td.explainGray` cell.
job_container_table = (
    instance5_soup
    .find('td', {'class': 'explainGray'})
    .parent.parent.parent.parent.parent
)

In [7]:
# remove style tag and its contents
#job_container_table.find('style').decompose()


4. Get only the text of the tags
5. Lower and split to prepare the list for the Counter
6. Check result

In [8]:
# Lower-cased text of the sample job contents table (all tag text concatenated).
job_text = job_container_table.get_text().lower()

#### Calculate common words, bigrams and trigrams

In [9]:
# Bulgarian stop words plus site boilerplate, punctuation glyphs and markup or
# script residue that shows up in the offer pages.
stopwords_bg = [
    'за', 'на', 'в', 'и', 'с', 'със', 'не', 'да', 'без', 'по', 'от', 'или', 'обяви', 'принтирай', 'обява', '’', '–',
    'обявата', 'работно', 'време', 'всички', 'тази', 'разглеждания', 'проблем', 'визитка', '--', '•',
    'добави', 'моите', 'известия', 'запази', 'бележника', 'известие', 'фирма/организация',
    'българия', 'известие', 'месторабота', 'постоянна', 'търсеща', 'служители', 'компанията',
    'company_privacy_consent', 'подходяща', 'cv', 'us', 'оод', 'eood', 'еоод', '\uf451вижте',
    'софия', 'пълно', 'name=', 'considered', 'директно', 'безплатнa', 'услугa', 'лични',
    '★', '★★', '★★★', '★★★★', '★★★★★', 'name^=', 'err_job', 'var', 'bulgaria', 'ref.', 'contacted',
    'consent_error_privacy', 'consent_error_job', '.hide', '.show', 'err_privacy', 'return', 'shortlisted',
    'document.frmconsent.submit', 'съгласен', 'въпросите', 'съгласие', 'please', 'работа',
]

# Combined stop set: English stop words, the list above, and each ASCII
# punctuation character.
stoplist = set(stopwords.words('english'))
stoplist.update(stopwords_bg)
stoplist.update(string.punctuation)

In [10]:
%%time

token_contents = [build_nltk_tokenlist(jc, stoplist) for jc in data_df.job_contents]

CPU times: user 1min 4s, sys: 37.7 ms, total: 1min 4s
Wall time: 1min 4s


In [11]:
%%time

data_bigrams = [calc_nltk_bigrams_fdist(tc) for tc in token_contents]
result_bigrams = sum(data_bigrams, FreqDist())

CPU times: user 3min 5s, sys: 63 ms, total: 3min 5s
Wall time: 3min 6s


In [12]:
%%time

data_trigrams = [calc_nltk_trigrams_fdist(tc) for tc in token_contents]
result_trigrams = sum(data_trigrams, FreqDist())

CPU times: user 4min 13s, sys: 175 ms, total: 4min 13s
Wall time: 4min 15s


In [13]:
%%time

data_words = [FreqDist(tc) for tc in token_contents]
result_words = sum(data_words, FreqDist())


CPU times: user 22.2 s, sys: 16.4 ms, total: 22.2 s
Wall time: 22.3 s


In [14]:
# Ten most frequent single tokens across all offers.
result_words.most_common(10)


[('data', 25008),
 ('business', 10759),
 ('experience', 9419),
 ('team', 7502),
 ('нови', 7020),
 ('skills', 6800),
 ('work', 6295),
 ("''", 5740),
 ('company', 5229),
 ('development', 4925)]

In [15]:
# Ten most frequent bigrams across all offers.
result_bigrams.most_common(10)

[(('нови', 'повече'), 2944),
 (("'input", "''"), 2630),
 (('business', 'intelligence'), 2387),
 (('data', 'analyst'), 1970),
 (('personal', 'data'), 1934),
 (('b', 'eye'), 1372),
 (('communication', 'skills'), 1326),
 (('short-listed', 'candidates'), 1220),
 (('data', 'warehouse'), 1117),
 (('false', "'input"), 1008)]

In [16]:
# Ten most frequent trigrams across all offers.
result_trigrams.most_common(10)


[(('false', "'input", "''"), 1008),
 (("''", '.is', 'checked'), 1008),
 (('.is', 'checked', 'true'), 1008),
 (("'input", "''", "''"), 1008),
 (('кандидати', 'малък', 'опит'), 918),
 (('treated', 'strict', 'confidentiality'), 817),
 (('applications', 'treated', 'strict'), 708),
 (('b', 'eye', 'ltd.'), 666),
 (("'input", "''", 'csrf_token'), 614),
 (("''", 'csrf_token', "''"), 614)]

### Identify Requirement Terms

The words or phrases we need for the chord chart can be defined using the three counters presented above. Here is an attempt:

In [17]:
# Requirement terms to look for: plain strings are single tokens, tuples are
# two-token phrases.
candidate_terms = [
    # single-token terms
    'excel',
    'tableau',
    'access',
    'qlik',
    'hadoop',
    'informatica',
    'vmware',
    'ssis',
    'vba',
    'python',
    'powerpoint',
    'mysql',
    'spark',
    'microstrategy',
    'deluge',
    'ssrs',
    # two-token terms
    ('sql', 'server'),
    ('power', 'bi'),
    ('ms', 'office'),
    ('microsoft', 'office'),
]

In [18]:
# Render the contents of the project stylesheet as HTML output
# (presumably a <style> block that themes the notebook - confirm).
from IPython.display import HTML  # documented public location of `HTML`

# Read with an explicit encoding so the result does not depend on the locale.
with open('../resources/styles/datum.css', 'r', encoding='utf-8') as f:
    style = f.read()
HTML(style)

### Resources:

- https://nlp.stanford.edu/fsnlp/promo/colloc.pdf
- http://www.nltk.org/index.html