## Requirements Deep Dive


In [1]:
import re
from collections import Counter
from itertools import combinations, product
import psycopg2
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs

import string
import nltk
from nltk import FreqDist, tokenize
from nltk.corpus import stopwords
from nltk.collocations import *

from nbstyler import DATA_STYLE as DS

# Association-measure helpers from NLTK for scoring bigram / trigram collocations.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# NOTE(review): two matplotlib backends are selected back to back; presumably only the
# later `inline` selection stays in effect — confirm whether `notebook` is still needed.
%matplotlib notebook
%matplotlib inline

### Data preparation

In [2]:
# Load every row of the `v_full_data_offers_history` view into a DataFrame
# indexed by the `subm_date` column.
data_querystr = """SELECT * FROM v_full_data_offers_history"""
conn = psycopg2.connect('dbname=jobsbg')
try:
    data_df = pd.read_sql_query(data_querystr, conn, index_col='subm_date')
finally:
    # Release the connection even when the query or the DataFrame build fails.
    conn.close()

In [3]:
# Preview the first five rows of the loaded offers.
data_df.head(5)

Unnamed: 0_level_0,subm_type,job_id,company_id,norm_salary,job_title,company_name,text_salary,job_contents
subm_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-09-27,submission,3994437,124912,,Data Analyst,ПрайсуотърхаусКупърс Одит ООД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-27,submission,3994555,67058,,ETL Developer,Adastra Bulgaria Ltd.,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-27,submission,3994824,10839,,Senior and Junior Business Intelligence Analys...,Кодикс България ЕАД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-27,submission,3995044,144752,,BI Консултант,БИЗЛИНК ООД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-28,submission,3996312,204212,,Business Intelligence Analyst,ДОПАМИН ЕООД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."


In [5]:
def check_archive_content(job_id):
    """Fetch the stored offer contents for one job id from the ``jobsbg`` database.

    Calls the SQL function ``f_get_offer_contents_by_job_id`` and returns the
    first column of the first row it yields.

    Args:
        job_id: Identifier handed to the SQL function. It is sent as a bound
            query parameter instead of being interpolated into the SQL text.

    Returns:
        The first column of the first result row, or ``None`` when the
        function returns no rows.
    """
    conn = psycopg2.connect('dbname=jobsbg')
    try:
        # The cursor context manager closes the cursor on exit, error or not.
        with conn.cursor() as cur:
            cur.execute('SELECT * FROM f_get_offer_contents_by_job_id(%s)', (job_id,))
            result = cur.fetchone()
    finally:
        # Always release the connection, even if the query raised.
        conn.close()
    return result[0] if result is not None else None


# Deprecated word-count approach; the NLTK-based helpers below
# (`build_nltk_tokenlist()` and the n-gram FreqDist functions) handle bigrams and n-grams.
def build_counter(job_contents):
    """Count word occurrences in the text of a single job offer page.

    Args:
        job_contents: HTML source of the offer page.

    Returns:
        A ``Counter`` mapping each lowercased word (runs of word characters
        and apostrophes) to the number of times it occurs.
    """
    soup = bs(job_contents, 'html.parser')

    # Start at the `explainGray` cell and climb five ancestors to the offer container.
    container = soup.find('td', {'class': 'explainGray'})
    for _ in range(5):
        container = container.parent

    # Drop the first <style> element so its contents do not end up in the text.
    style_tag = container.find('style')
    if style_tag is not None:
        style_tag.decompose()

    lowered_text = container.get_text().lower()
    return Counter(re.findall(r"['\w']+", lowered_text))


def build_nltk_tokenlist(job_contents, stopwords):
    """Tokenize the text of a single job offer page and filter out noise tokens.

    Args:
        job_contents: HTML source of the offer page.
        stopwords: Container of lowercase tokens to discard (a ``set`` gives
            the fastest membership checks). Inside this function the name
            shadows the module-level ``nltk.corpus.stopwords`` import.

    Returns:
        List of lowercase tokens, in document order, that are neither in
        ``stopwords`` nor made up only of digits.
    """
    contents_soup = bs(job_contents, 'html.parser')
    # The offer container is five ancestors above the `explainGray` cell.
    job_container = contents_soup.find('td', {'class': 'explainGray'}).parent.parent.parent.parent.parent
    # Drop the first <style> element so its contents do not end up in the text.
    style_tag = job_container.find('style')
    if style_tag is not None:
        style_tag.decompose()
    # Lowercase once up front; the tokens produced below are therefore already lowercase.
    job_text = job_container.get_text().lower()
    # Filter against the `stopwords` argument rather than the notebook-global `stoplist`.
    return [
        word
        for word in tokenize.word_tokenize(job_text)
        if word not in stopwords and not word.isdigit()
    ]


def calc_nltk_bigrams_fdist(wlist):
    """Return an ``nltk.FreqDist`` counting the adjacent token pairs in ``wlist``."""
    return nltk.FreqDist(nltk.bigrams(wlist))


def calc_nltk_trigrams_fdist(wlist):
    """Return an ``nltk.FreqDist`` counting the adjacent token triples in ``wlist``."""
    return nltk.FreqDist(nltk.trigrams(wlist))

#### Counting individual word occurrences (deprecated)

1. Get soup
2. Get `.explainGray` and traverse up the tree 5 times. This is the job contents table.
3. Remove any injected styles from the job contents.

In [6]:
# Use the offer at row position 10 as a worked example; the one-row slice
# yields an array whose single element is taken on the next line.
instance5 = data_df.iloc[10:11].job_contents.values
instance5_str = instance5[0]
instance5_soup = bs(instance5_str, 'html.parser')
# Find the `explainGray` cell and climb five ancestors to the job contents table.
job_container_table = instance5_soup.find('td', {'class': 'explainGray'}).parent.parent.parent.parent.parent

In [7]:
# Remove the style tag and its contents. Guard against offers that carry no
# <style> element, the same way the helper functions do.
injected_style = job_container_table.find('style')
if injected_style is not None:
    injected_style.decompose()


4. Get only the text of the tags
5. Lower and split to prepare the list for the Counter
6. Check result

In [8]:
# Extract the text of the job contents table and lowercase it.
job_text = job_container_table.get_text().lower()

#### Calculate common words, bigrams and trigrams

In [9]:
# Extra tokens to drop on top of NLTK's English stop words: Bulgarian function
# words, typographic glyphs, and strings that look like job-site boilerplate or
# form/script fragments (e.g. 'document.frmconsent.submit') — presumably picked
# by inspecting earlier frequency output.
# NOTE: 'известие' is listed twice; harmless, because the combined list is
# turned into a set right after.
stopwords_bg = ['за', 'на', 'в', 'и', 'с', 'със', 'не', 'да', 'без', 'по', 'от', 'или', 'обяви', 'принтирай', 'обява', '’', '–',
               'обявата', 'работно', 'време', 'всички', 'тази', 'разглеждания', 'проблем', 'визитка', '--', '•',
               'добави', 'моите', 'известия', 'запази', 'бележника', 'известие', 'фирма/организация',
               'българия', 'известие', 'месторабота', 'постоянна', 'търсеща', 'служители', 'компанията',
                'company_privacy_consent', 'подходяща', 'cv', 'us', 'оод', 'eood', 'еоод', '\uf451вижте',
                'софия', 'пълно', 'name=', 'considered', 'директно', 'безплатнa', 'услугa', 'лични',
                '★', '★★', '★★★', '★★★★', '★★★★★', 'name^=', 'err_job', 'var', 'bulgaria', 'ref.', 'contacted',
                'consent_error_privacy', 'consent_error_job', '.hide', '.show', 'err_privacy', 'return', 'shortlisted',
                'document.frmconsent.submit', 'съгласен', 'въпросите', 'съгласие', 'please', 'работа',
               ]
# Final stop set: NLTK English stop words, the `stopwords_bg` entries and ASCII punctuation.
stoplist = set(stopwords.words('english')) | set(stopwords_bg) | set(string.punctuation)

In [10]:
%%time

# One filtered token list per offer, in the same order as `data_df`.
token_contents = [
    build_nltk_tokenlist(offer_html, stoplist)
    for offer_html in data_df.job_contents
]

CPU times: user 27.7 s, sys: 82.3 ms, total: 27.8 s
Wall time: 27.8 s


In [11]:
%%time

# Per-offer bigram frequency distributions.
data_bigrams = [calc_nltk_bigrams_fdist(tc) for tc in token_contents]

# Merge into one distribution in place. `sum(..., FreqDist())` builds a new
# counter on every addition, copying the growing accumulator each time;
# `update` adds the counts to the existing accumulator instead.
result_bigrams = FreqDist()
for offer_bigrams in data_bigrams:
    result_bigrams.update(offer_bigrams)

CPU times: user 42.7 s, sys: 17.3 ms, total: 42.7 s
Wall time: 42.8 s


In [12]:
%%time

# Per-offer trigram frequency distributions.
data_trigrams = [calc_nltk_trigrams_fdist(tc) for tc in token_contents]

# Merge into one distribution in place. `sum(..., FreqDist())` builds a new
# counter on every addition, copying the growing accumulator each time;
# `update` adds the counts to the existing accumulator instead.
result_trigrams = FreqDist()
for offer_trigrams in data_trigrams:
    result_trigrams.update(offer_trigrams)

CPU times: user 56.4 s, sys: 24.5 ms, total: 56.4 s
Wall time: 56.5 s


In [13]:
%%time

# Per-offer single-token frequency distributions.
data_words = [FreqDist(tc) for tc in token_contents]

# Merge into one distribution in place. `sum(..., FreqDist())` builds a new
# counter on every addition, copying the growing accumulator each time;
# `update` adds the counts to the existing accumulator instead.
result_words = FreqDist()
for offer_words in data_words:
    result_words.update(offer_words)


CPU times: user 6.07 s, sys: 69 µs, total: 6.07 s
Wall time: 6.07 s


In [30]:
# Ten most frequent single tokens across all offers.
result_words.most_common(10)


[('data', 11351),
 ('business', 5373),
 ('experience', 4449),
 ('team', 3552),
 ('нови', 3278),
 ('skills', 3137),
 ('work', 2953),
 ('company', 2502),
 ('development', 2331),
 ('bi', 2281)]

In [32]:
# Ten most frequent bigrams across all offers.
result_bigrams.most_common(10)

[(('нови', 'повече'), 1374),
 (('business', 'intelligence'), 1274),
 (('data', 'analyst'), 1150),
 (('personal', 'data'), 913),
 (('data', 'warehouse'), 677),
 (('b', 'eye'), 660),
 (("'input", "''"), 620),
 (('communication', 'skills'), 616),
 (('short-listed', 'candidates'), 606),
 (('strict', 'confidentiality'), 460)]

In [34]:
# Ten most frequent trigrams across all offers.
result_trigrams.most_common(10)


[(('treated', 'strict', 'confidentiality'), 401),
 (('кандидати', 'малък', 'опит'), 359),
 (('applications', 'treated', 'strict'), 340),
 (('b', 'eye', 'ltd.'), 330),
 (('false', "'input", "''"), 310),
 (("''", '.is', 'checked'), 310),
 (('.is', 'checked', 'true'), 310),
 (("'input", "''", "''"), 310),
 (('candidates', 'applications', 'treated'), 260),
 (('си', 'екс', 'джи'), 252)]

### Identify Requirement Terms

The words or phrases we need for the chord chart can be defined using the three counters presented above. Here is an attempt:

In [17]:
# Hand-picked requirement terms: single-word terms are plain strings, two-word
# terms are tuples of tokens (presumably so they can be looked up as bigram
# keys — confirm against the code that consumes this list).
candidate_terms = [
    'excel', 'tableau', 'access', 'qlik', 'hadoop', 'informatica', 'vmware', 'ssis', 'vba', 'python', 'powerpoint', 'mysql',
    'spark', 'microstrategy', 'deluge', 'ssrs', ('sql', 'server'), ('power', 'bi'), ('ms', 'office'), ('microsoft', 'office'), ]

In [1]:
# Read the notebook stylesheet from disk and render its contents as HTML output.
# NOTE(review): this import sits outside the top import cell, and `IPython.display`
# is the documented public path for `HTML` — consider moving/updating it.
from IPython.core.display import HTML
# The path is relative to the kernel's working directory.
with open('../resources/styles/datum.css', 'r') as f:
    style = f.read()
HTML(style)

### Resources:

- https://nlp.stanford.edu/fsnlp/promo/colloc.pdf
- http://www.nltk.org/index.html