## Finance Requirements Deep Dive


In [1]:
import re
from collections import Counter
import psycopg2
import pandas as pd
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from IPython import display
from bs4 import BeautifulSoup as bs

import nltk
import string
from nltk.collocations import *
from nltk import FreqDist, tokenize
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

from nltk.corpus import stopwords

from nbstyler import DATA_STYLE as s

plotly.offline.init_notebook_mode(connected=True) # run at the start of every ipython notebook to use plotly.offline

%matplotlib notebook
%matplotlib inline

### Data preparation

In [2]:
data_querystr = """SELECT * FROM v_full_finance_offers_history"""
conn = psycopg2.connect('dbname=jobsbg')
data_df = pd.read_sql_query(data_querystr, conn, index_col='subm_date')
conn.close()

In [3]:
data_df.head(5)


Unnamed: 0_level_0,subm_type,job_id,company_id,norm_salary,job_title,company_name,text_salary,job_contents
subm_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-09-27,submission,3994985,197145,,Finance Application Support Analyst (Accountin...,ЛУИ ДРАЙФУС КОМОДИТИС СЪРВИСИЗ БЪЛГАРИЯ ЕООД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-28,submission,3996359,4310,,JUNIOR ACCOUNTANT - Structured Finance specialist,ТМФ СЪРВИСИЗ ЕООД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-28,submission,3996773,239646,,"Senior Project Manager, Finance Practice",DataArt Bulgaria Ltd,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-29,submission,3999032,1144,,Finance Reporting Expert,Telenor Bulgaria EAD,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-10-02,submission,4000578,144532,,KYC & Finance Specialist,БУЛАНЕТ ЕООД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."


In [4]:
def check_archive_content(job_id):
    conn = psycopg2.connect('dbname=jobsbg')
    cur = conn.cursor()
    query = f'SELECT * FROM f_get_offer_contents_by_job_id({job_id})'
    cur.execute(query)
    result = cur.fetchone()
    cur.close()
    conn.close()
    return result[0] 

# moved on to `build_nltk_freqdist()` to tackle bigrams and n-grams.
def build_counter(job_contents):
    contents_soup = bs(job_contents, 'html.parser')
    job_container = contents_soup.find('td', {'class': 'explainGray'}).parent.parent.parent.parent.parent
    if  job_container.find('style'):
        job_container.find('style').decompose()
    job_text = job_container.get_text().lower()
    job_wordlist = re.findall(r"['\w']+", job_text)
    counter = Counter(job_wordlist)
    return counter

def build_nltk_tokenlist(job_contents, stopwords):
    contents_soup = bs(job_contents, 'html.parser')
    job_container = contents_soup.find('td', {'class': 'explainGray'}).parent.parent.parent.parent.parent
    if  job_container.find('style'):
        job_container.find('style').decompose()
    job_text = job_container.get_text().lower() 
    nltk_ready = [word.lower() for word in tokenize.word_tokenize(job_text) if word.lower() not in stoplist and not word.isdigit()]
    return nltk_ready

def calc_nltk_bigrams_fdist(wlist):
    bigrams = nltk.bigrams(wlist)
    fdist = nltk.FreqDist(bigrams)
    return fdist
    
def calc_nltk_trigrams_fdist(wlist):
    trigrams = nltk.trigrams(wlist)
    fdist = nltk.FreqDist(trigrams)
    return fdist    

#### Counting individual word occurrencies (depr)

1. Get soup
2. Get `.explainGray` and traverse up the tree 5 times. This is the job contents table.
3. Remove any injected styles from the job contents.

In [5]:
instance5 = data_df.iloc[10:11].job_contents.values
instance5_str = instance5[0]
instance5_soup = bs(instance5_str, 'html.parser')
job_container_table = instance5_soup.find('td', {'class': 'explainGray'}).parent.parent.parent.parent.parent

In [6]:
# remove style tag and its contents
job_container_table.find('style').decompose()


AttributeError: 'NoneType' object has no attribute 'decompose'

4. Get only the text of the tags
5. Lower and split to prepare the list for the Counter
6. Check result

In [7]:
job_text = job_container_table.get_text().lower()

#### Calculate common words, bigrams and trigrams

In [8]:
stopwords_bg = ['за', 'на', 'в', 'и', 'с', 'със', 'не', 'да', 'без', 'по', 'обяви', 'принтирай', 'обява',
               'обявата', 'работно', 'време', 'всички', 'тази', 'разглеждания', 'проблем', 'визитка', '--', 
               'добави', 'моите', 'известия', 'запази', 'бележника']
stoplist = stopwords.words('english') + stopwords_bg + list(string.punctuation)
stoplist = set(stoplist)

In [9]:
%%time
token_contents = [build_nltk_tokenlist(jc, stoplist) for jc in data_df.job_contents]

CPU times: user 8.9 s, sys: 0 ns, total: 8.9 s
Wall time: 8.92 s


In [10]:
%%time

data_bigrams = [calc_nltk_bigrams_fdist(tc) for tc in token_contents]
result_bigrams = sum(data_bigrams, FreqDist())

CPU times: user 7.5 s, sys: 95.1 ms, total: 7.6 s
Wall time: 7.6 s


In [11]:
result_bigrams.most_common(100)

[(('известие', 'нови'), 1084),
 (('месторабота', 'софия'), 502),
 (('работа', 'пълно'), 489),
 (('постоянна', 'работа'), 470),
 (('нови', 'фирма/организация'), 419),
 (('фирма/организация', 'директно'), 419),
 (('директно', 'търсеща'), 419),
 (('търсеща', 'служители'), 419),
 (('служители', 'повече'), 419),
 (('софия', 'постоянна'), 400),
 (('българия', 'еоод'), 367),
 (('пълно', 'известие'), 305),
 (('finance', 'manager'), 219),
 (('candidates', 'contacted'), 218),
 (('нови', 'finance'), 217),
 (('еоод', 'известие'), 208),
 (('university', 'degree'), 199),
 (('bulgaria', 'ead'), 181),
 (('finance', 'accounting'), 181),
 (('инграм', 'микро'), 180),
 (('микро', 'есесси'), 180),
 (('есесси', 'емеа'), 180),
 (('емеа', 'еоод'), 180),
 (('short-listed', 'candidates'), 177),
 (('skills', '•'), 172),
 (('communication', 'skills'), 169),
 (('\uf451вижте', 'компанията'), 168),
 (('компанията', 'месторабота'), 167),
 (('bulgaria', 'eood'), 167),
 (('•', 'excellent'), 163),
 (('подходяща', 'канди

In [64]:
%%time

data_trigrams = [calc_nltk_trigrams_fdist(tc) for tc in token_contents]
result_trigrams = sum(data_trigrams, FreqDist())

CPU times: user 1min, sys: 3.39 ms, total: 1min
Wall time: 1min


In [65]:
%%time

data_words = [FreqDist(tc) for tc in token_contents]
result_words = sum(data_words, FreqDist())


CPU times: user 6.07 s, sys: 1 µs, total: 6.07 s
Wall time: 6.08 s


In [74]:
result_words.most_common(150)

[('data', 11119),
 ('•', 7483),
 ('business', 5251),
 ('experience', 4354),
 ('team', 3501),
 ('нови', 3220),
 ('известие', 3184),
 ('skills', 3081),
 ('work', 2912),
 ('company', 2466),
 ('development', 2274),
 ('bi', 2251),
 ('bulgaria', 2185),
 ('english', 2177),
 ('analyst', 2137),
 ('reporting', 2124),
 ('работа', 1915),
 ('knowledge', 1889),
 ('’', 1783),
 ('requirements', 1744),
 ('management', 1703),
 ('excellent', 1687),
 ('месторабота', 1598),
 ('пълно', 1598),
 ('кандидатствай', 1598),
 ('повече', 1590),
 ('еоод', 1565),
 ('постоянна', 1563),
 ('working', 1535),
 ('sql', 1506),
 ('analysis', 1497),
 ('софия', 1492),
 ('solutions', 1478),
 ("''", 1473),
 ('services', 1418),
 ('support', 1397),
 ('intelligence', 1397),
 ('information', 1393),
 ('new', 1374),
 ('служители', 1374),
 ('–', 1372),
 ('environment', 1360),
 ('фирма/организация', 1348),
 ('директно', 1348),
 ('търсеща', 1348),
 ('ability', 1348),
 ('personal', 1344),
 ('tools', 1317),
 ('българия', 1249),
 ('us', 123

In [73]:
result_bigrams.most_common(150)


[(('известие', 'нови'), 3184),
 (('работа', 'пълно'), 1578),
 (('постоянна', 'работа'), 1563),
 (('месторабота', 'софия'), 1451),
 (('нови', 'фирма/организация'), 1348),
 (('фирма/организация', 'директно'), 1348),
 (('директно', 'търсеща'), 1348),
 (('търсеща', 'служители'), 1348),
 (('служители', 'повече'), 1348),
 (('софия', 'постоянна'), 1318),
 (('business', 'intelligence'), 1256),
 (('data', 'analyst'), 1134),
 (('пълно', 'известие'), 989),
 (('personal', 'data'), 887),
 (('data', 'warehouse'), 664),
 (('candidates', 'contacted'), 657),
 (('\uf451вижте', 'компанията'), 657),
 (('компанията', 'месторабота'), 653),
 (('българия', 'еоод'), 641),
 (('b', 'eye'), 636),
 (('communication', 'skills'), 605),
 (('short-listed', 'candidates'), 593),
 (('bulgaria', 'eood'), 489),
 (("'input", 'name='), 453),
 (('name=', "''"), 453),
 (('strict', 'confidentiality'), 447),
 (('cv', 'english'), 436),
 (('computer', 'science'), 427),
 (('treated', 'strict'), 403),
 (('big', 'data'), 401),
 (('da

In [72]:
result_trigrams.most_common(30)

[(('постоянна', 'работа', 'пълно'), 1546),
 (('известие', 'нови', 'фирма/организация'), 1348),
 (('нови', 'фирма/организация', 'директно'), 1348),
 (('фирма/организация', 'директно', 'търсеща'), 1348),
 (('директно', 'търсеща', 'служители'), 1348),
 (('търсеща', 'служители', 'повече'), 1348),
 (('софия', 'постоянна', 'работа'), 1318),
 (('месторабота', 'софия', 'постоянна'), 1316),
 (('работа', 'пълно', 'известие'), 989),
 (('пълно', 'известие', 'нови'), 989),
 (('\uf451вижте', 'компанията', 'месторабота'), 653),
 (('компанията', 'месторабота', 'софия'), 591),
 (('short-listed', 'candidates', 'contacted'), 504),
 (("'input", 'name=', "''"), 453),
 (('treated', 'strict', 'confidentiality'), 389),
 (('работа', 'пълно', 'подходяща'), 376),
 (('еоод', 'известие', 'нови'), 370),
 (('подходяща', 'кандидати', 'малък'), 349),
 (('кандидати', 'малък', 'или'), 349),
 (('малък', 'или', 'опит'), 349),
 (('applications', 'treated', 'strict'), 329),
 (('b', 'eye', 'ltd.'), 318),
 (("''", '.is', 'che

In [69]:
# Uncomment the line below to export an HTML version of the chart.
# plotly.offline.plot(fig, filename = 'data_offers_requirements.html')

In [70]:
from IPython.core.display import HTML
with open('../resources/styles/datum.css', 'r') as f:
    style = f.read()
HTML(style)

https://nlp.stanford.edu/fsnlp/promo/colloc.pdf
