## Requirements Deep Dive


In [1]:
import re
import psycopg2
import pandas as pd
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from IPython import display
from bs4 import BeautifulSoup as bs

from nbstyler import DATA_STYLE as s

plotly.offline.init_notebook_mode(connected=True) # run at the start of every ipython notebook to use plotly.offline

%matplotlib notebook
%matplotlib inline

### Data preparation

In [2]:
# Load every row of the v_full_data_offers_history view into a DataFrame
# indexed by the subm_date column.
data_querystr = """SELECT * FROM v_full_data_offers_history"""
conn = psycopg2.connect('dbname=jobsbg')
try:
    data_df = pd.read_sql_query(data_querystr, conn, index_col='subm_date')
finally:
    # Release the connection even when the query fails.
    conn.close()

In [3]:
# Preview the first five rows (head() defaults to n=5).
data_df.head()

Unnamed: 0_level_0,subm_type,job_id,company_id,norm_salary,job_title,company_name,text_salary,job_contents
subm_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-09-27,submission,3994437,124912,,Data Analyst,ПрайсуотърхаусКупърс Одит ООД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-27,submission,3994555,67058,,ETL Developer,Adastra Bulgaria Ltd.,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-27,submission,3994824,10839,,Senior and Junior Business Intelligence Analys...,Кодикс България ЕАД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-27,submission,3995044,144752,,BI Консултант,БИЗЛИНК ООД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."
2017-09-28,submission,3996312,204212,,Business Intelligence Analyst,ДОПАМИН ЕООД,,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."


In [4]:
def check_archive_content(job_id):
    """Fetch the stored contents of one job offer from the ``jobsbg`` database.

    Opens a fresh connection, calls the database function
    ``f_get_offer_contents_by_job_id`` with ``job_id`` and returns the first
    column of the first row it yields.

    Parameters
    ----------
    job_id
        Identifier of the offer (the notebook calls this with an integer).
        It is sent to the database as a bound query parameter.

    Returns
    -------
    The first column of the first returned row, or ``None`` when the query
    yields no row.
    """
    conn = psycopg2.connect('dbname=jobsbg')
    try:
        cur = conn.cursor()
        try:
            # Bind job_id as a parameter instead of formatting it into the SQL text.
            cur.execute(
                'SELECT * FROM f_get_offer_contents_by_job_id(%s)',
                (job_id,),
            )
            result = cur.fetchone()
        finally:
            cur.close()
    finally:
        # Always release the connection, even when the query raises.
        conn.close()
    # fetchone() gives None when there is no row to fetch.
    if result is None:
        return None
    return result[0]

In [7]:
# Fetch the stored contents of one sample offer (job_id 3994555) from the database.
offer = check_archive_content(3994555)
# Uncomment to render the fetched contents as HTML in the cell output
# (presumably the offer page markup; confirm against the database function).
# display.HTML(offer)

In [8]:
# Take the job_contents value of the first row and parse it with BeautifulSoup.
instance1 = data_df.iloc[0:1]['job_contents'].values
instance1_str = instance1[0]
instance1_soup = bs(instance1_str, features='html.parser')

In [None]:
# instance1_soup


In [31]:
# First text node matching "requirements" (case-insensitive); show the
# children of its grandparent element.
(
    instance1_soup
    .find_all(string=re.compile('requirements', flags=re.IGNORECASE))[0]
    .parent
    .parent
    .contents
)

['\n',
 <strong>PwC</strong>,
 ' is\xa0a network of firms in ',
 <strong>157 countries</strong>,
 ' with more than ',
 <strong>223,000 people</strong>,
 ' around the globe who are committed to delivering the highest quality solutions in assurance, tax and advisory services.',
 <br/>,
 'Amongst our clients there are',
 <strong> 422 companies from Fortune 500 list.</strong>,
 <br/>,
 <br/>,
 'We’ve been in Central and Eastern Europe for the past 25 years, employing around 10,000 professionals across the CEE region. Thanks to our many',
 <strong> talented people</strong>,
 ' we are recognised as a top provider of professional services. We help to ',
 <strong>build trust and solve important problems</strong>,
 ' across 29 countries of CEE. We value ',
 <strong>innovation, teamwork, integrity,</strong>,
 ' professionalism and we care about our people.',
 <br/>,
 <br/>,
 'Do you want to have impact on the reality that surrounds us?',
 <br/>,
 <br/>,
 'Choose future with PwC. We combine new '

In [17]:
# Take the job_contents value of the second row, parse it with BeautifulSoup
# and display the parsed document.
instance2 = data_df.iloc[1:2]['job_contents'].values
instance2_str = instance2[0]
instance2_soup = bs(instance2_str, features='html.parser')
instance2_soup

 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Jobs.bg - ETL Developer, София, обява за работа от Adastra Bulgaria Ltd.</title>
<base href="https://www.jobs.bg/"/>
<meta content="default-src 'self' 'unsafe-inline' 'unsafe-eval' http://*.jobs.bg https://*.jobs.bg wss://im.jobs.bg http://www.box.bg https://www.box.bg http://www.google-analytics.com https://www.google-analytics.com http://i.newsroom.bg https://i.newsroom.bg https://*.youtube.com http://*.youtube.com;" http-equiv="Content-Security-Policy"/>
<meta content="https://www.jobs.bg/assets/logo/2012-02-19/b_e2f9e33dcaf31127e3a08f2b9835445c.jpg" property="og:image"/>
<meta content="265" property="og:image:width"/>
<meta content="31" property="og:image:height"/>
<meta content="Adastra Group is a leading provider of Information Management solutions to global Fortune 1000 companies. One of Canada`s 50 

In [30]:
# Second text node matching "required" (case-insensitive); show the
# children of its grandparent element.
(
    instance2_soup
    .find_all(string=re.compile('required', flags=re.IGNORECASE))[1]
    .parent
    .parent
    .contents
)

['\n',
 <p><strong>Adastra Group</strong> is a leading provider of Information Management solutions to global Fortune 1000 companies. One of Canada`s 50 Best Managed companies, Adastra has over 1200 employees worldwide with headquarters in Canada and the Czech Republic and offices in Russia, Germany, Slovakia and Bulgaria.<br/><br/><strong>Adastra Bulgaria</strong> was founded in the year 2007 and currently has two offices in the cities of Sofia and Varna. Our portfolio includes various projects in the areas of Data Warehousing, Business Intelligence, Data Integration, Master Data Management and Big Data for large clients, such as Bank of Montreal, Volkswagen and Vivacom.</p>,
 '\n',
 <p>We are currently looking for a talented and experienced software specialist and consultant for Data Warehouse projects to fill the open position in Adastra Bulgaria professional teams. </p>,
 '\n',
 <p>The successful candidate should be skilled in Data Warehousing, relational structures, dimensional da

In [32]:
# Take the job_contents value of the third row, parse it with BeautifulSoup
# and display the parsed document.
instance3 = data_df.iloc[2:3]['job_contents'].values
instance3_str = instance3[0]
instance3_soup = bs(instance3_str, features='html.parser')
instance3_soup

 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Jobs.bg - Senior and Junior Business Intelligence Analyst (French Speaker), София, обява за работа от Кодикс България ЕАД</title>
<base href="https://www.jobs.bg/"/>
<meta content="default-src 'self' 'unsafe-inline' 'unsafe-eval' http://*.jobs.bg https://*.jobs.bg wss://im.jobs.bg http://www.box.bg https://www.box.bg http://www.google-analytics.com https://www.google-analytics.com http://i.newsroom.bg https://i.newsroom.bg https://*.youtube.com http://*.youtube.com;" http-equiv="Content-Security-Policy"/>
<meta content="https://www.jobs.bg/assets/logo/2013-12-09/b_95e929f2bd1e3270720adc0f65d0816e.gif" property="og:image"/>
<meta content="305" property="og:image:width"/>
<meta content="128" property="og:image:height"/>
<meta content="CODIX is an international software company, with its head-quarters located 

In [38]:
# First text node matching "you have" (case-insensitive); show the
# children of its parent element.
(
    instance3_soup
    .find_all(string=re.compile('you have', flags=re.IGNORECASE))[0]
    .parent
    .contents
)

['\n\t\t\t\t\t\t \t\t\t\t\t\t \tCODIX is an international software company, with its head-quarters located in France, and provides a unique all-in-one innovative solution “iMX” catering to the needs of banks, telecoms, utilities and insurance companies to name a few.',
 <br/>,
 '\n',
 <br/>,
 '\r\nCODIX is a dynamic company with a rapidly expanding client portfolio (currently working with leaders in the market including BNP PARIBAS, SOCIETE GENERALE, GE Capital, KBC, BARCLAYS, BBVA, Orange, SANTANDER, Raiffeisen Bank & more) and a worldwide presence through its subsidiaries on 4 continents.',
 <br/>,
 '\r\nCODIX customers are all key players in their respective markets and they chose iMX for one or several of the following activities: account receivables management, commercial finance (including leasing), debt collection (B2B, B2C) and credit insurance management',
 <br/>,
 '\r\nThe fully integrated iMX solution offers end-to-end support for all front office and back office operations,

In [39]:
# Take the job_contents value of the fourth row, parse it with BeautifulSoup
# and display the parsed document.
instance4 = data_df.iloc[3:4]['job_contents'].values
instance4_str = instance4[0]
instance4_soup = bs(instance4_str, features='html.parser')
instance4_soup

 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Jobs.bg - BI Консултант, София, обява за работа от БИЗЛИНК ООД</title>
<base href="https://www.jobs.bg/"/>
<meta content="default-src 'self' 'unsafe-inline' 'unsafe-eval' http://*.jobs.bg https://*.jobs.bg wss://im.jobs.bg http://www.box.bg https://www.box.bg http://www.google-analytics.com https://www.google-analytics.com http://i.newsroom.bg https://i.newsroom.bg https://*.youtube.com http://*.youtube.com;" http-equiv="Content-Security-Policy"/>
<meta content="https://www.jobs.bg/assets/logo/2012-08-29/b_4b460de5781167b9e126ad261f62020b.GIF" property="og:image"/>
<meta content="116" property="og:image:width"/>
<meta content="51" property="og:image:height"/>
<meta content="Бизлинк предоставя професионални решения, подпомагащи управлението на бизнеса, свързани с цялостното изграждане и внедряване на системи

In [40]:
# First text node matching the Bulgarian heading below (case-insensitive);
# show the children of its parent element.
(
    instance4_soup
    .find_all(string=re.compile('Очакванията ни към Вас', flags=re.IGNORECASE))[0]
    .parent
    .contents
)

['\n\t\t\t\t\t\t \t\t\t\t\t\t \tБизлинк предоставя професионални решения, подпомагащи управлението на бизнеса, свързани с цялостното изграждане и внедряване на системи за управление на взаимоотношенията с клиенти. Предлагаме цялостни CRM решения и сме парньори на Qlik, SugarCRM и Act-On.',
 <br/>,
 '\n',
 <br/>,
 '\r\nТърсим професионалист, който да заеме позицията „BI Консултант”.',
 <br/>,
 '\n',
 <br/>,
 '\r\nОчакванията ни към Вас:',
 <br/>,
 '\n',
 <br/>,
 '\r\n- Познаване на принципите на релационните бази данни и опит при изготвяне на SQL заявки',
 <br/>,
 '\r\n- Умения за работа с бизнес клиенти и добри организационни и комуникативни качества',
 <br/>,
 '\r\n- Опит с продуктите Qlik Sense / Qlik View  или алтернативни платформи - предимство',
 <br/>,
 '\r\n- Добри комуникационни умения и работа в екип',
 <br/>,
 '\n',
 <br/>,
 '\n',
 <br/>,
 '\r\nРоля:',
 <br/>,
 '\n',
 <br/>,
 '\r\n- Участие в проекти по разработка на BI системи, базирани на продуктите на Qlik',
 <br/>,
 '\r\n

In [42]:
# Take the job_contents value of the fifth row, parse it with BeautifulSoup
# and display the parsed document.
instance5 = data_df.iloc[4:5]['job_contents'].values
instance5_str = instance5[0]
instance5_soup = bs(instance5_str, features='html.parser')
instance5_soup

 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Jobs.bg - Business Intelligence Analyst, София, обява за работа от ДОПАМИН ЕООД</title>
<base href="https://www.jobs.bg/"/>
<meta content="default-src 'self' 'unsafe-inline' 'unsafe-eval' http://*.jobs.bg https://*.jobs.bg wss://im.jobs.bg http://www.box.bg https://www.box.bg http://www.google-analytics.com https://www.google-analytics.com http://i.newsroom.bg https://i.newsroom.bg https://*.youtube.com http://*.youtube.com;" http-equiv="Content-Security-Policy"/>
<meta content="https://www.jobs.bg/assets/logo/2014-11-10/b_219f8f2813c6ed701373ae7038e4020f.png" property="og:image"/>
<meta content="1015" property="og:image:width"/>
<meta content="413" property="og:image:height"/>
<meta content="We invite you at Dopamine, a software company making immersive online casino games &amp; gaming platforms.     Bring

In [44]:
# First text node matching "requirements" (case-insensitive); show the
# children of its grandparent element.
(
    instance5_soup
    .find_all(string=re.compile('requirements', flags=re.IGNORECASE))[0]
    .parent
    .parent
    .contents
)

['\n',
 <p class="dopamin_p_title">Player needed: </p>,
 '\n',
 <h1> BUSINESS INTELLIGENCE ANALYST </h1>,
 '\n',
 <h2>MISSIONS: </h2>,
 '\n',
 <ul>
 <li>Аccess and data mine constantly growing multidimensional games and financial data</li>
 <li>Seek patterns and recommend solutions to improve the success of our products</li>
 <li>Build prototype reports to get users engaged</li>
 <li>Define KPIs and metrics to communicate trends</li>
 <li>Monitor and validate our financial data streams</li>
 <li>Present data in influential and comprehensive way</li>
 </ul>,
 '\n',
 <h2> REQUIREMENTS: </h2>,
 '\n',
 <ul>
 <li>Experience in seeking patterns and identifying trends</li>
 <li>Problem solving personality</li>
 <li>Advanced knowledge of SQL/MySQL</li>
 <li>Advanced knowledge of PowerBI/Pentaho/Tableau/SPSS/SAS/R or similar</li>
 </ul>,
 '\n',
 <h2> ADVANTAGES: </h2>,
 '\n',
 <ul>
 <li>Knowledge of NoSQL DB</li>
 <li>Passion for data &amp; statistics</li>
 <li>Excellent analytical skills </li>

### Preparing the background stats line

In [None]:
# Uncomment the line below to export an HTML version of the chart.
# NOTE: no cell visible in this notebook defines `fig`; build the figure first.
# plotly.offline.plot(fig, filename = 'data_offers_requirements.html')

In [41]:
# Read the datum.css stylesheet and hand its text to IPython for rendering as HTML.
# Uses the `display` module already imported from IPython in the first cell
# instead of importing HTML again mid-notebook.
with open('../resources/styles/datum.css', 'r') as f:
    style = f.read()
display.HTML(style)