## Data Offers Pie Chart and Table

In [1]:
import itertools
import datetime as dt
import psycopg2
import pandas as pd
import plotly
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go

from nbstyler import DATA_STYLE as DS

plotly.offline.init_notebook_mode(connected=True) # run at the start of every ipython notebook to use plotly.offline

%matplotlib notebook
%matplotlib inline

### Objectives

Provide a simple visualization illustrating the size of our target subset of job offers and which data professions are most popular (as matched against the job offer titles in the target subset).  

### Data Preparation

Get the data and prepare the `pandas.DataFrame` object.

In [4]:
# Connect to the `jobsbg` database. The try/finally blocks guarantee that the
# cursor and the connection are released even when one of the queries fails.
conn = psycopg2.connect("dbname=jobsbg")
try:
    cur = conn.cursor()
    try:
        # Total number of rows in the full offers history view.
        cur.execute('SELECT count(*) FROM v_full_offers_history')
        all_jobs_count = cur.fetchone()[0]
    finally:
        cur.close()

    # Load every row of the data-offers view, indexed by submission date.
    datajobs_df = pd.read_sql_query(
        'SELECT * FROM v_full_data_offers_history',
        conn,
        index_col='subm_date',
    )
finally:
    conn.close()

In [5]:
# Number of rows in the data-offers frame, displayed next to the overall count.
datajobs_count = datajobs_df.shape[0]
(datajobs_count, all_jobs_count)

(2339, 823196)

### Pie Chart Preparation

In [8]:
# The slice labelled 'All Other' must exclude the data jobs themselves,
# otherwise they are counted in both slices and their share is understated.
# NOTE(review): assumes the rows of v_full_data_offers_history are a subset of
# v_full_offers_history, as the Objectives section describes - confirm.
other_jobs_count = all_jobs_count - datajobs_count

# Donut chart comparing the data-job offers with the rest of the market.
pie_trace = go.Pie(
    values=[other_jobs_count, datajobs_count],
    labels=['All Other', 'Data Jobs'],
    marker=dict(
        colors=[
            DS['colorramp']['acc1'][3],
            DS['colors']['acc1'],
        ],
        line=dict(
            color=DS['colors']['acc1'],
            width=2
        )
    ),
    showlegend=False,
    # Keep the pie in the left part of the figure; the bar chart uses the right.
    domain={'x': [0, 0.39], 'y': [0, 0.89]},
    name='Data Jobs Share of the Job Market',
    hoverinfo='label+percent',
    textinfo='value',
    textfont=DS['chart_fonts']['anno'],
    insidetextfont=DS['chart_fonts']['anno_in'],
    hole=0.45,
    rotation=0,
    pull=0.05,
)

### Bar Chart Preparation

In [9]:
# Pattern matching any of the data-profession keywords in a lower-cased title.
# Written as raw strings so `\W` is not treated as an (invalid) string escape,
# and with non-capturing groups so `str.contains` does not warn about match
# groups. The alternatives are the same as before, minus a duplicated
# `data (engineer|warehouse)` branch.
ptrn = (
    r'data analy(?:st|tics|sis)'
    r'|анализ.*данни'
    r'|data (?:engineer|warehouse|scientist)'
    r'|(?:data|business) intelligence'
    r'|(?:\W|^)bi(?:\W|$)'
    r'|reporting (?:analyst|specialist)'
    r'|etl(?: |$)'
)

# looking at unmatched offers for count stats:
# (`na=False` treats a missing title as unmatched instead of breaking `~`)
datajobs_df[~datajobs_df['job_title'].str.lower().str.contains(ptrn, na=False)].head(1)


This pattern has match groups. To actually get the groups, use str.extract.



Unnamed: 0_level_0,subm_type,job_id,company_id,norm_salary,job_title,company_name,text_salary,job_location,job_contents
subm_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-10-05,submission,4006785,232178,3450.0,"DB Developer (Database, ETL, Teradata)",Sciant AD,от 2800 до 4100 BGN (Нето),София,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 ..."


In [10]:
# Lower-case the titles once and reuse the result for every profession.
titles_lower = datajobs_df.job_title.str.lower()

# `str.count` returns the number of pattern occurrences per title, so a title
# that matches a pattern twice contributes 2 to that profession's total.
# The patterns are raw strings so that `\W` is not an invalid string escape.
bi_count = int(titles_lower.str.count(r'(data|business) intelligence|(\W|^)bi(\W|$)').sum())
da_count = int(titles_lower.str.count(r'(data analy(st|tics|sis))|(анализ.*данни)|(reporting (analyst|specialist))').sum())
de_count = int(titles_lower.str.count(r'((\W|^)(etl|dwh)(\W|$))|data (engineer|warehouse)').sum())
ds_count = int(titles_lower.str.count(r'data scien.+').sum())

bi_count, da_count, de_count, ds_count

(807, 1011, 473, 152)

In [11]:
# (label, count) pairs in the order the bars are drawn.
bar_series = [
    ('Data<br>Analysis', da_count),
    ('Business<br>Intelligence', bi_count),
    ('Data<br>Engineering', de_count),
    ('Data<br>Science', ds_count),
]

# Fill and outline styling applied to every bar.
bar_outline = dict(width=1, color=DS['colorramp']['acc1'][-1])
bar_marker = dict(line=bar_outline, color=DS['colors']['acc1'], opacity=0.8)

# Vertical bar chart of offer counts per profession, drawn against `x2`.
bars_trace = go.Bar(
    x=[label for label, count in bar_series],
    y=[count for label, count in bar_series],
    orientation='v',
    marker=bar_marker,
    xaxis='x2',
    showlegend=False,
)

In [12]:
# Traces shown in the figure: the pie first, then the bars.
data = [pie_trace, bars_trace]

# Paper and plot area share the same background colour.
background_color = DS['colors']['bg1']

layout_options = {
    'paper_bgcolor': background_color,
    'plot_bgcolor': background_color,
    'title': "Data Jobs Market Share",
    'titlefont': DS['chart_fonts']['title'],
    'font': DS['chart_fonts']['text'],
    'autosize': True,
    'showlegend': False,
    'hidesources': True,
    'clickmode': 'none',
    'xaxis': {},
    # The second x axis takes the right half of the figure (used by the bars).
    'xaxis2': {'domain': [0.50, 1], 'tickangle': 0},
}
layout = go.Layout(**layout_options)

In [13]:
# Assemble the figure from the traces and layout, then render it inline.
fig = go.Figure(layout=layout, data=data)
plotly.offline.iplot(
    fig,
    filename='data_offers_pie_and_bar.html',
)

In [14]:
# Optional export: uncomment the line below to write an HTML version of the
# chart to the file named in `filename` (the figure `fig` must already exist).
# plotly.offline.plot(fig, filename='data_offers_pie_and_bar.html', show_link=False)

'file:///data/WORKSPACE/jpynb_Employment_Trends_Bulgaria/workbooks/data_offers_pie_and_bar.html'

In [15]:
# Load the project stylesheet and hand its contents to IPython's HTML display
# object so the notebook renders it.
# `HTML` is imported from the public `IPython.display` module; importing it
# from `IPython.core.display` is deprecated.
from IPython.display import HTML

# NOTE(review): the stylesheet is assumed to be UTF-8 encoded - confirm.
with open('../resources/styles/datum.css', 'r', encoding='utf-8') as css_file:
    style = css_file.read()
HTML(style)