## Data Offers Salary Scatter Plot


In [14]:
import itertools
import psycopg2
import pandas as pd
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

from nbstyler import DATA_STYLE as DS

# Prepare the notebook for offline plotly rendering (needed before any iplot call).
plotly.offline.init_notebook_mode(connected=True)

# NOTE(review): both magics select a matplotlib backend, so the second one is
# expected to be the one that takes effect - confirm whether
# `%matplotlib notebook` is still needed.
%matplotlib notebook
%matplotlib inline

### Objectives

Provide a customized scatter plot showing how the data job offers that include salary information compare with the monthly salary statistics (median and quartiles) of the overall job market.

### Data preparation

In [15]:
# Query for the full offer history; there is no WHERE clause, so every row of
# the view/table is fetched.
data_querystr = (
    "SELECT\n"
    "    subm_date, job_title, company_name, company_id, job_id, norm_salary, text_salary, job_contents\n"
    "FROM data_offers.do_full_offer_history"
)

In [16]:
# Run both queries over a single connection and always release it, even when
# one of the queries raises. try/finally is used on purpose: psycopg2's
# `with conn:` only ends the transaction, it does not close the connection.
conn = psycopg2.connect('dbname=jobsbg')
try:
    # Offers, indexed by their submission date.
    data_df = pd.read_sql_query(data_querystr, conn, index_col='subm_date')
    # Salary statistics, indexed by the `month_ts` column.
    stats_df = pd.read_sql_query('SELECT * FROM v_salary_stats_monthly', conn, index_col='month_ts')
finally:
    conn.close()

`data_df` contains all data jobs, but this plot only uses the ones that have salary information. The `notna()` method keeps only the rows of `data_df` where `norm_salary` is available.

In [17]:
# Keep only the offers whose normalized salary is present.
data_df = data_df.loc[data_df['norm_salary'].notna()]

### Preparing the background stats line

In [18]:
# Median: the only visible line of the background statistics.
trace_med = go.Scatter(
    x=list(stats_df.index),
    y=list(stats_df['median']),
    mode='lines',
    showlegend=False,
    fill=None,
    line={'width': 1.5, 'color': 'rgba(213, 145, 113, 0.6)'},
    hoverinfo='none',
)

# Third quartile: zero-width line that only serves as the upper edge of the band.
trace_q3 = go.Scatter(
    x=list(stats_df.index),
    y=list(stats_df['third_quartile']),
    mode='lines',
    showlegend=False,
    fill=None,
    line={'width': 0},
    hoverinfo='none',
)

# First quartile: zero-width line whose 'tonexty' fill paints the band.
trace_q1 = go.Scatter(
    x=list(stats_df.index),
    y=list(stats_df['first_quartile']),
    mode='lines',
    showlegend=False,
    fill='tonexty',
    fillcolor='rgba(236, 213, 187, 0.8)',
    line={'width': 0},
    hoverinfo='none',
)

#### Build a custom hoverbox label

The `Scatter.hovertemplate` property described in the documentation is not available yet, so in order to create a custom hover box we can define our own template and wrap it in a function.

In [19]:
def hoverbox_builder(job):
    """Render the HTML hover label for a single job offer.

    `job` is expected to look like a row from `DataFrame.itertuples()`: it must
    expose `Index` (an object with `strftime`), `job_title`, `company_name`,
    `text_salary` and a numeric `norm_salary`.

    Returns one string made of four `<br>`-separated lines: bold title,
    date, company name, and the salary text followed by the normalized value.
    """
    title_line = f"<b>{job.job_title}</b>"
    date_line = f"{job.Index.strftime('%d %b %Y')}"
    company_line = f"{job.company_name}"
    salary_line = f"<b>{job.text_salary}</b> ➡ <i>(norm. {job.norm_salary:.0f} BGN)</i>"
    return "<br>".join([title_line, date_line, company_line, salary_line])

# One hover label per offer, in the row order of data_df.
hoverboxes = list(map(hoverbox_builder, data_df.itertuples()))

#### Build a marker generator


In [20]:
class MarkerBuilder:
    """Hands out a stable (symbol, color) marker pair per company name.

    The first time a company name is seen it is assigned the next unused
    symbol/color combination; later calls with the same name return the same
    pair. Once every combination has been handed out, names that were never
    tracked all share the fallback marker.
    """
    _allowed_colors = ['#C75129', '#BE8E1E', '#0B7761', '#655B4F', '#265BCA', '#9746a2']
    _allowed_symbols = ['star', 'pentagon', 'diamond', 'cross', 'triangle-up', 'hexagon', 'star', 'x']
    # Shared by every company that arrives after the palette is exhausted.
    _fallback_marker = ('circle-open', '#80BCA3')

    def __init__(self):
        # 'star' appears twice in _allowed_symbols, so the raw product contains
        # repeated pairs. dict.fromkeys drops them while keeping first-seen
        # order, so two tracked companies can never share a marker.
        self._active_markers = list(dict.fromkeys(
            itertools.product(self._allowed_symbols, self._allowed_colors)
        ))
        self._max_symbol_index = len(self._active_markers) - 1
        # Position in this list == position of the company's marker in _active_markers.
        self.tracked_company_names = []

    def get_marker(self, company_name):
        """Return the (symbol, color) tuple for `company_name`.

        Already-tracked names always get their original marker, even after the
        palette is exhausted. Untracked names get the next free marker, or the
        fallback marker when none is left.
        """
        tracked = self.tracked_company_names
        if company_name not in tracked:
            # Only new names are subject to the capacity check; comparing with
            # the full length lets the last combination be used as well.
            if len(tracked) >= len(self._active_markers):
                return self._fallback_marker
            tracked.append(company_name)

        return self._active_markers[tracked.index(company_name)]


# A single builder instance is used for all rows so that its list of tracked
# company names carries over from one offer to the next.
mb = MarkerBuilder()
markers = list(map(mb.get_marker, data_df['company_name']))

In [21]:
# Foreground layer: one point per offer, styled per company and carrying the
# pre-rendered hover label.
trace_datajobs = go.Scatter(
    x=list(data_df.index),
    y=list(data_df['norm_salary']),
    mode='markers',
    showlegend=False,
    marker={
        # Each entry of `markers` is a (symbol, color) pair.
        'color': [color for _symbol, color in markers],
        'symbol': [symbol for symbol, _color in markers],
        'opacity': 1,
        'size': 6,
    },
    hoverinfo='text',
    hoveron='points',
    hovertext=hoverboxes,
    hoverlabel={
        'bgcolor': DS['colors']['bg2'],
        'bordercolor': DS['colors']['fg2'],
        'font': DS['chart_fonts']['text'],
    },
)

In [22]:
# Order matters: trace_q1 uses fill='tonexty', which fills towards the trace
# listed immediately before it (trace_q3), producing the quartile band.
data = [trace_med, trace_q3, trace_q1, trace_datajobs]

In [23]:
layout = go.Layout(
    # Canvas, title and fonts come from the shared notebook style.
    paper_bgcolor=DS['colors']['bg1'],
    plot_bgcolor=DS['colors']['bg1'],
    title='Data Offers Salary Scatter Plot',
    titlefont=DS['chart_fonts']['title'],
    font=DS['chart_fonts']['text'],
    autosize=True,
    hidesources=True,
    # Time axis: spans the period covered by the statistics, with at most
    # one tick per statistics row.
    xaxis={
        'type': 'date',
        'range': [min(stats_df.index), max(stats_df.index)],
        'fixedrange': False,
        'zerolinecolor': DS['colors']['fg2'],
        'ticks': 'outside',
        'tickmode': 'auto',
        'nticks': len(stats_df.index),
        'tickformat': '%b,\n%Y',
    },
    # Salary axis.
    yaxis={
        'title': 'Normalized monthly salary',
        'titlefont': DS['chart_fonts']['text'],
        'type': 'linear',
        'fixedrange': False,
        'zerolinecolor': DS['colors']['fg2'],
        'ticks': 'outside',
        'tickwidth': 1,
    },
    # Hover picks the single closest point within 10 px.
    hovermode='closest',
    hoverdistance=10,
)


In [24]:
# Combine the traces and the layout into a single figure.
fig = go.Figure(data=data, layout=layout)

# Render the figure inline in the notebook output.
plotly.offline.iplot(fig, filename='data_offers_scatter.html')

In [25]:
# Export an HTML version of the chart (comment the line below out to skip the export).
plotly.offline.plot(fig, filename='data_offers_scatter.html', show_link=False)

'file:///games/WORKSPACE/jpynb_Job_Market_Trends_Bulgaria/workbooks/data_offers_scatter.html'

In [26]:
from IPython.display import HTML

# Read the shared stylesheet and emit it as the cell's HTML output.
# NOTE(review): presumably datum.css already wraps its rules in a <style> tag - confirm.
with open('../resources/styles/datum.css') as f:
    style = f.read()
HTML(style)