In [1]:
# --- Imports (third-party) ---------------------------------------------
import numpy as np
import pandas as pd

import plotly
from plotly.graph_objs import Bar, Box, Figure, Layout, Scatter, Scattergl

# Initialise plotly's offline notebook mode so figures render inline.
plotly.offline.init_notebook_mode(connected=True)

# Raw job postings; the path is relative to the notebook's working directory.
JOBS_CSV = 'jobs_2019-10-26.csv'
jobs = pd.read_csv(JOBS_CSV)

In [2]:
# Cleansing + salary derivation.
#
# Steps:
#   1. remove duplicated rows,
#   2. take the midpoint of the advertised salary range,
#   3. convert monthly figures to a yearly figure (x 12),
#   4. keep only postings quoted per year or per month.
#
# NOTE(review): keep=False drops *every* copy of a duplicated row, not just
# the extras -- confirm that discarding such postings entirely is intended.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column; if it is a
# unique row id, full-row duplicates can never match -- confirm.
unique_jobs = jobs.drop_duplicates(keep=False)

is_monthly = unique_jobs['salary_unit'] == 'month'
is_yearly = unique_jobs['salary_unit'] == 'year'

range_midpoint = (unique_jobs['salary_from'] + unique_jobs['salary_to']) / 2
yearly_salary = range_midpoint.mask(is_monthly, range_midpoint * 12)

jobs = unique_jobs.assign(salary=yearly_salary)[is_yearly | is_monthly]

jobs

Unnamed: 0,company,level,location,rating,salary_from,salary_to,salary_unit,skill,state,title,salary
0,READY ARTWORK,entry,"Monrovia, CA 91016",3.0,40000.0,60000.0,year,java,California,Quality Assurance Engineer,50000.0
1,VNTANA,entry,"Los Angeles, CA",,110000.0,130000.0,year,python,California,Lead Full Stack Developer,120000.0
5,SDH Systems,entry,"Dublin, OH 43017",,84219.0,84219.0,year,python,Ohio,Cloud Engineer,84219.0
6,"Kent State University, College of Aeronautics ...",entry,"Kent, OH 44240",,20000.0,24000.0,year,python,Ohio,"One fully funded PHD position, and two masters...",22000.0
7,Cerkl,entry,"Blue Ash, OH 45242",,65000.0,95000.0,year,python,Ohio,Software Engineer (Full Stack),80000.0
8,Ohio State University Medical Center,entry,"Columbus, OH 43210",4.1,47500.0,47500.0,year,python,Ohio,Post Doctoral Researcher,47500.0
10,vMOX,entry,"Lancaster, PA",,50000.0,125000.0,year,python,Pennsylvania,Full Stack Ruby on Rails Developer,87500.0
11,Delaware Valley Regional Planning Commission,entry,"Philadelphia, PA 19106",3.7,50000.0,50000.0,year,python,Pennsylvania,Back End Developer,50000.0
12,Oncora Medical,entry,"Philadelphia, PA",,80000.0,110000.0,year,python,Pennsylvania,Software Engineer (Imaging),95000.0
14,Authentise Inc.,entry,"Philadelphia, PA",,40000.0,100000.0,year,python,Pennsylvania,DevOps Engineer,70000.0


In [3]:
# EDA: bar chart of the number of postings per state, largest first.
# 'count' on the title column counts the non-null titles in each state.
state_count = (
    jobs.groupby(['state'])
        .agg({'title': 'count'})
        .rename(columns={'title': 'job_count'})
        .sort_values('job_count', ascending=False)
)

data = [
    Bar(
        x=state_count.index,
        y=state_count['job_count'],
        text=state_count.index,  # hover text repeats the state name
        marker={'color': 'orange'},
    )
]
layout = Layout(
    title="Job Count of State Related to Computer Skills",
    width=900,
    height=500,
)
fig = Figure(data=data, layout=layout)

plotly.offline.iplot(fig, show_link=False)

In [4]:
# EDA: bar chart of the mean salary per state, highest first.
#
# The aggregate is stored under its own name (`state_salary`). Previously
# this cell re-bound `state_count`, silently replacing the per-state job
# count table with a mean-salary table under a misleading name.
state_salary = (
    jobs.groupby(['state'])
        .agg({'salary': 'mean'})
        .sort_values('salary', ascending=False)
        .rename(columns={'salary': 'salary_mean'})
)

data = [
    Bar(
        x=state_salary.index,
        y=state_salary['salary_mean'],
        text=state_salary.index,  # hover text repeats the state name
        marker={'color': 'orangered'},
    )
]
layout = Layout(title="Salary of State Related to Computer Skills", width=900, height=500)

fig = Figure(data=data, layout=layout)

plotly.offline.iplot(fig, show_link=False)

In [5]:
# EDA: line chart of the mean salary for each experience level.
# groupby sorts the level labels, so the x axis follows that sorted order.
salary_level_mean = (
    jobs.groupby(['level'])[['salary']]
        .mean()
        .rename(columns={'salary': 'salary_mean'})
)

data = [
    Scatter(
        x=salary_level_mean.index,
        y=salary_level_mean['salary_mean'],
        mode='lines',
    )
]
layout = Layout(
    title="Salary for Experience Level of Computer Skills",
    width=900,
    height=500,
)
fig = Figure(data=data, layout=layout)

plotly.offline.iplot(fig, show_link=False)

In [6]:
# Horizontal bar chart of the mean salary per skill.
# Rows are ordered by ascending mean salary; `salary_mean` is reused by
# later cells (box plot ordering and the skill summary merge).
salary_mean = (
    jobs.groupby(['skill'])
        .agg({'salary': 'mean'})
        .rename(columns={'salary': 'salary_mean'})
        .sort_values('salary_mean', ascending=True)
)

data = [
    Bar(
        x=salary_mean['salary_mean'],
        y=salary_mean.index,
        text=salary_mean.index,  # hover text repeats the skill name
        orientation='h',
        marker={'color': 'darkblue'},
    )
]
layout = Layout(title="Salary for Computer Skills", width=900, height=800)
fig = Figure(data=data, layout=layout)

plotly.offline.iplot(fig, show_link=False)

In [7]:
# Box plot of the salary distribution for each skill.
# One Box trace is added per skill, in the row order of `salary_mean`
# (built in the preceding bar-chart cell).
layout = Layout(
    title="Salary for Computer Skills",
    width=1000,
    height=800,
    showlegend=False,
)
fig = Figure(layout=layout)

for skill in salary_mean.index:
    skill_salaries = jobs.loc[jobs['skill'] == skill, 'salary']
    fig.add_trace(Box(x=skill_salaries, name=skill))

plotly.offline.iplot(fig, show_link=False)

In [8]:
# Horizontal bar chart of how many postings ask for each skill.
# 'count' on the salary column counts postings with a non-null salary;
# rows are ordered by ascending count. `salary_count` is reused by the
# skill summary cell.
salary_count = (
    jobs.groupby(['skill'])
        .agg({'salary': 'count'})
        .rename(columns={'salary': 'salary_count'})
        .sort_values('salary_count', ascending=True)
)

data = [
    Bar(
        x=salary_count['salary_count'],
        y=salary_count.index,
        text=salary_count.index,  # hover text repeats the skill name
        orientation='h',
        marker={'color': 'darkred'},
    )
]
layout = Layout(title="Most Required Computer Skills", width=900, height=800)
fig = Figure(data=data, layout=layout)

plotly.offline.iplot(fig, show_link=False)

In [9]:
# Scatter of posting count vs mean salary per skill, with a fitted line.
# Combine the per-skill count and per-skill mean into one row per skill.
skill_counts = salary_count.reset_index()
salary_summary = skill_counts.merge(salary_mean, on='skill')

demand = salary_summary['salary_count']
mean_pay = salary_summary['salary_mean']

# Degree-1 least-squares fit, evaluated at the observed counts.
p = np.polyfit(demand, mean_pay, 1)
f = np.polyval(p, demand)

observed_trace = Scattergl(x=demand, y=mean_pay, mode = 'markers')
fitted_trace = Scattergl(x=demand, y=f)

data = [observed_trace, fitted_trace]
layout = Layout(title="Skill Required vs Salary", showlegend=False)
fig = Figure(data=data, layout=layout)

plotly.offline.iplot(fig, show_link=False)