In [74]:
# --- Input files ------------------------------------------------------------
DF_PATH = "../data/processed/1_preprocessed_df.pkl"
ROLES_PATH = "../data/raw/roles_short_names.csv"

# --- String constants -------------------------------------------------------
# NA_STRING is presumably the label used for missing answers (not referenced
# in the cells shown here); TRANSPARENT_STRING is an RGBA colour with alpha 0.
NA_STRING = 'Not Specified'
TRANSPARENT_STRING = 'rgba(0, 0, 0, 0)'

# --- Survey columns ---------------------------------------------------------
# Column holding the respondent's role(s).
ROLE_COLS = ['DevType']

# Technologies the respondent has worked with.
TECH_COLS = [
    'LanguageWorkedWith',
    'DatabaseWorkedWith',
    'WebframeWorkedWith',
    'MiscTechWorkedWith',
]

# Matching "desire next year" columns, in the same order as TECH_COLS.
TECH_NEXT_COLS = [
    'LanguageDesireNextYear',
    'DatabaseDesireNextYear',
    'WebframeDesireNextYear',
    'MiscTechDesireNextYear',
]

In [75]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle

import plotly 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.renderers.default = 'iframe' # or 'notebook' or 'colab' or 'jupyterlab'

from sklearn.manifold import TSNE
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.cluster import AgglomerativeClustering

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

In [97]:
# Read data.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# only load pickles produced by this project's own preprocessing step.
raw_df = pd.read_pickle(DF_PATH)
# Semicolon-separated lookup table; later cells read its 'Original name' and
# 'Short name ' columns to shorten role labels.
roles_names = pd.read_csv(ROLES_PATH, sep=';')

In [107]:
# Work on a copy so the loaded frame stays untouched.
df = raw_df.copy()

# One-hot encode every multi-label column (roles first, then technologies).
# Each entry of `encoded_dfs` is a 0/1 indicator frame with one column per
# distinct label, aligned to the respondent index.
encoded_dfs = {}
for col in [*ROLE_COLS, *TECH_COLS]:
    binarizer = MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(df[col])
    encoded_df = pd.DataFrame(indicator_matrix,
                              index=df[col].index,
                              columns=binarizer.classes_)
    encoded_dfs[col] = encoded_df

    # Show the labels found in this column, followed by a separator and a blank line.
    print(binarizer.classes_, '-----------------------', '', sep='\n')

['Academic researcher' 'Data or business analyst'
 'Data scientist or machine learning specialist' 'Database administrator'
 'Designer' 'DevOps specialist' 'Developer, QA or test'
 'Developer, back-end' 'Developer, desktop or enterprise applications'
 'Developer, embedded applications or devices' 'Developer, front-end'
 'Developer, full-stack' 'Developer, game or graphics' 'Developer, mobile'
 'Educator' 'Engineer, data' 'Engineer, site reliability'
 'Engineering manager' 'Marketing or sales professional' 'Product manager'
 'Scientist' 'Senior executive/VP' 'System administrator']
-----------------------

['Assembly' 'Bash/Shell/PowerShell' 'C' 'C#' 'C++' 'Dart' 'Go' 'HTML/CSS'
 'Haskell' 'Java' 'JavaScript' 'Julia' 'Kotlin' 'Objective-C' 'PHP' 'Perl'
 'Python' 'R' 'Ruby' 'Rust' 'SQL' 'Scala' 'Swift' 'TypeScript' 'VBA']
-----------------------

['Cassandra' 'Couchbase' 'DynamoDB' 'Elasticsearch' 'Firebase' 'IBM DB2'
 'MariaDB' 'Microsoft SQL Server' 'MongoDB' 'MySQL' 'Oracle' 'PostgreS

In [108]:
# `encoded_df` still holds the frame from the final iteration of the encoding
# loop, i.e. the indicator columns for the last entry of TECH_COLS.
encoded_df

Unnamed: 0,.NET,.NET Core,Ansible,Apache Spark,Chef,Cordova,Flutter,Hadoop,Keras,Node.js,Pandas,Puppet,React Native,TensorFlow,Teraform,Torch/PyTorch,Unity 3D,Unreal Engine,Xamarin
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64456,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64457,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64458,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64459,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [78]:
# Join the per-column indicator frames side by side. The dict keys (survey
# column names) become the outer level of a two-level column index, with the
# individual labels underneath. Note that this rebinds `df`, which previously
# held the copy of raw_df.
df = pd.concat(encoded_dfs, axis=1)


In [79]:
# Preview the combined indicator frame (first rows only).
df.head()

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith,MiscTechWorkedWith
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,Designer,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices",...,Node.js,Pandas,Puppet,React Native,TensorFlow,Teraform,Torch/PyTorch,Unity 3D,Unreal Engine,Xamarin
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
# Count how many respondents ticked each technology. The DevType indicators
# are dropped first, so only the technology groups remain. The two column
# levels become the 'group' and 'skill' columns and the counts land in 'freq'.
skills_freq = (
    df.drop(columns='DevType')
      .sum()
      .rename_axis(['group', 'skill'])
      .reset_index(name='freq')
)

In [81]:
# One row per technology: the survey column it came from (group), its name
# (skill) and the number of respondents who selected it (freq).
skills_freq

Unnamed: 0,group,skill,freq
0,LanguageWorkedWith,Assembly,3553
1,LanguageWorkedWith,Bash/Shell/PowerShell,18980
2,LanguageWorkedWith,C,12487
3,LanguageWorkedWith,C#,18041
4,LanguageWorkedWith,C++,13707
...,...,...,...
69,MiscTechWorkedWith,Teraform,2518
70,MiscTechWorkedWith,Torch/PyTorch,1872
71,MiscTechWorkedWith,Unity 3D,4413
72,MiscTechWorkedWith,Unreal Engine,1316


In [82]:
# Treemap of technology usage: tiles are nested group -> skill, and both the
# tile area and the tile colour encode the respondent count.
fig = px.treemap(
    skills_freq,
    path=['group', 'skill'],
    values='freq',
    color='freq',
    color_continuous_scale='deep',
)
fig.update_layout(width=1200, height=700)
fig.show()

In [83]:
# Roles ordered from least to most frequent.
role_counts = df['DevType'].sum()
sorted_roles = role_counts.sort_values().index.tolist()

# Skills ordered from most to least frequent. The group level of the index is
# dropped so that only the bare skill names remain.
skill_counts = df.drop('DevType', axis=1).sum()
sorted_skills = skill_counts.sort_values(ascending=False).index.droplevel(0).tolist()

In [84]:
def _skill_shares_for_role(role_name):
    """Return, for one role, the percentage of its respondents using each skill.

    The result is a Series indexed by (technology group, skill).
    """
    has_role = df[('DevType', role_name)] == 1
    return pd.concat({group: df.loc[has_role, group].mean() * 100
                      for group in TECH_COLS})


# One column per role (in sorted_roles order), one row per (group, skill).
skills = pd.concat([_skill_shares_for_role(role_name) for role_name in sorted_roles],
                   axis=1, keys=sorted_roles)

# Drop the group level, order the skills by overall popularity, then flip the
# table so that roles are rows and skills are columns.
skills = skills.droplevel(0).loc[sorted_skills].T

In [89]:
# Rows are roles and columns are skills; each cell is the percentage of
# respondents with that role who use the skill.
skills

Unnamed: 0,JavaScript,HTML/CSS,SQL,MySQL,Python,Java,Node.js,Bash/Shell/PowerShell,jQuery,C#,...,Gatsby,Cassandra,IBM DB2,Drupal,Unreal Engine,Haskell,Puppet,Couchbase,Chef,Julia
Marketing or sales professional,71.028037,76.635514,57.788162,61.370717,38.629283,32.242991,41.744548,31.152648,45.482866,25.700935,...,9.034268,5.919003,5.76324,9.968847,6.853583,4.205607,4.517134,6.23053,4.672897,4.049844
Senior executive/VP,72.878788,69.166667,65.075758,49.545455,45.984848,33.333333,43.787879,44.393939,36.439394,34.848485,...,5.227273,6.590909,5.227273,6.060606,4.318182,3.409091,4.848485,5.227273,3.939394,2.727273
"Engineer, site reliability",65.876289,58.505155,61.701031,53.762887,59.896907,39.948454,42.628866,61.494845,30.979381,26.134021,...,4.896907,8.865979,3.71134,5.206186,3.14433,3.969072,9.278351,3.659794,8.041237,2.010309
Scientist,48.465964,47.027804,43.6721,39.213806,67.689358,32.981783,25.982742,42.425695,25.071908,22.339406,...,2.924257,4.122723,3.499521,3.691275,4.026846,4.554171,2.301055,3.259827,2.157239,5.465005
Product manager,73.007609,69.603524,63.115739,52.422907,42.010412,36.28354,42.571085,37.484982,41.529836,34.521426,...,4.805767,4.32519,3.92471,5.246296,3.203845,2.282739,3.003604,3.123748,2.643172,1.561874
Engineering manager,69.346549,60.682819,60.425844,47.099853,46.512482,37.995595,43.465492,45.888399,30.837004,33.333333,...,4.809104,6.461087,3.817915,4.662261,2.679883,2.679883,4.515419,4.001468,4.331865,1.578561
"Developer, game or graphics",67.228397,62.459663,49.26497,48.045895,43.850843,41.376838,41.233417,35.353173,34.313374,53.531732,...,3.728935,2.653281,2.617426,3.657225,16.09896,3.155253,2.402295,3.011832,2.007888,1.398351
Educator,68.40847,66.803279,56.625683,53.85929,45.252732,39.993169,39.583333,38.18306,37.363388,29.576503,...,4.474044,3.51776,3.790984,4.678962,3.381148,4.20082,2.834699,2.766393,2.288251,2.185792
Academic researcher,54.19482,53.462838,46.706081,47.015766,59.037162,39.864865,28.293919,36.768018,29.898649,24.239865,...,2.533784,3.800676,3.040541,3.293919,2.956081,4.504505,2.111486,2.505631,1.52027,4.335586
"Engineer, data",56.902087,52.594971,67.17496,49.678973,65.088283,40.449438,31.942215,43.900482,29.507758,26.645265,...,2.782236,7.490637,4.333868,3.210273,2.70198,2.835741,2.808989,3.504548,2.3542,2.247191


In [86]:
# Heatmap of raw usage percentages: one row per role, one column per skill.
heatmap = go.Heatmap(
    z=skills,
    x=skills.columns,
    y=skills.index,
    colorscale='magma',
    ygap=1,
)
fig = go.Figure(data=heatmap)
fig.update_layout(width=1600, height=700)
fig.show()

In [98]:
# Map each full survey role name to its short display label.
# The lookup CSV carries stray trailing blanks in some values (for example
# 'Data Scientist '), which would otherwise leak into plot labels, so the
# values are stripped here. The column key keeps its trailing space because
# that is how the header is spelled in the file.
roles_short_dict = (
    roles_names.set_index('Original name')["Short name "]
               .str.strip()
               .to_dict()
)

# Labels follow the order of sorted_roles. A role that is missing from the
# lookup falls back to its full name instead of raising a KeyError.
short_labels = [roles_short_dict.get(role_name, role_name)
                for role_name in sorted_roles]


In [99]:
# Inspect the full-name -> short-label mapping.
roles_short_dict

{'Developer, back-end': 'Back-end dev',
 'Developer, full-stack': 'Full-stack dev',
 'Developer, front-end': 'Front-end dev',
 'Developer, desktop or enterprise applications': 'Desktop dev',
 'Developer, mobile': 'Mobile dev',
 'DevOps specialist': 'DevOps',
 'Database administrator': 'Database admin',
 'Designer': 'Designer',
 'System administrator': 'System admin',
 'Developer, embedded applications or devices': 'Embedded dev',
 'Data or business analyst': 'Analyst',
 'Data scientist or machine learning specialist': 'Data Scientist ',
 'Developer, QA or test': 'Quality Assurance',
 'Engineer, data': 'Data Engineer',
 'Academic researcher': 'Researcher',
 'Educator': 'Educator',
 'Developer, game or graphics': 'Game dev',
 'Engineering manager': 'Engineering manager',
 'Product manager': 'Product manager',
 'Scientist': 'Scientist',
 'Engineer, site reliability': 'Site Engineer',
 'Senior executive/VP': 'Senior Executive',
 'Marketing or sales professional': 'MarketingSales'}

In [100]:
# Dendrogram of the roles, built from their skill-usage profiles.
# short_labels is in sorted_roles order, which is also the row order of `skills`.
fig = ff.create_dendrogram(
    skills,
    orientation='left',
    labels=short_labels,
    color_threshold=0,
)
fig.update_layout(width=600, height=600, showlegend=False)
fig.show()


In [101]:
# Standardize every skill column across roles (zero mean, unit variance), then
# wrap the resulting array back into a frame with the same labels as `skills`.
scaler = StandardScaler()
std_skills = pd.DataFrame(
    scaler.fit_transform(skills),
    index=skills.index,
    columns=skills.columns,
)


In [112]:
# Same layout as `skills`, but each column holds standardized scores, so a
# value shows how far a role sits from the across-role average for that skill.
std_skills

Unnamed: 0,JavaScript,HTML/CSS,SQL,MySQL,Python,Java,Node.js,Bash/Shell/PowerShell,jQuery,C#,...,Gatsby,Cassandra,IBM DB2,Drupal,Unreal Engine,Haskell,Puppet,Couchbase,Chef,Julia
Marketing or sales professional,0.433816,1.510025,-0.260708,2.216239,-0.766784,-1.369926,0.528826,-1.040611,1.505056,-1.020478,...,3.452427,0.903457,2.315078,3.687879,1.109495,1.468417,0.763845,3.170701,1.239791,1.589551
Senior executive/VP,0.636243,0.649598,0.622312,-0.185986,-0.14033,-1.1014,0.843117,0.472389,0.076494,0.163804,...,0.944052,1.288963,1.714835,1.193263,0.218279,0.627067,0.952938,2.187051,0.785153,0.621855
"Engineer, site reliability",-0.12966,-0.578629,0.213405,0.670758,1.044524,0.527746,0.664846,2.426403,-0.786003,-0.964409,...,0.726379,2.594284,0.017105,0.64789,-0.19434,1.218568,3.480932,0.650205,3.327545,0.097269
Scientist,-2.033926,-1.900843,-1.971116,-2.284794,1.708186,-1.187979,-1.895546,0.24749,-1.719184,-1.455676,...,-0.573372,-0.127158,-0.220117,-0.319072,0.115872,1.836601,-0.500806,0.258055,-0.319457,2.624992
Product manager,0.650333,0.699925,0.384822,0.39855,-0.478822,-0.374835,0.655958,-0.317055,0.880611,0.121461,...,0.666328,-0.010992,0.256063,0.673492,-0.17342,-0.562685,-0.099882,0.124635,-0.018268,-0.230841
Engineering manager,0.249902,-0.327758,0.058894,-0.682794,-0.095392,0.046803,0.79353,0.643151,-0.808494,-0.032354,...,0.668527,1.214478,0.136461,0.300705,-0.357597,-0.143187,0.762866,0.985202,1.028413,-0.218632
"Developer, game or graphics",0.018228,-0.123062,-1.293442,-0.490612,-0.322077,0.879523,0.450207,-0.560643,-0.259346,2.582619,...,-0.043181,-0.97025,-1.207997,-0.340806,4.359325,0.35894,-0.443031,0.014906,-0.412027,-0.350488
Educator,0.147299,0.377331,-0.401562,0.690342,-0.202682,0.538758,0.196403,-0.23729,0.222453,-0.51873,...,0.447761,-0.474256,0.1063,0.311365,-0.111096,1.46336,-0.196271,-0.225736,-0.238253,0.225666
Academic researcher,-1.407328,-1.159514,-1.603496,-0.699876,0.971301,0.50716,-1.540057,-0.398978,-0.956723,-1.209634,...,-0.830649,-0.311933,-0.73414,-0.572703,-0.260511,1.784139,-0.608987,-0.481403,-0.714261,1.798622
"Engineer, data",-1.111219,-1.259494,0.876667,-0.158863,1.486659,0.651127,-0.978901,0.416004,-1.01847,-0.898221,...,-0.666947,1.805182,0.71429,-0.626094,-0.34983,0.021444,-0.210943,0.497993,-0.197377,0.27059


In [104]:
# Heatmap of the standardized scores. std_skills was built with the same row
# and column labels as `skills`, so its own labels are used for the axes.
std_heatmap = go.Heatmap(
    z=std_skills,
    x=std_skills.columns,
    y=std_skills.index,
    colorscale='magma',
    ygap=1,
)
fig = go.Figure(data=std_heatmap)
fig.update_layout(width=1200, height=700)
fig.show()


In [109]:
# Role inspected in the following cells; it is used as a row label of both
# `skills` and `std_skills`, so it must match a full role name exactly.
role = 'Data scientist or machine learning specialist'


In [110]:
# For the chosen role, pair each skill's usage percentage with its
# standardized score, ordered by ascending percentage.
single_role_skills = pd.DataFrame({
    'percentage': skills.loc[role],
    'specificity': std_skills.loc[role],
}).sort_values('percentage')

In [114]:
# Minimum share (in percent) of the role's respondents that must use a skill
# for it to appear in the chart.
threshold = 10

single_role_skills = single_role_skills[single_role_skills['percentage'] > threshold]

# Plot from the frame that actually holds the bar data. The original call
# passed the unrelated one-hot frame `df` as data_frame while every plotted
# value came from single_role_skills. The index is turned into a named column
# so that all encodings are plain column references and the y axis is labelled.
bar_data = single_role_skills.rename_axis('skill').reset_index()

fig = px.bar(bar_data,
             x='percentage',
             y='skill',
             color='specificity',
             color_continuous_scale='orrd',
             # Colour range spans all roles so charts for different roles are comparable.
             range_color=[std_skills.values.min(), std_skills.values.max()],
             orientation='h')

fig.update_layout(width=800, height=800, title=role)
fig.show()