In [1]:
# --- Input files (paths are relative, not absolute) ---
DF_PATH = "../data/processed/1_preprocessed_df.pkl"
ROLES_PATH = "../data/raw/roles_short_names.csv"

# --- Display constants ---
# NOTE(review): presumably the label shown for missing answers - confirm at the use site.
NA_STRING = 'Not Specified'
# An rgba() colour whose alpha channel is 0.
TRANSPARENT_STRING = 'rgba(0, 0, 0, 0)'

# --- Column groups ---
ROLE_COLS = ['DevType']

# "WorkedWith" columns, one per technology family.
TECH_COLS = [
    'LanguageWorkedWith',
    'DatabaseWorkedWith',
    'WebframeWorkedWith',
    'MiscTechWorkedWith',
]

# Matching "DesireNextYear" columns, listed in the same family order as TECH_COLS.
TECH_NEXT_COLS = [
    'LanguageDesireNextYear',
    'DatabaseDesireNextYear',
    'WebframeDesireNextYear',
    'MiscTechDesireNextYear',
]

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle

import plotly 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff


from sklearn.manifold import TSNE
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.cluster import AgglomerativeClustering

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

## Read data and preprocess

In [3]:
# Load the two inputs named in the configuration cell.
# NOTE(review): unpickling can execute arbitrary code - only load pickles you trust.
raw_df = pd.read_pickle(filepath_or_buffer=DF_PATH)

# The roles lookup is a semicolon-separated text file.
roles_names = pd.read_csv(filepath_or_buffer=ROLES_PATH, sep=';')

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/roles_short_names.csv'

## One-hot encode

In [4]:
# Work on a copy so the loaded frame stays untouched.
df = raw_df.copy()

# One indicator DataFrame per role/technology column, keyed by the source column name.
encoded_dfs = {}
for col in [*ROLE_COLS, *TECH_COLS]:
    # A fresh binarizer per column: each column gets its own set of classes.
    binarizer = MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(df[col])
    encoded_df = pd.DataFrame(
        indicator_matrix,
        index=df[col].index,
        columns=binarizer.classes_,
    )
    encoded_dfs[col] = encoded_df

In [8]:
# Join the per-column indicator frames side by side; each source column name
# becomes the outer level of the resulting two-level column index.
df = pd.concat(
    list(encoded_dfs.values()),
    axis=1,
    keys=list(encoded_dfs.keys()),
)

## Display skills frequency

In [None]:
# Column totals for every indicator column outside the 'DevType' group,
# flattened into a long table with one row per (group, skill) pair.
skills_freq = (
    df.drop(columns='DevType')
      .sum()
      .rename_axis(['group', 'skill'])
      .reset_index(name='freq')
)

In [None]:
# Show the frequency table as the cell output (columns: group, skill, freq).
skills_freq

In [None]:
# Treemap of skill totals: tiles are nested group -> skill, and the 'freq'
# column drives both the tile size and the tile colour.
fig = px.treemap(
    skills_freq,
    path=['group', 'skill'],
    values='freq',
    color='freq',
    color_continuous_scale='deep',
).update_layout(width=1400, height=700)

fig.show()

### Create Jobs & Skills Heatmap

In [None]:
# Roles ordered by ascending column total.
role_totals = df['DevType'].sum()
sorted_roles = role_totals.sort_values().index.tolist()

# Skills ordered by descending column total; the group level is dropped so
# only the bare skill names remain.
skill_totals = df.drop(columns='DevType').sum()
sorted_skills = (
    skill_totals.sort_values(ascending=False)
                .droplevel(level=0)
                .index
                .tolist()
)

In [None]:
# For each role, the mean of every technology indicator column (times 100)
# over the rows flagged with that role.
role_profiles = {}
for role in sorted_roles:
    role_mask = df[('DevType', role)].eq(1)
    role_rows = df.loc[role_mask]
    role_profiles[role] = pd.concat(
        {tech_col: role_rows[tech_col].mean() * 100 for tech_col in TECH_COLS}
    )

# Assemble one column per role (dict keys follow sorted_roles), drop the
# technology-group index level, order rows by sorted_skills, then transpose
# so that roles are the rows and skills are the columns.
skills = (
    pd.concat(role_profiles, axis=1)
      .reset_index(level=0, drop=True)
      .loc[sorted_skills]
      .T
)

In [None]:
# Heatmap of the role x skill table: x follows the frame's columns,
# y follows its index, and the cell values give the colour.
heatmap_trace = go.Heatmap(
    z=skills,
    x=skills.columns,
    y=skills.index,
    colorscale='magma',
    ygap=1,
)

fig = go.Figure(data=heatmap_trace)
fig.update_layout(width=1600, height=700)
fig.show()

### Create Jobs dendrogram

In [None]:
# Map each original role name to its short name.
# The trailing space in "Short name " is part of the column label used here.
short_name_by_role = roles_names.set_index('Original name')["Short name "]
roles_short_dict = short_name_by_role.to_dict()

# Short labels in the same order as sorted_roles; a role missing from the
# lookup raises KeyError.
short_labels = [roles_short_dict[role_name] for role_name in sorted_roles]