In [2]:
# Relative path to the preprocessed survey pickle loaded by this notebook.
PATH_PROCESSED = '../data/process/stack_overflow_processed.pkl'

# Survey column holding each respondent's developer role(s).
DEVELOPER_TYPE = ['DevType']

# Survey columns listing the technologies a respondent has already worked with.
WORKED_WITH_IN_PAST_YEARS = [
    'DatabaseWorkedWith',
    'LanguageWorkedWith',
    'MiscTechWorkedWith',
    'PlatformWorkedWith',
    'WebframeWorkedWith',
]

# Matching survey columns listing the technologies a respondent wants to use next year.
WANT_TO_WORK_WITH_NEXT_YEAR = [
    'DatabaseDesireNextYear',
    'LanguageDesireNextYear',
    'MiscTechDesireNextYear',
    'PlatformDesireNextYear',
    'WebframeDesireNextYear',
]

## Load important packages

In [3]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from scipy.cluster.hierarchy import dendrogram

from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer, RobustScaler

In [4]:
# Read the preprocessed survey frame from disk and preview its first three rows.
# NOTE(review): unpickling can execute arbitrary code, so PATH_PROCESSED should
# only ever point at a file produced by a trusted step of this project.
data = pd.read_pickle(filepath_or_buffer=PATH_PROCESSED)
data.head(n=3)

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13.0,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",[ASP.NET Core],"[ASP.NET, ASP.NET Core]",Just as welcome now as I felt last year,50.0,36.0,27.0
1,2,I am a developer by profession,No,,19.0,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",[],[],Somewhat more welcome now than last year,,7.0,4.0
2,3,I code primarily as a hobby,Yes,,15.0,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,[],[],Somewhat more welcome now than last year,,4.0,


### One-Hot Encoding for the columns we are interested in

In [12]:
# Encode on a copy so the frame loaded above is left untouched.
dataCopy = data.copy()
dataEncoded = {}

# Build one 0/1 indicator frame per survey column: every distinct label found in
# the column becomes its own indicator column, and the original row index is kept.
# Presumably each cell holds a list of labels (the preview shows list-like
# values) — confirm against the preprocessing step.
for column in [*DEVELOPER_TYPE, *WORKED_WITH_IN_PAST_YEARS]:
    binarizer = MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(dataCopy[column])
    df = pd.DataFrame(
        indicator_matrix,
        index=dataCopy[column].index,
        columns=binarizer.classes_,
    )
    dataEncoded[column] = df

In [13]:
# Put the per-column indicator frames side by side. The source column name
# becomes the outer level of the resulting two-level column index and the
# encoded label becomes the inner level.
encoded_groups = list(dataEncoded)
encoded_frames = [dataEncoded[group] for group in encoded_groups]
dataCopy = pd.concat(encoded_frames, axis=1, keys=encoded_groups)
dataCopy

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,WebframeWorkedWith,WebframeWorkedWith,WebframeWorkedWith,WebframeWorkedWith,WebframeWorkedWith,WebframeWorkedWith,WebframeWorkedWith,WebframeWorkedWith,WebframeWorkedWith,WebframeWorkedWith
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,Designer,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices",...,Express,Flask,Gatsby,Laravel,React.js,Ruby on Rails,Spring,Symfony,Vue.js,jQuery
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64457,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64458,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Count how many respondents selected each technology: drop the developer-role
# indicators, sum every remaining 0/1 column, then flatten the two-level column
# index into a tidy (Group, Technologies, Count) table.
technology_flags = dataCopy.drop(columns=['DevType'])
usage_totals = technology_flags.sum()
skills = (
    usage_totals
    .rename_axis(['Group', 'Technologies'])
    .reset_index(name='Count')
)
skills

Unnamed: 0,Group,Technologies,Count
0,DatabaseWorkedWith,Cassandra,1654
1,DatabaseWorkedWith,Couchbase,937
2,DatabaseWorkedWith,DynamoDB,3497
3,DatabaseWorkedWith,Elasticsearch,6817
4,DatabaseWorkedWith,Firebase,7128
...,...,...,...
85,WebframeWorkedWith,Ruby on Rails,2944
86,WebframeWorkedWith,Spring,6941
87,WebframeWorkedWith,Symfony,1851
88,WebframeWorkedWith,Vue.js,7322


Visualize the number of technologies per group using a treemap

In [17]:
# Treemap of technology usage. The outer tiles are the survey groups (first
# entry of `path`) and the inner tiles the individual technologies; both tile
# area (`values`) and tile colour (`color`) encode the 'Count' column.
# `px` comes from the notebook's import cell, so every dependency is visible up
# front and the cell survives a fresh "Restart & Run All".
fig = px.treemap(
    skills,
    path=['Group', 'Technologies'],
    values='Count',
    color='Count',
    hover_data=['Technologies'],
    color_continuous_scale='RdBu',
    # A title lets the figure stand on its own when the notebook is skimmed.
    title='Technologies worked with, by group',
)
fig.update_layout(width=1400, height=700)
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



***Notice from the treemap that the most used language is JavaScript, the most used platforms are Linux and Windows, the most used database is MySQL, the most used web frameworks are jQuery and React.js, and the most used miscellaneous technology is Node.js.***

## Create a heatmap between jobs and skills

In [30]:
# Developer roles ordered from most to least frequently selected.
role_totals = dataCopy['DevType'].sum()
sorted_type = role_totals.sort_values(ascending=False).index.to_list()

# Technology names (inner column level only, group prefix discarded) ordered
# from most to least frequently selected across all non-role columns.
technology_totals = dataCopy.drop(columns=['DevType']).sum()
ranked_technologies = technology_totals.sort_values(ascending=False)
sorted_skills = ranked_technologies.index.get_level_values(1).to_list()
