In [1]:
# Pickled survey results loaded by the read-data cell. The path sits under
# /content/drive/MyDrive (presumably a Colab Google Drive mount - confirm).
DATA_PATH = (
    "/content/drive/MyDrive/Colab Notebooks/datasets/"
    "survey_results_public.pkl"
)

In [8]:
# Columns that get one-hot encoded as role labels.
ROLE_COLS = ["DevType"]

# Columns that get one-hot encoded as technology/skill indicators.
TECH_COLS = [
    "LanguageHaveWorkedWith",
    "DatabaseHaveWorkedWith",
    "WebframeHaveWorkedWith",
    "MiscTechHaveWorkedWith",
    "ToolsTechHaveWorkedWith",
]

In [3]:
# Load packages
import pandas as pd
import numpy as np
import logging
import pickle
import os
import yaml

import plotly
import plotly.graph_objects as go
import plotly.express as px

import matplotlib.cm as cm
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.manifold import TSNE

from sklearn.pipeline import make_pipeline
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import silhouette_score

# Let pandas print up to 1000 rows before truncating displayed frames.
pd.set_option("display.max_rows", 1000)

# Read and preprocess data

In [5]:
# Read the pickled survey results from DATA_PATH into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df = pd.read_pickle(DATA_PATH)

In [9]:
# One-hot encode every role and technology column.
def _binarize(labels):
    """Return a 0/1 indicator frame for one multi-label column.

    ``labels`` is a Series whose entries are iterables of labels. The result
    has one column per distinct label and keeps the Series' index.
    """
    mlb = MultiLabelBinarizer()
    indicators = mlb.fit_transform(labels)
    return pd.DataFrame(indicators, columns=mlb.classes_, index=labels.index)


encoded_dfs = {name: _binarize(df[name]) for name in ROLE_COLS + TECH_COLS}

# Concatenate side by side; the dict keys become the outer level of the
# resulting column MultiIndex (source column, label).
ohe_df = pd.concat(encoded_dfs, axis=1)

In [10]:
# Keep only the technology indicators (everything except the 'DevType' group)
# and standardize each indicator column.
scaler = StandardScaler()
skills_ohe = ohe_df.drop(columns='DevType').copy()
std_skills = scaler.fit_transform(skills_ohe)

In [11]:
# Project every skill into 2-D with t-SNE. The standardized matrix is
# transposed so that each skill (a column of skills_ohe) becomes one sample.

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and later
# releases drop the old name, so use whichever keyword this version accepts.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'

# NOTE(review): learning_rate=0.01 is far below the range scikit-learn
# documents as usual (10-1000), and an iteration cap of 10**10 is effectively
# unbounded, leaving termination to TSNE's own stopping criteria - confirm
# both values are intentional.
tsne_model = TSNE(n_components=2,
                  perplexity=3,
                  learning_rate=0.01,
                  init='pca',
                  method='barnes_hut',
                  n_jobs=2,
                  random_state=0,
                  **{tsne_iter_kwarg: 10**10})
tsne_coords = tsne_model.fit_transform(std_skills.T)

# Label each projected point with the skill column it came from.
tsne_projection = pd.DataFrame(tsne_coords, index=skills_ohe.columns)



In [12]:
# Sanity check: one row per skill column of skills_ohe, two t-SNE components.
tsne_projection.shape

(97, 2)

In [13]:
# Scatter the 2-D t-SNE coordinates, annotating each point with its skill name
# (the inner level of the row index; the outer level is the source column).
skill_labels = tsne_projection.index.droplevel(0)
fig = (
    px.scatter(x=tsne_projection[0], y=tsne_projection[1], text=skill_labels)
    .update_traces(textposition='top center')
    .update_layout(title_text='TSNE', width=1000, height=1000)
)
fig.show()