In [1]:
from os.path import join
import pandas as pd
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sys import path
path.insert(0, '..')
from utils import format_decision_tree_plot
from sklearn.metrics import silhouette_score, calinski_harabasz_score

# Configurations

In [2]:
# When True, figures are written to PNG files and tables to an Excel workbook.
SAVE_FIGURES = False
# When True, figures and styled tables are rendered in the notebook output;
# when False, figures are closed without being shown.
PRINT_OUTPUT = False
# Passed as random_state to KMeans and DecisionTreeClassifier.
SEED = 42

# Arranging Input Data

In [3]:
# Load the skills table from the parent directory. The two identifying
# columns become the index, so the remaining columns are what gets clustered.
skills_csv_path = join('..', 'onet_skills_unstacked.csv')
onet_skills_unstacked = pd.read_csv(
    skills_csv_path,
    index_col=['O*NET-SOC Code', 'Title'],
)

# Finding Ideal Number of Clusters

In [6]:
max_num_clusters_trying = 20

# Candidate cluster counts. NOTE: range() excludes its upper bound, so the
# largest k actually fitted is max_num_clusters_trying - 1.
cluster_counts = list(range(2, max_num_clusters_trying))

# Fit on the feature columns only. A later cell adds a 'Cluster' label column
# to this frame; errors='ignore' keeps this cell safe to re-run afterwards
# without the labels leaking into the features.
elbow_features = onet_skills_unstacked.drop(columns='Cluster', errors='ignore').values

# Record the within-cluster sum of squares (inertia) for each candidate k.
losses = []
for num_clusters in cluster_counts:
    model = KMeans(
        n_clusters=num_clusters,
        init='k-means++',
        n_init=10,
        max_iter=300,
        algorithm='lloyd',
        random_state=SEED
    ).fit(elbow_features)
    losses.append(model.inertia_)

figure = plt.figure()
axes = figure.add_subplot(111)
axes.set_title('Elbow Plot')
axes.set_xlabel('Number of Clusters')
axes.set_ylabel('Sum of Squared Distances of Points to their Centroid')

# Plot against the real cluster counts so tick positions and tick labels
# come from the same sequence and cannot drift apart.
axes.plot(cluster_counts, losses, marker='o')
axes.set_xticks(cluster_counts)

if SAVE_FIGURES:
    figure.savefig('onet_skills_elbow_plot.png')
if PRINT_OUTPUT:
    # pyplot.show() accepts only the keyword argument `block`; passing the
    # figure positionally raises TypeError.
    plt.show()
else:
    plt.close(figure)

My judgement is that around 10 clusters is where the plot seems to plateau.

# Clustering

In [7]:
# Final clustering model.
# NOTE(review): the elbow-plot commentary above suggests ~10 clusters, but
# 15 are fitted here -- confirm which value is intended.
model = KMeans(
    n_clusters=15,
    init='k-means++',
    n_init=10,
    max_iter=300,
    algorithm='lloyd',
    random_state=SEED
).fit(
    # The next cell adds a 'Cluster' label column to this frame. Dropping it
    # if present keeps this cell safe to re-run without fitting on the labels.
    onet_skills_unstacked.drop(columns='Cluster', errors='ignore').values
)

In [8]:
# Attach each row's fitted cluster label as a new 'Cluster' column.
# This mutates the loaded frame in place, which is why later cells call
# .drop(columns='Cluster') whenever they need the feature columns only.
onet_skills_unstacked['Cluster'] = model.labels_

# Analyzing Clusters

## Assessing Fit

In [9]:
# Split the frame once into feature columns and assigned labels, then report
# both cluster-quality metrics on that same pair.
cluster_features = onet_skills_unstacked.drop(columns='Cluster')
cluster_labels = onet_skills_unstacked['Cluster']

silhouette = silhouette_score(cluster_features, cluster_labels)
print(f"Silhouette Score: {silhouette}")

calinski_harabasz = calinski_harabasz_score(cluster_features, cluster_labels)
print(f"Calinski Harabasz Score: {calinski_harabasz}")

Silhouette Score: 0.1499923737522422
Calinski Harabasz Score: 174.8618735246516


## Visualizing with Decision Trees

The idea of visualizing cluster models with decision trees is taken from <url>https://docs.interpretable.ai/stable/examples/clustering/</url>

In [10]:
# Train an unrestricted-depth tree to predict the cluster label from the
# feature columns, so its splits can be read as a description of the clusters.
tree_features = onet_skills_unstacked.drop(columns='Cluster').values
tree_targets = onet_skills_unstacked['Cluster'].values

decision_tree = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    random_state=SEED,
)
decision_tree.fit(X=tree_features, y=tree_targets)

In [11]:
# Very wide canvas so the nodes of the unrestricted-depth tree stay legible.
figure = plt.figure(figsize=(70, 12))
axes = figure.add_subplot(111)

plot_tree(
    decision_tree,
    # Pass a plain list: it is accepted by every scikit-learn release, while
    # a pandas Index is (to my knowledge) rejected by some of them.
    feature_names=list(onet_skills_unstacked.drop(columns='Cluster').columns),
    impurity=True,
    fontsize=7,
    rounded=True,
    filled=True,
    ax=axes
)

# Project helper imported from utils; presumably adjusts the rendered tree's
# appearance -- see utils.format_decision_tree_plot for what it changes.
format_decision_tree_plot(axes)

if SAVE_FIGURES:
    figure.savefig('onet_skills_decision_tree.png')
if PRINT_OUTPUT:
    # Figure.show() only warns on non-GUI (inline) backends; pyplot.show()
    # works there too and matches the elbow-plot cell.
    plt.show()
else:
    plt.close(figure)

## Descriptive Statistics of the Clusters

In [12]:
# Open the output workbook only when saving is enabled. The following cells
# add one sheet each, and the final cell closes the writer.
# NOTE(review): if any cell in between fails, the writer is left open.
if SAVE_FIGURES:
    excel_writer = pd.ExcelWriter('optimalKMeansClustering.xlsx')

In [13]:
# Cluster centres as a table: one row per cluster, one column per feature.
centroids = pd.DataFrame(
    data=model.cluster_centers_,
    index=pd.Series(range(model.n_clusters), name='Cluster'),
    columns=onet_skills_unstacked.drop(columns='Cluster').columns,
)
if SAVE_FIGURES:
    # sheet_name is passed by keyword: positional use is deprecated in recent
    # pandas, and this matches the 'Cluster Assignments' export below.
    centroids.to_excel(excel_writer, sheet_name='Cluster Centers')
if PRINT_OUTPUT:
    display(centroids.style)

In [15]:
# Z-score every feature column across the clusters (sample standard
# deviation, pandas' default) so columns are comparable with each other.
centroids_standardized = (centroids - centroids.mean()) / centroids.std()


def _highlight_high_centroids(col, threshold=0.75):
    """Return one CSS style (or None) per cell of a standardized column.

    The column maximum is shaded dark green, any other value above
    ``threshold`` is shaded light green, and everything else is unstyled.
    """
    col_max = col.max()  # computed once instead of once per cell
    return [
        "background-color:mediumseagreen;" if z == col_max
        else "background-color:lightgreen;" if z > threshold
        else None
        for z in col
    ]


centroids_standardized_display = centroids_standardized.style.apply(
    _highlight_high_centroids,
    axis=0
)

if SAVE_FIGURES:
    # sheet_name is passed by keyword: positional use is deprecated in recent
    # pandas, and this matches the 'Cluster Assignments' export below.
    centroids_standardized_display.to_excel(excel_writer, sheet_name='Centroids Standardized')
if PRINT_OUTPUT:
    display(centroids_standardized_display)

In [16]:
# List every occupation under its cluster: sort by label, move the
# (code, title) index levels back into columns, then index by cluster.
cluster_assignments = (
    onet_skills_unstacked['Cluster']
    .sort_values()
    .reset_index()
    .set_index('Cluster')
)
cluster_assignments_display = cluster_assignments[['Title', 'O*NET-SOC Code']].style

if SAVE_FIGURES:
    cluster_assignments_display.to_excel(excel_writer, sheet_name='Cluster Assignments')
if PRINT_OUTPUT:
    display(cluster_assignments_display)

In [32]:
# Close the workbook opened earlier so the sheets written above are saved.
# Guarded by the same flag that controls whether the writer was created.
if SAVE_FIGURES:
    excel_writer.close()