This notebook is based on the code samples from https://towardsdatascience.com/elbow-method-is-not-sufficient-to-find-best-k-in-k-means-clustering-fc820da0631d#:~:text=The%20elbow%20method%20is%20a,cluster%20and%20the%20cluster%20centroid.


## Do general imports

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# Use a 14pt base font for all matplotlib figures.
plt.rcParams['font.size'] = 14


## Load Datasets

In [None]:
# --- Clustering configuration -------------------------------------------
# Largest number of clusters evaluated by the cells below.
max_clusters = 7
# Numeric features used for clustering.
columns = ['processing_steps','issue_comments_count','issue_contr_count','wf_total_time']
# Issue types kept in the analysis.
types = ['Ticket', 'Deployment', 'HD Service']

issues_df = pd.read_csv('./temp_data/issues.csv')

# Keep only resolved issues of the selected types, created during 2022, whose
# project key starts with two word characters, two digits and at least one
# more word character.
in_scope = (
    # Raw string avoids invalid-escape warnings; na=False keeps the mask
    # strictly boolean when the project key is missing.
    issues_df['issue_proj'].str.match(r'\w{2}\d{2}\w{1,}', na=False)
    & issues_df['issue_type'].isin(types)
    & (issues_df['issue_created'] >= '2022-01-01')
    # Exclusive upper bound: with a string comparison, '<= 2022-12-31' would
    # drop values such as '2022-12-31 10:00' that carry a time component.
    # NOTE(review): assumes issue_created is an ISO-like (YYYY-MM-DD...) string.
    & (issues_df['issue_created'] < '2023-01-01')
    & issues_df['issue_resolution_date'].notna()
)
issues_df = issues_df[in_scope]

# Standardise every feature (zero mean, unit variance). StandardScaler scales
# each column independently, so one call over the whole frame is equivalent to
# scaling column by column, and building a new frame avoids writing floats
# into the original (possibly integer-typed) columns.
features = issues_df[columns]
df = pd.DataFrame(
    StandardScaler().fit_transform(features),
    columns=features.columns,
    index=features.index,
)
df

In [None]:
# Elbow plot of the distortion score over the candidate cluster counts.
km = KMeans(n_init="auto", random_state=42)

# When clustering on a single feature, cap the search at the number of
# distinct rows in the data.
if df.shape[1] == 1:
    unique_v = len(df.drop_duplicates())
    max_clusters = min(max_clusters, unique_v)

visualizer = KElbowVisualizer(km, k=(2, max_clusters + 1), timings=False)

# Label both axes with the same font size used elsewhere in the notebook.
for set_label, text in (
    (visualizer.ax.set_xlabel, 'k'),
    (visualizer.ax.set_ylabel, 'distortion score'),
):
    set_label(text, fontsize=14)

visualizer.fit(df)   # Fit the data to the visualizer
visualizer.show()    # Finalize and render the figure

In [None]:
# Draw one silhouette plot for every candidate cluster count from 2 up to
# max_clusters (inclusive).
for i in range(2, max_clusters + 1):
    km = KMeans(n_clusters=i, n_init='auto', random_state=42)

    # A dedicated small figure per cluster count.
    fig, ax = plt.subplots(figsize=(6, 2))
    ax.set_ylabel('Size', fontsize=14)
    ax.set_xlabel('Score', fontsize=14)
    ax.set_title(f'{i} clusters', fontsize=14)

    # NOTE(review): only fit() is called (no show()), presumably so the
    # labels and title set above are the ones displayed — confirm.
    visualizer = SilhouetteVisualizer(km, colors=sns.color_palette("tab10"), ax=ax)
    visualizer.fit(df)



In [None]:
# Final model: split the scaled features into four clusters.
km = KMeans(n_clusters=4, n_init='auto', random_state=42)
km.fit(df)
sns.set(font_scale=1)

# Attach each issue's cluster label to the unscaled feature values so the
# plots show the original magnitudes.
df_pp = issues_df[columns].copy()
df_pp['category'] = km.labels_

# One legend entry per cluster (in ascending label order) giving its size.
cat = df_pp['category'].drop_duplicates().sort_values()
labels = [f"{(df_pp['category'] == c).sum()} issues" for c in cat]

# Human-readable axis titles for the pair plot.
display_names = {
    'processing_steps': 'Processing Steps',
    'issue_comments_count': 'Comments count',
    'issue_contr_count': 'Contributors count',
    'wf_total_time': 'Total Time',
}
pair_plot = sns.pairplot(
    df_pp.rename(columns=display_names),
    hue='category',
    palette=sns.color_palette("tab10")[:len(cat)],
    corner=True,
)

# Replace the default legend texts (cluster ids) with the cluster sizes.
lgnd = pair_plot.legend
for position, caption in enumerate(labels):
    lgnd.texts[position].set_text(caption)

In [None]:
# 3D scatter plot of three of the clustering features, coloured by cluster.
ax = plt.figure(figsize=(8, 8)).add_subplot(projection='3d')
colors = sns.color_palette("tab10")
categories = df_pp['category'].drop_duplicates().sort_values()

# Axis specification: (axis label, column name[, explicit tick positions]).
x = ('Workflow Total Time', 'wf_total_time',)
y = ('Comments Count', 'issue_comments_count',)
z = ('Processing Steps', 'processing_steps',)

# Use a dedicated name for the plotting subset: rebinding `df` here would
# overwrite the scaled feature frame that the clustering cells fit on.
plot_df = df_pp[[x[1], y[1], z[1], 'category']]

# One scatter series per cluster, labelled with the cluster's issue count.
for i, c in enumerate(categories):
    df_c = plot_df[plot_df['category'] == c]
    ax.scatter(
        df_c[x[1]], df_c[y[1]], df_c[z[1]],
        color=colors[i],
        label=f'{len(df_c)} issues',
    )

ax.legend()

# Apply explicit ticks only for axes whose spec provides a third element.
for spec, set_ticks in ((x, ax.set_xticks), (y, ax.set_yticks), (z, ax.set_zticks)):
    if len(spec) > 2:
        set_ticks(spec[2])

ax.set_xlabel(x[0])
ax.set_ylabel(y[0])
ax.set_zlabel(z[0])

# Camera position and a slight zoom-out for the rendered view.
ax.view_init(elev=20, azim=-130, roll=0)
ax.set_box_aspect(aspect=None, zoom=0.85)