In [None]:
%matplotlib notebook
import ast
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from itertools import cycle
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AffinityPropagation, OPTICS, DBSCAN, cluster_optics_dbscan
from gensim.models.keyedvectors import KeyedVectors

# Select seaborn's 'darkgrid' look for the figures drawn in this notebook.
sns.set_style('darkgrid')

# Load data

In [None]:
# Load code vectors and labels.
# Every line of the JSONL file is decoded twice: json.loads first, then
# ast.literal_eval on its result (presumably each line is a JSON-encoded string
# that holds a Python-literal record -- confirm against the producer of the file).
with open('./data/output_large.jsonl') as f:
    lines = f.read().splitlines()

df_inter = pd.DataFrame(lines)
df_inter.columns = ['json_element']
# Parse each line exactly once; the original parsed the whole column an extra
# time and threw the result away.
df = pd.json_normalize(df_inter['json_element'].apply(json.loads).apply(ast.literal_eval))
df

In [None]:
# Gather the per-row `features` entries into a single numpy array.
data = np.array(df['features'].tolist())

In [None]:
# Reduce the code vectors to their first 100 principal components.
# random_state is pinned so that reruns give the same projection even when
# PCA's automatic solver selection picks the randomized SVD solver.
pca = PCA(n_components=100, random_state=0)
pca_result = pca.fit_transform(data)

In [None]:
# Report and plot how much variance the retained principal components explain.
n_pcs = len(pca.explained_variance_ratio_)
print(f"Cumulative explained variation for {n_pcs} principal components: {np.sum(pca.explained_variance_ratio_)}")
plt.figure(figsize=(10,8))
# The x axis is a component COUNT, so it runs from 1 to n_pcs: the first
# cumulative value belongs to "1 component", not "0 components".
plt.plot(np.arange(1, n_pcs + 1), np.cumsum(pca.explained_variance_ratio_))
plt.ylim([0, 1])
plt.ylabel('Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.tight_layout()

# Visualize the dataset with t-SNE & PCA

In [None]:
# Use to limit the number of samples (if necessary for computational reasons)
# rndperm = np.random.permutation(df.shape[0])
# N = 10000
# df_small = df.loc[rndperm[:N],:].copy()

# t-SNE on the PCA-reduced vectors of the whole dataset (all labels).
# t-SNE is stochastic; random_state is pinned so the embeddings (and every
# figure built from them) are reproducible across kernel restarts.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version before upgrading.
tsne2d = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=3000, random_state=0)
tsne_result_2d_all = tsne2d.fit_transform(pca_result)

tsne3d = TSNE(n_components=3, verbose=1, perplexity=30, n_iter=3000, random_state=0)
tsne_result_3d_all = tsne3d.fit_transform(pca_result)

In [None]:
# Side-by-side 3D views of the whole dataset: t-SNE embedding (left) and the
# first three principal components (right).
fig = plt.figure(figsize=(14,5))
ax1 = fig.add_subplot(1, 2, 1, projection='3d')
ax2 = fig.add_subplot(1, 2, 2, projection='3d')

ax1.set_title('Dataset visualized with t-SNE')
tsne_x, tsne_y, tsne_z = (tsne_result_3d_all[:, axis] for axis in range(3))
ax1.scatter(tsne_x, tsne_y, tsne_z, alpha=0.1)

ax2.set_title('Dataset visualized with PCA')
pc_x, pc_y, pc_z = (pca_result[:, axis] for axis in range(3))
ax2.scatter(pc_x, pc_y, pc_z, alpha=0.1)

# Attempt k-means clustering on the large dataset
Use the elbow method (the within-cluster sum of squared distances from each sample to its nearest cluster centre) and the silhouette score to try to find a good value for k.

## Silhouette Score
Meaning of the value:
 - 1: Clusters are dense and clearly separated from each other.
 - 0: Clusters overlap; samples lie on or near the boundary between neighbouring clusters.
 - -1: Samples have been assigned to the wrong cluster.

In [None]:
# Search for a good k: fit K-means for every candidate cluster count and record
# the inertia (for the elbow plot) and the silhouette score.
sum_of_squared_distances = []
silhouette_scores = []
K = range(2,50,2)
# The loop deliberately avoids the names `k` and `km`: `k` is the notebook-wide
# cluster count used by later cells and `km` is the kmapper module alias, so
# leaking either from this loop would break those cells when this one is re-run.
for n_candidate in K:
    candidate = KMeans(n_clusters=n_candidate, random_state=0).fit(pca_result)
    sum_of_squared_distances.append(candidate.inertia_)
    silhouette_scores.append(metrics.silhouette_score(pca_result, candidate.labels_, metric='sqeuclidean'))

In [None]:
# Elbow curve (left) and silhouette curve (right) over the candidate values of k.
fig = plt.figure(figsize=(14,5))

ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(K, sum_of_squared_distances, 'bx-')
ax1.set_title('Elbow method for optimal k')
ax1.set_ylabel('Sum of squared distances')
ax1.set_xlabel('k')

ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(K, silhouette_scores, 'bx-')
ax2.set_title('Silhouette score for optimal k')
ax2.set_xlabel('k')
ax2.set_ylabel('Silhouette Score')

# Only consider a set of chosen methods

In [None]:
# Randomly select k frequently occuring labels to consider for analysis
# k = 5
# Only consider labels that are "frequently" occuring
# df_subset = df[(df['count'] >= 100) & (df['count'] <= 3000)]
# np.random.seed(20) #31
# cats = np.random.choice(df_subset.label.unique(), size=k, replace=False)
# df_subset = df_subset[df_subset.label.isin(cats)]

# manually choose
cats = ['predict', 'train', 'update', 'preprocess', 'save', 'transform']
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice of `df` (SettingWithCopyWarning / lost writes).
df_subset = df[df.method_name.isin(cats)].copy()

# Assign numerical values for each category
df_subset['method_name'] = pd.Categorical(df_subset['method_name'])
df_subset['category'] = df_subset['method_name'].cat.codes
unique_labels = dict(enumerate(df_subset['method_name'].cat.categories))
# k is the number of categories actually present in the data. It equals
# len(cats) when every chosen name occurs; if one is absent, the plotting loops
# over range(k) would otherwise hit a KeyError on unique_labels[klass].
k = len(unique_labels)
print(f"Unique labels:{unique_labels}\nNumber of samples {len(df_subset)}")

# Use to limit the number of samples (if necessary for computational reasons)
# rndperm = np.random.permutation(df.shape[0])
# N = 10000
# df_subset = df_subset.loc[rndperm[:N],:].copy()

data_subset = np.array(df_subset.features.values.tolist())

## Check for similarities and analogies

In [None]:
# Show the first row of every method name as a quick look at each category.
(
    df_subset
    .sort_values('category')
    .groupby('method_name')
    .first()
)

In [None]:
# Compare the code vectors of two method names: pairwise cosine similarity
# between every `word1` vector and every `word2` vector, then summary statistics.
word1 = 'save'
word2 = 'transform'
vectors_word1 = np.array(df_subset.loc[df_subset['method_name'] == word1, 'features'].tolist())
vectors_word2 = np.array(df_subset.loc[df_subset['method_name'] == word2, 'features'].tolist())
cosine_similarities = metrics.pairwise.cosine_similarity(vectors_word1, vectors_word2)
pd.DataFrame(cosine_similarities).describe()

## PCA

In [None]:
# PCA on the selected subset only.
# NOTE: this rebinds `pca` / `pca_result`, which until now held the projection
# of the full dataset; every later cell sees the subset projection.
# random_state is pinned so that reruns give the same projection even when
# PCA's automatic solver selection picks the randomized SVD solver.
pca = PCA(n_components=100, random_state=0)
pca_result = pca.fit_transform(data_subset)

n_pcs = len(pca.explained_variance_ratio_)
print(f"Cumulative explained variation for {n_pcs} principal components: {np.sum(pca.explained_variance_ratio_)}")
plt.figure(figsize=(10,8))
# The x axis is a component COUNT, so it runs from 1 to n_pcs: the first
# cumulative value belongs to "1 component", not "0 components".
plt.plot(np.arange(1, n_pcs + 1), np.cumsum(pca.explained_variance_ratio_))
plt.ylim([0, 1])
plt.ylabel('Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.tight_layout()

## t-SNE

In [None]:
# t-SNE embeddings (2D and 3D) of the PCA-reduced subset.
# t-SNE is stochastic; random_state is pinned so the embeddings (and every
# figure built from them) are reproducible across kernel restarts.
tsne2d = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=3000, random_state=0)
tsne_result_2d = tsne2d.fit_transform(pca_result)

tsne3d = TSNE(n_components=3, verbose=1, perplexity=30, n_iter=3000, random_state=0)
tsne_result_3d = tsne3d.fit_transform(pca_result)

## Agglomerative Clustering

In [None]:
def plot_clustering(data, labels, title=None):
    """Plot predicted clusters next to the true labels in the 3D t-SNE embedding.

    The point coordinates come from the notebook global ``tsne_result_3d``; the
    true classes, their names and the class count come from the globals
    ``df_subset['category']``, ``unique_labels`` and ``k``.

    Parameters
    ----------
    data : unused
        Kept only so existing calls keep working; the plot always uses
        ``tsne_result_3d``.
    labels : array-like
        Predicted cluster id per sample, in the same row order as
        ``tsne_result_3d``.
    title : str, optional
        Extra text appended to the title of the predicted-clusters panel.
    """
    labels = np.asarray(labels)
    true_categories = np.asarray(df_subset['category'])

    fig = plt.figure(figsize=(14,5))
    fig.suptitle('Visualizing clusters with t-SNE (3D)')

    # Both panels need 3D axes: on plain 2D axes the third positional array
    # passed to scatter() is interpreted as marker sizes, not z coordinates.
    ax1 = fig.add_subplot(121, projection='3d')
    left_title = f'Agglomerative Clusters (k={k})'
    if title is not None:
        left_title = f'{left_title} - {title}'
    ax1.set_title(left_title)
    for klass, color in zip(range(0, k), cycle('bgrcmyk')):
        Xk = tsne_result_3d[labels == klass]
        # Cluster ids are arbitrary and do not correspond to true categories,
        # so they are labelled by id rather than by a category name.
        ax1.scatter(Xk[:, 0], Xk[:, 1], Xk[:, 2], c=color, alpha=0.3, label=f'cluster {klass}')

    ax2 = fig.add_subplot(122, projection='3d')
    ax2.set_title('True labels')
    for klass, color in zip(range(0, k), cycle('bgrcmyk')):
        Xk = tsne_result_3d[true_categories == klass]
        ax2.scatter(Xk[:, 0], Xk[:, 1], Xk[:, 2], c=color, alpha=0.3, label=unique_labels[klass])

    ax2.legend()

In [None]:
# Agglomerative (hierarchical) clustering
from time import time
from sklearn.cluster import AgglomerativeClustering
# `linkage` was referenced in the plot title but never defined (NameError);
# define it once and use it for both the estimator and the title.
linkage = 'ward'
clustering = AgglomerativeClustering(linkage=linkage, n_clusters=k)
clustering.fit(pca_result)
plot_clustering(data_subset, clustering.labels_, "%s linkage" % linkage)

In [None]:
# Visualize with kepler map
import kmapper as km
mapper = km.KeplerMapper(verbose=2)

# Fit and transform data, using t-SNE as the projection (lens).
# random_state is pinned because t-SNE is stochastic; without it the graph
# changes on every run.
projected_data = mapper.fit_transform(data_subset, projection=TSNE(random_state=0))

# Create the graph (we cluster on the projected data and suffer projection loss)
graph = mapper.map(
    projected_data,
    clusterer=AgglomerativeClustering(linkage='ward', n_clusters=k),
    cover=km.Cover(35, 0.4),
)

In [None]:
# Write the mapper graph to an HTML file, passing the numeric category codes
# as the colour values.
print("Output graph examples to html")
mapper.visualize(
    graph,
    path_html="./method_names.html",
    title="Method Name Mapper",
    color_values=df_subset['category'].values,
)

In [None]:
# Second HTML export of the same graph, this time passing the numeric category
# codes as the custom tooltips.
mapper.visualize(
    graph,
    path_html="./method_names_tooltips.html",
    title="Method Name Mapper",
    custom_tooltips=df_subset['category'].values,
)

In [None]:
# Static matplotlib rendering of the mapper graph with a spring layout.
# `km` must be the kmapper module here (imported in the KeplerMapper cell).
km.draw_matplotlib(graph, layout="spring")
plt.show()

# K-means Clustering and Visualization with t-SNE & PCA
Here we select a subset of methods, use each method's name as its label, and try to recover those labels by clustering the methods' code vector representations.

In [None]:
# Fit K-means on the raw subset vectors and score the clusters against the
# true method-name categories.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(data_subset)

true_categories = df_subset['category']
kmeans_assignments = kmeans.labels_

print('Number of clusters: %d' % k)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(true_categories, kmeans_assignments))
print("Completeness: %0.3f" % metrics.completeness_score(true_categories, kmeans_assignments))
print("V-measure: %0.3f" % metrics.v_measure_score(true_categories, kmeans_assignments))
ari = metrics.adjusted_rand_score(true_categories, kmeans_assignments)
print("Adjusted Rand Index: %0.3f" % ari)
ami = metrics.adjusted_mutual_info_score(true_categories, kmeans_assignments)
print("Adjusted Mutual Information: %0.3f" % ami)
silhouette = metrics.silhouette_score(data_subset, kmeans_assignments, metric='sqeuclidean')
print("Silhouette Coefficient: %0.3f" % silhouette)


def _scatter_by_class(ax, points, assignments, n_classes, names=None):
    """Scatter ``points`` on ``ax`` with one colour per class id in 0..n_classes-1.

    ``points`` must have exactly as many columns as ``ax`` has dimensions.
    ``names`` maps a class id to its legend label; when omitted the points are
    labelled ``cluster <id>``, because predicted cluster ids are arbitrary and
    do not correspond to the true categories.
    """
    assignments = np.asarray(assignments)
    for klass, color in zip(range(n_classes), cycle('bgrcmyk')):
        Xk = points[assignments == klass]
        label = names[klass] if names is not None else f'cluster {klass}'
        ax.scatter(*Xk.T, c=color, alpha=0.3, label=label)


def _plot_kmeans_vs_truth(points, suptitle, projection=None):
    """Two-panel figure: K-means clusters (left) against the true labels (right)."""
    fig = plt.figure(figsize=(14,5))
    fig.suptitle(suptitle)

    ax_pred = fig.add_subplot(121, projection=projection)
    ax_pred.set_title(f'K-means Clusters (k={k})')
    _scatter_by_class(ax_pred, points, kmeans.labels_, k)

    ax_true = fig.add_subplot(122, projection=projection)
    ax_true.set_title('True labels')
    _scatter_by_class(ax_true, points, df_subset['category'], k, names=unique_labels)
    ax_true.legend()


# Try to visualize clusters using PCA (first three principal components)
_plot_kmeans_vs_truth(pca_result[:, :3], 'Visualizing clusters with PCA', projection='3d')

# Try to visualize clusters using t-SNE in 3D
_plot_kmeans_vs_truth(tsne_result_3d, 'Visualizing clusters with t-SNE (3D)', projection='3d')

# Try to visualize clusters using t-SNE in 2D
_plot_kmeans_vs_truth(tsne_result_2d, 'Visualizing clusters with t-SNE (2D)')

In [None]:
# Fit OPTICS on the raw subset vectors and derive two DBSCAN-style labelings
# by cutting the reachability values at the two thresholds in `eps`.
eps = [0.00, 0.05]
model = OPTICS(cluster_method='xi', min_samples=10, metric='cosine')
model.fit(data_subset)

labels_eps0, labels_eps1 = (
    cluster_optics_dbscan(
        reachability=model.reachability_,
        core_distances=model.core_distances_,
        ordering=model.ordering_,
        eps=cut,
    )
    for cut in eps
)

# Everything below is indexed by position in the OPTICS ordering, NOT by
# dataset row: `reachability` and `optics_labels` are both permuted by
# `model.ordering_`.
space = np.arange(len(data_subset))
ordering = model.ordering_
reachability = model.reachability_[ordering]
optics_labels = model.labels_[ordering]


def _report_clustering(name, true_labels, predicted, features):
    """Print the external clustering metrics for one labeling.

    ``predicted`` must be in dataset row order (the same order as
    ``true_labels`` and ``features``). Label -1 is treated as noise when
    counting clusters. The silhouette coefficient is printed only when more
    than one non-noise cluster was found.
    """
    predicted = np.asarray(predicted)
    n_found = len(np.unique(predicted[predicted > -1]))
    rule = "-" * 10
    print(f'{rule} METRICS FOR {name} {rule}')
    print('Number of clusters: %d' % n_found)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(true_labels, predicted))
    print("Completeness: %0.3f" % metrics.completeness_score(true_labels, predicted))
    print("V-measure: %0.3f" % metrics.v_measure_score(true_labels, predicted))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(true_labels, predicted))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(true_labels, predicted))
    if n_found > 1:
        print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(features, predicted, metric='sqeuclidean'))


# OPTICS is scored with `model.labels_`, which is in dataset row order like
# df_subset['category']. `optics_labels` is permuted into the OPTICS ordering
# and must not be compared with the row-ordered true labels.
_report_clustering('OPTICS', df_subset['category'], model.labels_, data_subset)
_report_clustering(f'DBSCAN EPS {eps[0]}', df_subset['category'], labels_eps0, data_subset)
_report_clustering(f'DBSCAN EPS {eps[1]}', df_subset['category'], labels_eps1, data_subset)


# Legend-style label list: 'N/A' followed by the category names.
labels = ['N/A', *unique_labels.values()]

# Reachability plot. `space`, `reachability` and `optics_labels` are all in
# OPTICS ordering, so they can be masked against each other directly.
plt.figure(figsize=(10, 5))
plt.suptitle('Reachability Plot')
n_hierarchy = len(model.cluster_hierarchy_)
for klass, color in zip(range(n_hierarchy), cycle('bgrcmyk')):
    in_cluster = optics_labels == klass
    plt.plot(space[in_cluster], reachability[in_cluster], color, alpha=0.3)
# Noise points (label -1) are drawn as black dots.
is_noise = optics_labels == -1
plt.plot(space[is_noise], reachability[is_noise], 'k.', alpha=0.3)
# Horizontal lines mark the two epsilon cuts.
for cut, line_style in zip(eps, ('k-.', 'k-')):
    plt.plot(space, np.full_like(space, cut, dtype=float), line_style, alpha=0.5)
plt.ylabel('Reachability (epsilon distance)')



def _draw_cluster_panel_3d(ax, points, assignments, title):
    """Draw every point as a faint black '+', then overlay each cluster in colour.

    ``assignments`` must be in the same row order as ``points``. Negative ids
    (noise) get no overlay. Cluster ids are assumed to be the contiguous range
    0..max -- TODO confirm for the labelings passed in.
    """
    assignments = np.asarray(assignments)
    ax.set_title(title)
    ax.scatter(*points.T, c='k', marker='+', alpha=0.1)
    n_found = int(assignments.max()) + 1 if assignments.size else 0
    for klass, color in zip(range(n_found), cycle('bgrcmyk')):
        Xk = points[assignments == klass]
        ax.scatter(*Xk.T, c=color, alpha=0.3)


# Scatter plots on the first three principal components
pca_3d = pca_result[:, :3]
fig = plt.figure(figsize=(14, 10))

# `model.labels_` (dataset row order) is used here, not `optics_labels`:
# the latter is permuted into OPTICS ordering and would colour the wrong rows.
ax1 = fig.add_subplot(221, projection='3d')
_draw_cluster_panel_3d(ax1, pca_3d, model.labels_, 'Automatic Clustering\nOPTICS')

ax2 = fig.add_subplot(222, projection='3d')
_draw_cluster_panel_3d(ax2, pca_3d, labels_eps0, f'Clustering at {eps[0]} epsilon cut\nDBSCAN')

ax3 = fig.add_subplot(223, projection='3d')
_draw_cluster_panel_3d(ax3, pca_3d, labels_eps1, f'Clustering at {eps[1]} epsilon cut\nDBSCAN')

ax4 = fig.add_subplot(224, projection='3d')
ax4.set_title('True Labels')
true_categories = np.asarray(df_subset['category'])
for klass, color in zip(range(0, k), cycle('bgrcmyk')):
    Xk = pca_3d[true_categories == klass]
    ax4.scatter(*Xk.T, c=color, alpha=0.3, label=unique_labels[klass])
ax4.legend()
plt.tight_layout()

## OPTICS with t-SNE (2D)

In [None]:
def _draw_cluster_panel_2d(ax, points, assignments, title):
    """Draw every point as a faint black '+', then overlay each cluster in colour.

    ``assignments`` must be in the same row order as ``points``. Negative ids
    (noise) get no overlay. Every cluster id up to the largest one is drawn.
    Cluster ids are assumed to be the contiguous range 0..max -- TODO confirm
    for the labelings passed in.
    """
    assignments = np.asarray(assignments)
    ax.set_title(title)
    ax.scatter(points[:, 0], points[:, 1], c='k', marker='+', alpha=0.1)
    n_found = int(assignments.max()) + 1 if assignments.size else 0
    for klass, color in zip(range(n_found), cycle('bgrcmyk')):
        Xk = points[assignments == klass]
        ax.scatter(Xk[:, 0], Xk[:, 1], c=color, alpha=0.3)


# Scatter plots on the 2D t-SNE embedding
fig = plt.figure(figsize=(14, 10))

# `model.labels_` (dataset row order) is used here, not `optics_labels`:
# the latter is permuted into OPTICS ordering and would colour the wrong rows.
ax1 = fig.add_subplot(221)
_draw_cluster_panel_2d(ax1, tsne_result_2d, model.labels_, 'Automatic Clustering\nOPTICS')

# The DBSCAN panels draw every cluster that was found; looping over range(k)
# would silently drop clusters whenever a cut yields more than k of them.
ax2 = fig.add_subplot(222)
_draw_cluster_panel_2d(ax2, tsne_result_2d, labels_eps0, f'Clustering at {eps[0]} epsilon cut\nDBSCAN')

ax3 = fig.add_subplot(223)
_draw_cluster_panel_2d(ax3, tsne_result_2d, labels_eps1, f'Clustering at {eps[1]} epsilon cut\nDBSCAN')

ax4 = fig.add_subplot(224)
ax4.set_title('True Labels')
true_categories = np.asarray(df_subset['category'])
for klass, color in zip(range(0, k), cycle('bgrcmyk')):
    Xk = tsne_result_2d[true_categories == klass]
    ax4.scatter(Xk[:, 0], Xk[:, 1], c=color, alpha=0.3, label=unique_labels[klass])
ax4.legend()
plt.tight_layout()