In [1]:
%matplotlib notebook
import os
import csv
import tikzplotlib
import numpy as np
import pandas as pd
import kmapper as km
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

from kmapper import jupyter
from itertools import cycle
from sklearn import metrics
from collections import OrderedDict
from sklearn.cluster import KMeans, AgglomerativeClustering, OPTICS
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

sns.set_style('whitegrid')

## Load extracted features

In [2]:
DATASET_NAME = "libraries"
os.makedirs(f"./results/tables/{DATASET_NAME}")
os.makedirs(f"./results/figures/{DATASET_NAME}")

In [3]:
# Load code vectors and labels
vectors = pd.read_csv(f'./data/{DATASET_NAME}/{DATASET_NAME}.test.c2v.vectors', sep=" ", header=None)
labels = pd.read_csv(f'./data/{DATASET_NAME}/{DATASET_NAME}.test.c2v', sep=" ").iloc[:, 0]

embedding_dim = vectors.values.shape[1]

# Merge vectors and labels
df = vectors.assign(method_name=labels)
df.dropna(inplace=True)
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,method_name
0,0.659533,0.111065,-0.002645,0.296223,0.184254,-0.04815,-0.441473,-0.074131,-0.614883,-0.050383,...,0.044025,-0.379658,0.03761,-0.216069,-0.054703,0.035554,-0.628919,0.369026,0.736787,save
1,-0.639811,-0.188797,-0.601161,0.377131,0.56926,-0.739172,0.681987,0.54559,-0.189288,0.497499,...,0.204609,-0.191001,-0.742706,-0.613184,-0.242451,0.20106,0.548044,-0.202271,-0.646138,transform
2,0.280379,-0.245354,0.26565,0.241979,-0.539864,0.419182,-0.072197,-0.215407,-0.117679,-0.401295,...,0.239741,0.156146,0.3299,0.359694,0.259056,-0.484015,-0.014807,0.082783,0.286431,predict
3,0.546732,0.148203,-0.141675,0.15112,0.324562,-0.192991,-0.400285,-0.053316,-0.600185,0.136884,...,-0.050554,-0.374854,-0.141102,-0.277682,-0.207434,0.177123,-0.619998,0.292892,0.667141,predict
4,0.395525,-0.149843,0.161392,0.08706,-0.200444,0.116178,-0.058122,-0.277543,-0.4181,-0.090064,...,0.209881,-0.16287,-0.036207,0.151648,-0.061068,-0.0035,-0.321799,0.073458,0.571418,train


## Data preparation

In [4]:
classes = { 0: 'train', 1: 'save', 2: 'process', 3: 'forward', 4: 'predict' }

In [5]:
# Assign categories based on method name
df['category'] = df.method_name.map(lambda x: np.array([x.find(s) for s in classes.values()]).argmax())
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,method_name,category
0,0.659533,0.111065,-0.002645,0.296223,0.184254,-0.048150,-0.441473,-0.074131,-0.614883,-0.050383,...,-0.379658,0.037610,-0.216069,-0.054703,0.035554,-0.628919,0.369026,0.736787,save,1
1,-0.639811,-0.188797,-0.601161,0.377131,0.569260,-0.739172,0.681987,0.545590,-0.189288,0.497499,...,-0.191001,-0.742706,-0.613184,-0.242451,0.201060,0.548044,-0.202271,-0.646138,transform,0
2,0.280379,-0.245354,0.265650,0.241979,-0.539864,0.419182,-0.072197,-0.215407,-0.117679,-0.401295,...,0.156146,0.329900,0.359694,0.259056,-0.484015,-0.014807,0.082783,0.286431,predict,4
3,0.546732,0.148203,-0.141675,0.151120,0.324562,-0.192991,-0.400285,-0.053316,-0.600185,0.136884,...,-0.374854,-0.141102,-0.277682,-0.207434,0.177123,-0.619998,0.292892,0.667141,predict,4
4,0.395525,-0.149843,0.161392,0.087060,-0.200444,0.116178,-0.058122,-0.277543,-0.418100,-0.090064,...,-0.162870,-0.036207,0.151648,-0.061068,-0.003500,-0.321799,0.073458,0.571418,train,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,0.727051,-0.078742,0.329482,0.483167,-0.559327,0.414311,-0.565073,-0.219670,-0.580053,-0.490861,...,-0.330527,0.481850,0.357905,0.364508,-0.633342,-0.444316,0.357979,0.748239,transform,0
1052,-0.012338,-0.269567,0.211126,0.212429,-0.197486,0.192723,0.261622,-0.176338,-0.073806,-0.161888,...,-0.038924,-0.010053,0.159883,-0.039708,-0.057748,0.246974,0.026356,0.005275,transform,0
1053,0.308311,-0.181256,0.112361,0.073703,-0.103750,0.084704,-0.053471,-0.263630,-0.351362,-0.100620,...,-0.073809,-0.024388,0.073102,-0.065779,-0.032793,-0.242702,0.089417,0.431770,transform,0
1054,0.436868,-0.228527,0.322159,0.344238,-0.358604,0.392213,-0.142330,-0.204496,-0.285700,-0.402942,...,-0.063747,0.310004,0.351956,0.218492,-0.313415,-0.080680,0.192540,0.464870,train,0


In [6]:
df.groupby('category').size()

category
0    492
1    164
4    400
dtype: int64

In [7]:
train_df, test_df = train_test_split(df, test_size=0.2) 

In [8]:
train_features = train_df.iloc[:, 0:embedding_dim].values
test_features = test_df.iloc[:, 0:embedding_dim].values

In [9]:
train_features.shape, test_features.shape

((844, 128), (212, 128))

## Train an Autoencoder to (optionally) reduce the dimensionality of the features

In [10]:
class Autoencoder(Model):
    def __init__(self, latent_dim):
        super(Autoencoder, self).__init__()
        self.latent_dim = latent_dim   
        self.encoder = tf.keras.Sequential([
          layers.Flatten(),
          layers.Dense(latent_dim, activation='relu'),
        ])
        self.decoder = tf.keras.Sequential([
          layers.Dense(embedding_dim, activation='sigmoid')
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [11]:
## Train autoencoder

latent_dim = 10
autoencoder = Autoencoder(latent_dim)
autoencoder.compile(optimizer='adam', loss=losses.MeanSquaredError())

autoencoder.fit(train_features, train_features,
                epochs=20,
                shuffle=True,
                validation_data=(test_features, test_features),
                callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f3996561e20>

## Choose a subset of samples
We'll only consider a subset of samples for visualization. This is done by taking an equal number of instances from each category.

In [12]:
df_subset = df.groupby('category').apply(lambda grp: grp.sample(n=100)).reset_index(level=[0, 1], drop=True)
df_subset.groupby('category').size()

category
0    100
1    100
4    100
dtype: int64

In [13]:
# Choose the "default" high dim features
dim = "high"
features = df_subset.iloc[:, 0:embedding_dim].values

# Map the features into a low dimensional space using the encoder
# dim = "low"
# features = autoencoder.encoder(df_subset.iloc[:, 0:embedding_dim].values).numpy()
features.shape

(300, 128)

### Declare methods for computing metrics and visualizing clusters

In [14]:
def calculate_metrics(estimator, dataframe, predicted_labels):
    features = dataframe.iloc[:, 0:embedding_dim].values
    results = {}
    results['estimator'] = estimator.__class__.__name__
    results['homogeneity_score'] = metrics.homogeneity_score(dataframe['category'], predicted_labels)
    results['completeness_score'] = metrics.completeness_score(dataframe['category'], predicted_labels)
    results['v_measure_score'] = metrics.v_measure_score(dataframe['category'], predicted_labels)
    results['adjusted_rand_score'] = metrics.adjusted_rand_score(dataframe['category'], predicted_labels)
    results['adjusted_mutual_info_score'] = metrics.adjusted_mutual_info_score(dataframe['category'], predicted_labels)
    results['average_jaccard_score'] = np.mean(metrics.jaccard_score(dataframe['category'], predicted_labels, average=None))
    results['silhouette_score'] = metrics.silhouette_score(features, predicted_labels, metric='sqeuclidean')
    return results

def plot_clusters(estimator, dataframe, labels, metrics):
    estimator_name = estimator.__class__.__name__
    
    k = len(np.unique(labels))
    # Print metrics
    print('Number of clusters: %d' % k)
    print("Homogeneity: %0.3f" % metrics['homogeneity_score'])
    print("Completeness: %0.3f" % metrics['completeness_score'])
    print("V-measure: %0.3f" % metrics['v_measure_score'])
    print("Adjusted Rand Index: %0.3f"
          % metrics['adjusted_rand_score'])
    print("Adjusted Mutual Information: %0.3f"
          % metrics['adjusted_mutual_info_score'])
    print("Mean Jaccard Coefficient: %s"
          % metrics['average_jaccard_score'])
    print("Silhouette Coefficient: %0.3f"
          % metrics['silhouette_score'])

    # Visualize clusters with tSNE
    fig = plt.figure(figsize=(14,6))
    ax1 = fig.add_subplot(121, projection='3d')
    ax1.set_title(f'{estimator_name} Clusters (k={k})')
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    if estimator_name == 'OPTICS':
        ax1.scatter(tsne_result[:, 0], tsne_result[:, 1], tsne_result[:, 2], c='k', marker='+', alpha=0.1)
    for klass, color in zip(range(0, k), colors):
        Xk = tsne_result[labels == klass]
        ax1.scatter(Xk[:, 0], Xk[:, 1], Xk[:, 2], c=color, alpha=0.3, label=f'Cluster ID {klass}')

    ax2 = fig.add_subplot(122, projection='3d')
    ax2.set_title('Method names as labels')
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for klass, color in zip(range(0, len(classes)), colors):
        Xk = tsne_result[dataframe['category'] == klass]
        ax2.scatter(Xk[:, 0], Xk[:, 1],  Xk[:, 2], c=color, alpha=0.3, label=classes[klass])
    ax2.legend()
    
    ax2.legend(title='Method name', bbox_to_anchor=(0.5, -0.2), loc='lower center', fancybox=True, ncol=klass+1)
    fig.tight_layout()
    plt.savefig(f"./results/figures/{DATASET_NAME}/code2vec_{dim}_dim_{DATASET_NAME}_{estimator.__class__.__name__}.svg")
    

def cv_silhouette_scorer(estimator, X):
    estimator.fit(X)
    if (estimator.__class__.__name__ == "OPTICS"):
        cluster_labels = estimator.labels_[estimator.ordering_]
    else:
        cluster_labels = estimator.labels_
    num_labels = len(set(cluster_labels))
    num_samples = len(X)
    if num_labels == 1 or num_labels == num_samples:
        return -1
    else:
        return metrics.silhouette_score(X, cluster_labels, metric='sqeuclidean')

In [15]:
# Use tSNE & clustering
tsne = TSNE(n_components=3, verbose=1, perplexity=30, n_iter=3000)
tsne_result = tsne.fit_transform(features)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 300 samples in 0.000s...
[t-SNE] Computed neighbors for 300 samples in 0.004s...
[t-SNE] Computed conditional probabilities for sample 300 / 300
[t-SNE] Mean sigma: 0.765834
[t-SNE] KL divergence after 250 iterations with early exaggeration: 106.511681
[t-SNE] KL divergence after 3000 iterations: 1.048846


## Model fitting and hyperparameter search

In [16]:
estimators = OrderedDict()
estimator_metrics = OrderedDict()

### Centroid-based clustering using K-means

In [17]:
# Apply K-means
search_params = {'n_clusters': np.arange(2,11)}

cv = [(slice(None), slice(None))] # Disable cv, only want grid search
gs = GridSearchCV(estimator=KMeans(random_state=0), param_grid=search_params, 
                  scoring=cv_silhouette_scorer, cv=cv, n_jobs=-1)

res = gs.fit(X=features, y=None)

max_silhouette_score = np.max(res.cv_results_['mean_test_score'])
ind = np.argmax(res.cv_results_['mean_test_score'])
k = search_params['n_clusters'][ind]

print(f"Best silhouette score {max_silhouette_score:.3f} achieved with {k} clusters")
kmeans_estimator = KMeans(n_clusters=k, random_state=0)
kmeans_name = kmeans_estimator.__class__.__name__

kmeans = kmeans_estimator.fit(features)
kmeans_metrics = calculate_metrics(kmeans_estimator, df_subset, kmeans.labels_)
plot_clusters(kmeans, df_subset, kmeans.labels_, kmeans_metrics)

estimators[kmeans_name] = {'estimator': kmeans_estimator, 'score':  max_silhouette_score }
estimator_metrics[kmeans_name] = {'Estimator': kmeans_name, 'Adjusted Rand Index': kmeans_metrics['adjusted_rand_score'], 'Silhouette Score': kmeans_metrics['silhouette_score']}

Best silhouette score 0.688 achieved with 4 clusters
Number of clusters: 4
Homogeneity: 0.007
Completeness: 0.006
V-measure: 0.006
Adjusted Rand Index: -0.003
Adjusted Mutual Information: -0.002
Mean Jaccard Coefficient: 0.07604846225535881
Silhouette Coefficient: 0.688


<IPython.core.display.Javascript object>

### Density-Based Clustering using OPTICS

In [18]:
search_params = {'cluster_method': ['xi', 'dbscan'], 'xi': np.linspace(0,1,11), 'min_samples': [2,5,10,15,20,25,30], 'metric': ['minkowski', 'cosine', 'euclidean', 'sqeuclidean']}

cv = [(slice(None), slice(None))] # Disable cv, only want grid search
gs = GridSearchCV(estimator=OPTICS(cluster_method='xi'), param_grid=search_params, 
                  scoring=cv_silhouette_scorer, cv=cv, n_jobs=-1)

res = gs.fit(X=features, y=None)

# Get best configuration
max_silhouette_score = np.max(res.cv_results_['mean_test_score'])
ind = np.argmax(res.cv_results_['mean_test_score'])
best_params = res.cv_results_['params'][ind]
k = best_params['min_samples']
metric = best_params['metric']

# Visualize best clusters
print(f"Best silhouette score {max_silhouette_score:.3f} achieved with {res.cv_results_['params'][ind]}")
optics_estimator = OPTICS(cluster_method='xi', min_samples=k, metric=metric)
optics_name = optics_estimator.__class__.__name__

optics_clusters = optics_estimator.fit(features)
optics_metrics = calculate_metrics(optics_estimator, df_subset, optics_clusters.labels_[optics_clusters.ordering_])
plot_clusters(optics_clusters, df_subset, optics_clusters.labels_[optics_clusters.ordering_], optics_metrics)

estimators[optics_name] = {'estimator': optics_estimator, 'score':  max_silhouette_score }
estimator_metrics[optics_name] = {'Estimator': optics_name, 'Adjusted Rand Index': optics_metrics['adjusted_rand_score'], 'Silhouette Score': optics_metrics['silhouette_score']}

  ratio = reachability_plot[:-1] / reachability_plot[1:]


Best silhouette score 0.088 achieved with {'cluster_method': 'xi', 'metric': 'minkowski', 'min_samples': 5, 'xi': 0.30000000000000004}
Number of clusters: 10
Homogeneity: 0.287
Completeness: 0.262
V-measure: 0.274
Adjusted Rand Index: 0.026
Adjusted Mutual Information: 0.252
Mean Jaccard Coefficient: 0.005
Silhouette Coefficient: -0.313


  ratio = reachability_plot[:-1] / reachability_plot[1:]


<IPython.core.display.Javascript object>

### Agglomerative Clustering (Hierarchical)

In [19]:
search_params = {'n_clusters': np.arange(2,11), 'linkage': ['ward', 'complete', 'average', 'single']}
# search_params = {'n_clusters': np.arange(2,11), 'linkage': ['complete', 'average', 'single'], 'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']}

cv = [(slice(None), slice(None))] # Disable cv, only want grid search
gs = GridSearchCV(estimator=AgglomerativeClustering(linkage='ward'), param_grid=search_params, 
                  scoring=cv_silhouette_scorer, cv=cv, n_jobs=-1)

res = gs.fit(X=features, y=None)

# Get best configuration
max_silhouette_score = np.max(res.cv_results_['mean_test_score'])
ind = np.argmax(res.cv_results_['mean_test_score'])
best_params = res.cv_results_['params'][ind]
k = best_params['n_clusters']
linkage = best_params['linkage']

# Visualize best clusters
print(f"Best silhouette score {max_silhouette_score:.3f} achieved with {res.cv_results_['params'][ind]}")
agglomerative_estimator = AgglomerativeClustering(n_clusters=k, linkage=linkage)
agglomerative_name = agglomerative_estimator.__class__.__name__

agglomerative_clusters = agglomerative_estimator.fit(features)
agglomerative_metrics = calculate_metrics(agglomerative_clusters, df_subset, agglomerative_clusters.labels_)
plot_clusters(agglomerative_clusters, df_subset, agglomerative_clusters.labels_, agglomerative_metrics)

estimators[agglomerative_name] = {'estimator': agglomerative_estimator, 'score':  max_silhouette_score}
estimator_metrics[agglomerative_name] = {'Estimator': agglomerative_name, 'Adjusted Rand Index': agglomerative_metrics['adjusted_rand_score'], 'Silhouette Score': agglomerative_metrics['silhouette_score']}

Best silhouette score 0.682 achieved with {'linkage': 'average', 'n_clusters': 3}
Number of clusters: 3
Homogeneity: 0.002
Completeness: 0.003
V-measure: 0.003
Adjusted Rand Index: -0.003
Adjusted Mutual Information: -0.004
Mean Jaccard Coefficient: 0.08887503239181135
Silhouette Coefficient: 0.682


<IPython.core.display.Javascript object>

## Select best estimator

In [20]:
scores = [val['score'] for val in estimators.values()]
ind = np.argmax(scores)
best_estimator = list(estimators.values())[ind]['estimator']
print(f"Best cluster method: {best_estimator.__class__.__name__}\nParams:\n{best_estimator.__dict__}")

Best cluster method: KMeans
Params:
{'n_clusters': 4, 'init': 'k-means++', 'max_iter': 300, 'tol': 0.0001, 'precompute_distances': 'deprecated', 'n_init': 10, 'verbose': 0, 'random_state': 0, 'copy_x': True, 'n_jobs': 'deprecated', 'algorithm': 'auto', 'n_features_in_': 128, '_n_threads': 12, '_n_init': 10, '_tol': 9.838690764070104e-06, '_algorithm': 'elkan', 'cluster_centers_': array([[-0.25920076, -0.48482528,  0.1969332 , -0.37439219, -0.24297404,
        -0.01907708,  0.48986621, -0.4242104 , -0.2254472 ,  0.23594246,
         0.23830031, -0.23248571, -0.4167696 , -0.2591481 , -0.05937537,
         0.37139308,  0.43795562, -0.16208736,  0.40509006,  0.02159653,
         0.30979779, -0.47951577,  0.23970913,  0.31713633, -0.18112862,
        -0.02334929,  0.33685212,  0.04690266,  0.27801204,  0.28610459,
         0.26852736,  0.31851598,  0.03740966, -0.13097651,  0.37524617,
        -0.45170647, -0.50546226, -0.38201562,  0.12695992,  0.31567249,
        -0.35010285,  0.35801728,

In [21]:
# Write configs of the best estimators from each category to file
with open(f"./results/code2vec_{dim}_dim_{DATASET_NAME}_estimators_config.txt", "w") as writer:
    for conf, score in [(val['estimator'].__dict__,val['score']) for val in estimators.values()]:
        writer.write(f"Estimator config:\n{conf}\nSilhouette Score: {score}\n\n")

In [22]:
# Write the chosen cluster metrics for the best models to table
latex_table = pd.DataFrame([d for d in estimator_metrics.values()], columns=['Estimator', 'Adjusted Rand Index', 'Silhouette Score']).to_latex(index=False, float_format="%.3f").replace('\\toprule', '\\hline').replace('\\midrule', '\\hline').replace('\\bottomrule','\\hline')
with open(f"./results/tables/{DATASET_NAME}/code2vec_{dim}_dim_{DATASET_NAME}_table.tex", "w") as writer:
    writer.write(latex_table)
    
# Write all cluster metrics to table
latex_table = pd.DataFrame([d for d in [kmeans_metrics, optics_metrics, agglomerative_metrics]]).to_latex(index=False, float_format="%.3f").replace('\\toprule', '\\hline').replace('\\midrule', '\\hline').replace('\\bottomrule','\\hline')
with open(f"./results/tables/{DATASET_NAME}/code2vec_{dim}_dim_{DATASET_NAME}_all_metrics_table.tex", "w") as writer:
    writer.write(latex_table)

## Visualize Clusters with Kepler Mapper

In [23]:
# Visualize with kepler map
mapper = km.KeplerMapper(verbose=1)

# Fit and transform data, use TSNE 
projected_data = mapper.fit_transform(features, projection=TSNE(n_components=2))

# Create the graph (we cluster on the projected data and suffer projection loss)
graph = mapper.map(
    projected_data,
    clusterer=best_estimator,
    cover=km.Cover(5, 0.3),
)

# Create the visualizations
print("Output graph examples to html")
mapper.visualize(
    graph,
    title=f"{DATASET_NAME} Mapper",
    path_html=f"./data/{DATASET_NAME}/{DATASET_NAME}_{dim}_dim_visualization.html",
    custom_tooltips=df_subset['category'].values
)

jupyter.display(path_html=f"./data/{DATASET_NAME}/{DATASET_NAME}_{dim}_dim_visualization.html")

KeplerMapper(verbose=1)
..Composing projection pipeline of length 1:
	Projections: TSNE()
	Distance matrices: False
	Scalers: MinMaxScaler()
..Projecting on data shaped (300, 128)

..Projecting data using: 
	TSNE(verbose=1)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 300 samples in 0.000s...
[t-SNE] Computed neighbors for 300 samples in 0.004s...
[t-SNE] Computed conditional probabilities for sample 300 / 300
[t-SNE] Mean sigma: 0.765834
[t-SNE] KL divergence after 250 iterations with early exaggeration: 56.595985
[t-SNE] KL divergence after 1000 iterations: 0.285128

..Scaling with: MinMaxScaler()

Mapping on data shaped (300, 2) using lens shaped (300, 2)

Creating 25 hypercubes.

Created 78 edges and 72 nodes in 0:00:00.244770.
Output graph examples to html
Wrote visualization to: ./data/tokens-mini/tokens-mini_high_dim_visualization.html


