In [None]:
%matplotlib notebook
import os
import csv
import tikzplotlib
import numpy as np
import pandas as pd
import kmapper as km
import seaborn as sns
import tensorflow as tf
import matplotlib.cm as cm
import matplotlib.pyplot as plt


from kmapper import jupyter
from itertools import cycle
from sklearn import metrics
from collections import OrderedDict
from validclust.indices import dunn
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans, AgglomerativeClustering, OPTICS
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model
from gensim.models.keyedvectors import KeyedVectors
# Silence NumPy floating-point warnings for division by zero and invalid operations.
np.seterr(divide='ignore', invalid='ignore')
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# Use seaborn's 'whitegrid' style for all plots.
sns.set_style('whitegrid')

## Load extracted features

In [None]:
# Name of the dataset; it selects the input folder under ./data and the
# output folders under ./results.
DATASET_NAME = "pathcontext"

# Create the output folders. exist_ok=True keeps the cell safe to re-run and
# avoids the check-then-create race of os.path.exists() + os.makedirs().
os.makedirs(f"./results/tables/{DATASET_NAME}", exist_ok=True)
os.makedirs(f"./results/figures/{DATASET_NAME}", exist_ok=True)

In [None]:
# True and predicted method names: the first space-separated field of every row.
# NOTE(review): no `header=` is passed for these two files, so pandas uses their
# first line as column names, whereas the vectors file below is read with
# header=None. If the name files have no header row, names and vectors end up
# shifted by one row against each other -- confirm against the file format.
method_names = pd.read_csv(
    f'./data/{DATASET_NAME}/{DATASET_NAME}.test.c2v',
    sep=" ",
    dtype=str,
).iloc[:, 0]
predicted_method_names = pd.read_csv(
    f'./data/{DATASET_NAME}/{DATASET_NAME}.test.c2v.predicted_names',
    sep=" ",
    dtype=str,
).iloc[:, 0]
print(len(method_names))

# Code vectors: one row per sample, one column per vector component.
vectors = pd.read_csv(
    f'./data/{DATASET_NAME}/{DATASET_NAME}.test.c2v.vectors',
    sep=" ",
    header=None,
)
print(len(vectors))

# Method-name (target) embeddings stored in word2vec text format.
target_embeddings = f'./data/{DATASET_NAME}/targets.txt'
t2v = KeyedVectors.load_word2vec_format(target_embeddings, binary=False)
target_vocab = t2v.vocab.keys()

# Dimensionality of the code vectors (taken from the data) and of the
# method-name embeddings (fixed constant).
code_vector_dim = len(vectors.iloc[0])
embedding_dim = 128
print(len(method_names))

In [None]:
# Attach the true and the predicted method name to every code vector
# (pandas aligns the three objects on their index).
codevector_df = vectors.assign(
    method_name=method_names,
    predicted_method_name=predicted_method_names,
)

# Keep only the rows whose method name is part of the embedding vocabulary.
codevector_df = codevector_df.loc[codevector_df['method_name'].isin(target_vocab)]
method_names = method_names.loc[method_names.isin(target_vocab)]

codevector_df

In [None]:
# Look up the embedding vector of every predicted method name.
df_inter = pd.DataFrame(
    [
        {"predicted_method_name": predicted, "embedding": t2v.get_vector(predicted)}
        for predicted in predicted_method_names
    ]
)

# Spread every embedding vector over one 'feat_<i>' column per dimension.
embeddings = df_inter['embedding'].apply(pd.Series).rename(columns=lambda col: f'feat_{col}')

# Put features, predicted names and true names side by side. The true names
# arrive as the last column, which is renamed to 'method_name'. Rows left with
# a missing value after the index alignment are dropped.
method_name_embedding_df = pd.concat([embeddings, df_inter, method_names], axis=1)
method_name_embedding_df = (
    method_name_embedding_df
    .rename(columns={method_name_embedding_df.columns[-1]: 'method_name'})
    .drop(columns=["embedding"])
    .dropna()
    .reset_index(drop=True)
)
method_name_embedding_df

In [None]:
# Sanity check: sizes of the name series and of both representation frames, one per line.
print(len(method_names), len(codevector_df), len(method_name_embedding_df), sep="\n")

## Data preparation

In [None]:
# Category id -> keyword that is searched for in the method name.
classes = {0: 'train', 1: 'save', 2: 'process', 3: 'forward', 4: 'predict'}

# Optional: keep only the methods whose name contains at least one class keyword.
method_name_embedding_df = method_name_embedding_df.loc[
    method_name_embedding_df['method_name'].str.contains("|".join(classes.values()))
]
codevector_df = codevector_df.loc[
    codevector_df['method_name'].str.contains("|".join(classes.values()))
]
codevector_df

In [None]:
def _keyword_category(name):
    """Return the position (in `classes` order) of the keyword that `str.find`
    locates furthest into `name`; keywords that are absent count as -1."""
    positions = [name.find(keyword) for keyword in classes.values()]
    return np.array(positions).argmax()


# Assign categories based on method name
method_name_embedding_df['category'] = method_name_embedding_df['method_name'].map(_keyword_category)
codevector_df['category'] = codevector_df['method_name'].map(_keyword_category)
method_name_embedding_df

In [None]:
# Row-wise check: does the predicted name occur as a substring of the true method name?
matches = method_name_embedding_df.apply(
    lambda row: row['predicted_method_name'] in row['method_name'],
    axis=1,
)
matches

In [None]:
# Share of rows whose predicted name is contained in the true method name.
print(
    "Percentage of instances with matching subtokens: "
    f"{sum(matches)} / {len(matches)} = {sum(matches)/len(matches)}"
)

In [None]:
# Number of method-name-embedding rows per category.
method_name_embedding_df.groupby(by='category').size()

In [None]:
# Number of code-vector rows per category.
codevector_df.groupby(by='category').size()

## Choose a subset of samples
We'll only consider a subset of samples for visualization. This is done by drawing 1000 instances at random (with a fixed seed) from each representation; the resulting number of instances per category is shown below.

In [None]:
# Draw 1000 rows from each representation with the same seed and renumber them from 0.
method_name_subset_df = (
    method_name_embedding_df
    .sample(n=1000, random_state=7)
    .reset_index(drop=True)
)
codevector_subset_df = (
    codevector_df
    .sample(n=1000, random_state=7)
    .reset_index(drop=True)
)

In [None]:
# Category sizes inside the sampled method-name-embedding subset.
method_name_subset_df.groupby(by='category').size()

In [None]:
# Category sizes inside the sampled code-vector subset.
codevector_subset_df.groupby(by='category').size()

In [None]:
# High-dimensional features of the code vectors: the leading `code_vector_dim` columns.
codevector_features = codevector_subset_df.iloc[:, :code_vector_dim].values
codevector_features.shape

In [None]:
# High-dimensional features of the method names: the leading `embedding_dim` columns.
method_name_features = method_name_subset_df.iloc[:, :embedding_dim].values
method_name_features.shape

### Declare methods for computing metrics and visualizing clusters

In [None]:
# t-SNE perplexity used for the projections of both representations.
perplexity = 10

In [None]:
# Project the method-name embeddings to 3-D with t-SNE (used only for plotting).
# random_state is fixed so that re-running the notebook reproduces the same
# projection and therefore the same figures.
tsne = TSNE(n_components=3, verbose=1, perplexity=perplexity, n_iter=3000, random_state=0)
method_name_tsne = tsne.fit_transform(method_name_features)

In [None]:
# Project the code vectors to 3-D with t-SNE (used only for plotting).
# random_state is fixed so that re-running the notebook reproduces the same
# projection and therefore the same figures.
tsne = TSNE(n_components=3, verbose=1, perplexity=perplexity, n_iter=3000, random_state=0)
codevector_tsne = tsne.fit_transform(codevector_features)

In [None]:
# Registry of the two method representations: for each one the sampled frame,
# its high-dimensional feature matrix and its 3-D t-SNE projection.
datasets = {
    'Method Name Embedding': {
        'DataFrame': method_name_subset_df,
        'Features': method_name_features,
        'TSNE': method_name_tsne,
    },
    'Code Vectors': {
        'DataFrame': codevector_subset_df,
        'Features': codevector_features,
        'TSNE': codevector_tsne,
    },
}

In [None]:
def calculate_metrics(estimator, dataframe, features, predicted_labels):
    """Score a clustering against the keyword categories and the feature geometry.

    Parameters
    ----------
    estimator : object
        Clustering estimator; only its class name is recorded.
    dataframe : pandas.DataFrame
        Must provide a 'category' column, which serves as reference labelling.
    features : array-like
        Feature matrix handed to `pairwise_distances` and `silhouette_score`.
    predicted_labels : array-like
        Cluster label of every sample.

    Returns
    -------
    dict
        Keys: 'estimator', 'homogeneity_score', 'completeness_score',
        'v_measure_score', 'adjusted_rand_score', 'adjusted_mutual_info_score',
        'average_jaccard_score', 'dunn_index' and 'silhouette_score'.
        'silhouette_score' is -1 when there is a single cluster or as many
        clusters as samples.
    """
    reference_labels = dataframe['category']
    distance_matrix = pairwise_distances(features)

    results = {
        'estimator': estimator.__class__.__name__,
        'homogeneity_score': metrics.homogeneity_score(reference_labels, predicted_labels),
        'completeness_score': metrics.completeness_score(reference_labels, predicted_labels),
        'v_measure_score': metrics.v_measure_score(reference_labels, predicted_labels),
        'adjusted_rand_score': metrics.adjusted_rand_score(reference_labels, predicted_labels),
        'adjusted_mutual_info_score': metrics.adjusted_mutual_info_score(reference_labels, predicted_labels),
        # Per-class Jaccard scores, averaged by hand.
        'average_jaccard_score': np.mean(
            metrics.jaccard_score(reference_labels, predicted_labels, average=None)
        ),
        'dunn_index': dunn(distance_matrix, predicted_labels),
    }

    # The silhouette is only evaluated for 2 .. n_samples - 1 distinct labels.
    n_distinct = len(np.unique(predicted_labels))
    degenerate = n_distinct == 1 or n_distinct == len(features)
    results['silhouette_score'] = (
        -1
        if degenerate
        else metrics.silhouette_score(features, predicted_labels, metric='sqeuclidean')
    )
    return results

def plot_clusters(estimator, metrics):
    """Print the clustering metrics and draw the t-SNE scatter plots of one estimator.

    For every representation in `metrics` two 3-D panels are drawn next to each
    other: the samples coloured by cluster id and the same samples coloured by
    keyword category. The figure is saved as a PDF under
    ./results/figures/<DATASET_NAME>/.

    Parameters
    ----------
    estimator : object
        Only its class name is used (panel titles, file name, OPTICS special case).
    metrics : dict
        Maps a key of the module-level `datasets` dict to
        {'labels': <cluster label per sample>, 'metrics': <dict of scores>}.
        Note that this parameter shadows the `sklearn.metrics` module inside
        the function.

    Reads the module-level names `datasets`, `classes` and `DATASET_NAME`.
    """
    estimator_name = estimator.__class__.__name__

    # Two panels per representation, all on one row. An explicit
    # (rows, cols, index) triple also works for more than two representations,
    # unlike a three-digit subplot code.
    n_panels = 2 * len(metrics)

    fig = plt.figure(figsize=(28, 8))

    for i, key in enumerate(metrics):
        labels = metrics[key]['labels']
        metric = metrics[key]['metrics']
        dataframe = datasets[key]['DataFrame']
        tsne_projection = datasets[key]['TSNE']

        k = len(np.unique(labels))
        # Print metrics
        print(key)
        print('Number of clusters: %d' % k)
        print("Homogeneity: %0.3f" % metric['homogeneity_score'])
        print("Completeness: %0.3f" % metric['completeness_score'])
        print("V-measure: %0.3f" % metric['v_measure_score'])
        print("Adjusted Rand Index: %0.3f"
              % metric['adjusted_rand_score'])
        print("Adjusted Mutual Information: %0.3f"
              % metric['adjusted_mutual_info_score'])
        print("Mean Jaccard Coefficient: %s"
              % metric['average_jaccard_score'])
        print("Silhouette Coefficient: %0.3f"
              % metric['silhouette_score'])
        print("Dunn Index: %0.3f\n"
              % metric['dunn_index'])

        # Left panel: samples coloured by the cluster they were assigned to.
        ax1 = fig.add_subplot(1, n_panels, 2 * i + 1, projection='3d')
        ax1.set_title(f'{estimator_name} (k={k}) Clusters ({key})')
        colors = cm.tab10(np.linspace(0, 1, k))
        if estimator_name == 'OPTICS':
            # Draw every sample as a faint marker first, so that points which
            # belong to none of the clusters 0..k-1 remain visible.
            ax1.scatter(tsne_projection[:, 0], tsne_projection[:, 1], tsne_projection[:, 2],
                        c='k', marker='+', alpha=0.1)
        for klass, color in zip(range(0, k), colors):
            Xk = tsne_projection[labels == klass]
            ax1.scatter(Xk[:, 0], Xk[:, 1], Xk[:, 2], color=color, alpha=0.3,
                        label=f'Cluster ID {klass+1}')

        # Limit the legend to the first ten clusters.
        h, l = ax1.get_legend_handles_labels()
        ax1.legend(h[:10], l[:10], title='Cluster IDs', bbox_to_anchor=(1.3, 0.5),
                   loc='right', fancybox=True)

        # Right panel: the same samples coloured by their keyword category.
        ax2 = fig.add_subplot(1, n_panels, 2 * (i + 1), projection='3d')
        ax2.set_title(f'Method names as labels ({key})')
        colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        for klass, color in zip(range(0, len(classes)), colors):
            Xk = tsne_projection[dataframe['category'] == klass]
            ax2.scatter(Xk[:, 0], Xk[:, 1], Xk[:, 2], c=color, alpha=0.3, label=classes[klass])
        ax2.legend(title='Method name', bbox_to_anchor=(1.3, 0.5), loc='right', fancybox=True)

    fig.subplots_adjust(wspace=0.1)
    fig.tight_layout()
    fig.savefig(f"./results/figures/{DATASET_NAME}/code2vec_{DATASET_NAME}_{estimator_name}.pdf")
    

def cv_silhouette_scorer(estimator, X):
    """Scorer for GridSearchCV: fit `estimator` on `X` and return its silhouette score.

    Parameters
    ----------
    estimator : object
        Clustering estimator that exposes `fit(X)` and, afterwards, `labels_`.
    X : array-like
        Samples to cluster and to score.

    Returns
    -------
    float or int
        Silhouette score (squared Euclidean metric), or -1 when the clustering
        has a single label or as many labels as samples.
    """
    estimator.fit(X)
    # labels_ holds one label per row of X, in row order, for every estimator
    # (OPTICS included). Permuting it with OPTICS.ordering_ would pair each
    # label with the wrong sample and make the score meaningless.
    cluster_labels = estimator.labels_
    num_labels = len(set(cluster_labels))
    num_samples = len(X)
    if num_labels == 1 or num_labels == num_samples:
        return -1
    return metrics.silhouette_score(X, cluster_labels, metric='sqeuclidean')

def cv_dunn_scorer(estimator, X):
    """Scorer with the (estimator, X) signature: fit `estimator` on `X` and return its Dunn index.

    Parameters
    ----------
    estimator : object
        Clustering estimator that exposes `fit(X)` and, afterwards, `labels_`.
    X : array-like
        Samples to cluster and to score.

    Returns
    -------
    float or int
        Dunn index computed on the pairwise distances of `X`, or 0 when the
        clustering has no label, a single label or as many labels as samples.
    """
    estimator.fit(X)
    # labels_ holds one label per row of X, in row order, for every estimator
    # (OPTICS included); it must not be permuted with OPTICS.ordering_.
    cluster_labels = estimator.labels_
    num_labels = len(set(cluster_labels))
    num_samples = len(X)
    if num_labels == 1 or num_labels == 0 or num_labels == num_samples:
        return 0
    # `dunn` expects a distance matrix, as in calculate_metrics. The previously
    # called `dunn_fast` is not defined or imported anywhere in this notebook.
    return dunn(pairwise_distances(X), cluster_labels)

## Model fitting and hyperparameter search

In [None]:
# Registries filled by the clustering cells, keyed by "<estimator> on <representation>":
# the best estimators with their scores, and the metrics reported in the tables.
estimators, estimator_metrics = OrderedDict(), OrderedDict()

### Centroid-based clustering using K-means

In [None]:
# K-means: grid-search the number of clusters (2..10) by silhouette score
search_params = {'n_clusters': np.arange(2, 11)}

kmeans_results = {}

for key in datasets:
    df = datasets[key]['DataFrame']
    features = datasets[key]['Features']

    # One "fold" that uses every sample for fitting and for scoring, so that
    # GridSearchCV degenerates to a plain grid search.
    cv = [(slice(None), slice(None))]
    gs = GridSearchCV(
        estimator=KMeans(random_state=0),
        param_grid=search_params,
        scoring=cv_silhouette_scorer,
        cv=cv,
        n_jobs=-1,
    )
    res = gs.fit(X=features, y=None)

    # Best configuration of the grid
    mean_scores = res.cv_results_['mean_test_score']
    ind = np.argmax(mean_scores)
    max_score = np.max(mean_scores)
    k = search_params['n_clusters'][ind]
    print(f"Best validation score {max_score:.3f} achieved with {k} clusters")

    # Refit with the best number of clusters and evaluate
    kmeans_estimator = KMeans(n_clusters=k, random_state=0)
    kmeans_name = kmeans_estimator.__class__.__name__
    kmeans = kmeans_estimator.fit(features)
    kmeans_metrics = calculate_metrics(kmeans_estimator, df, features, kmeans.labels_)
    kmeans_results[key] = {'labels': kmeans.labels_, 'metrics': kmeans_metrics}

    entry_name = f"{kmeans_name} on {key}"
    estimators[entry_name] = {
        'estimator': kmeans_estimator,
        'score': max_score,
        'method representation': key,
    }
    estimator_metrics[entry_name] = {
        'Method Representation': key,
        'Estimator': kmeans_name,
        'Dunn Index': kmeans_metrics['dunn_index'],
        'Silhouette Score': kmeans_metrics['silhouette_score'],
        'Adjusted Rand Index': kmeans_metrics['adjusted_rand_score'],
    }


plot_clusters(kmeans, kmeans_results)

### Density-Based Clustering using OPTICS

In [None]:
# OPTICS: grid-search xi and min_samples by silhouette score
search_params = {'xi': np.linspace(0.1, 1, 10), 'min_samples': [2, 5, 10, 15, 20, 25, 30]}
optics_results = {}

for key in datasets:
    df = datasets[key]['DataFrame']
    features = datasets[key]['Features']

    cv = [(slice(None), slice(None))] # Disable cv, only want grid search
    gs = GridSearchCV(estimator=OPTICS(cluster_method='xi'), param_grid=search_params,
                      scoring=cv_silhouette_scorer, cv=cv, n_jobs=-1)

    res = gs.fit(X=features, y=None)

    # Get best configuration
    max_score = np.max(res.cv_results_['mean_test_score'])
    ind = np.argmax(res.cv_results_['mean_test_score'])
    best_params = res.cv_results_['params'][ind]
    k = best_params['min_samples']

    # Visualize best clusters
    print(f"Best validation score {max_score:.3f} achieved with {res.cv_results_['params'][ind]}")
    # Rebuild the estimator with *all* winning hyperparameters. Passing only
    # min_samples would silently fall back to the default xi.
    optics_estimator = OPTICS(cluster_method='xi', **best_params)
    optics_name = optics_estimator.__class__.__name__

    optics_clusters = optics_estimator.fit(features)
    # labels_ is aligned with the rows of `features`. Indexing it with ordering_
    # would permute the labels and mismatch them with the samples, both in the
    # metrics and in the scatter plot.
    optics_labels = optics_clusters.labels_
    optics_metrics = calculate_metrics(optics_estimator, df, features, optics_labels)
    optics_results[key] = {'labels': optics_labels, 'metrics': optics_metrics}

    estimators[f"{optics_name} on {key}"] = {'estimator': optics_estimator, 'score':  max_score, 'method representation': key }
    estimator_metrics[f"{optics_name} on {key}"] = {'Method Representation': key, 'Estimator': optics_name, 'Silhouette Score': optics_metrics['silhouette_score'], 'Dunn Index': optics_metrics['dunn_index'], 'Adjusted Rand Index': optics_metrics['adjusted_rand_score']}

plot_clusters(optics_clusters, optics_results)

### Agglomerative Clustering (Hierarchical)

In [None]:
# Agglomerative clustering: grid-search the number of clusters and the linkage
# by silhouette score. (An additional grid over `affinity` is not searched.)
search_params = {
    'n_clusters': np.arange(2, 11),
    'linkage': ['ward', 'complete', 'average', 'single'],
}
agglomerative_results = {}

for key in datasets:
    df = datasets[key]['DataFrame']
    features = datasets[key]['Features']

    # One "fold" that uses every sample for fitting and for scoring, so that
    # GridSearchCV degenerates to a plain grid search.
    cv = [(slice(None), slice(None))]
    gs = GridSearchCV(
        estimator=AgglomerativeClustering(linkage='ward'),
        param_grid=search_params,
        scoring=cv_silhouette_scorer,
        cv=cv,
        n_jobs=-1,
    )
    res = gs.fit(X=features, y=None)

    # Best configuration of the grid
    mean_scores = res.cv_results_['mean_test_score']
    ind = np.argmax(mean_scores)
    max_score = np.max(mean_scores)
    best_params = res.cv_results_['params'][ind]
    k = best_params['n_clusters']
    linkage = best_params['linkage']
    print(f"Best validation score {max_score:.3f} achieved with {res.cv_results_['params'][ind]}")

    # Refit with the best configuration and evaluate
    agglomerative_estimator = AgglomerativeClustering(n_clusters=k, linkage=linkage)
    agglomerative_name = agglomerative_estimator.__class__.__name__
    agglomerative_clusters = agglomerative_estimator.fit(features)
    agglomerative_metrics = calculate_metrics(
        agglomerative_clusters, df, features, agglomerative_clusters.labels_
    )
    agglomerative_results[key] = {
        'labels': agglomerative_clusters.labels_,
        'metrics': agglomerative_metrics,
    }

    entry_name = f"{agglomerative_name} on {key}"
    estimators[entry_name] = {
        'estimator': agglomerative_estimator,
        'score': max_score,
        'method representation': key,
    }
    estimator_metrics[entry_name] = {
        'Method Representation': key,
        'Estimator': agglomerative_name,
        'Dunn Index': agglomerative_metrics['dunn_index'],
        'Silhouette Score': agglomerative_metrics['silhouette_score'],
        'Adjusted Rand Index': agglomerative_metrics['adjusted_rand_score'],
    }

plot_clusters(agglomerative_clusters, agglomerative_results)

## Select best estimator

In [None]:
# Show every registered entry: the estimator, its grid-search score and its method representation.
estimators.values()

In [None]:
# Pick the registry entry with the highest grid-search (silhouette) score.
scores = [val['score'] for val in estimators.values()]
ind = np.argmax(scores)
# Index with `ind`: a hard-coded 0 would always report the first registered
# estimator, whatever the scores are.
best_entry = list(estimators.values())[ind]
best_estimator = best_entry['estimator']
best_method_representation = best_entry['method representation']
print(f"Best cluster method and representation: {best_estimator.__class__.__name__} on {best_method_representation}\nParams:\n{best_estimator.__dict__}")

In [None]:
# Write configs of the best estimators from each category to file
with open(f"./results/code2vec_{DATASET_NAME}_estimators_config.txt", "w") as writer:
    for conf, score in [(val['estimator'].__dict__,val['score']) for val in estimators.values()]:
        writer.write(f"Estimator config:\n{conf}\nSilhouette Score: {score}\n\n")

In [None]:
def _to_hline_latex(frame):
    """Render `frame` as a LaTeX table (3 decimals, no index) with the
    booktabs rules replaced by plain horizontal lines."""
    return (
        frame.to_latex(index=False, float_format="%.3f")
        .replace('\\toprule', '\\hline')
        .replace('\\midrule', '\\hline')
        .replace('\\bottomrule', '\\hline')
    )


# Write the chosen cluster metrics for the best models to table
latex_table = _to_hline_latex(
    pd.DataFrame(
        list(estimator_metrics.values()),
        columns=['Method Representation', 'Estimator', 'Dunn Index', 'Silhouette Score', 'Adjusted Rand Index'],
    )
)
with open(f"./results/tables/{DATASET_NAME}/code2vec_{DATASET_NAME}_table.tex", "w") as writer:
    writer.write(latex_table)

# Write all cluster metrics to table: one row per estimator and method
# representation. The per-representation result dicts are used because the
# `*_metrics` variables left over from the loops only hold the values of the
# last representation processed.
all_metrics = [
    {'method_representation': representation, **result['metrics']}
    for results in (kmeans_results, optics_results, agglomerative_results)
    for representation, result in results.items()
]
latex_table = _to_hline_latex(pd.DataFrame(all_metrics))
with open(f"./results/tables/{DATASET_NAME}/code2vec_{DATASET_NAME}_all_metrics_table.tex", "w") as writer:
    writer.write(latex_table)

## Visualize Clusters with Kepler Mapper

In [None]:
# Build a Kepler Mapper graph for the best estimator / representation pair
mapper = km.KeplerMapper(verbose=1)

# Lens: 2-D t-SNE projection of the high-dimensional features
projected_data = mapper.fit_transform(
    datasets[best_method_representation]['Features'],
    projection=TSNE(n_components=2),
)

# Create the graph (we cluster on the projected data and suffer projection loss)
graph = mapper.map(projected_data, clusterer=best_estimator, cover=km.Cover(5, 0.3))

# Write the interactive visualization to HTML, with the category id as tooltip
visualization_html = f"./results/figures/{DATASET_NAME}/{DATASET_NAME}_visualization.html"
print("Output graph examples to html")
mapper.visualize(
    graph,
    title=f"{DATASET_NAME} {best_method_representation} Mapper",
    path_html=visualization_html,
    custom_tooltips=datasets[best_method_representation]['DataFrame']['category'].values,
)

# Embed the generated page in the notebook
jupyter.display(path_html=visualization_html)