# Imports 

In [1]:
import os
import gc
import sys
import copy
import glob
import tqdm

%load_ext autoreload
%autoreload 2
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
from torch import nn
import random
import torch
import platform
from typing import Callable, List, Optional, Dict
import numpy as np
import scipy.sparse as sp

import warnings
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer, AutoModel

import torch_geometric
from torch_geometric.data import (
    Data,
    InMemoryDataset,
    Batch
    )
import torch_geometric.datasets as datasets
import torch_geometric.transforms as transforms
from torch_geometric.data import Data
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.nn import global_mean_pool, global_max_pool

# Helper function for visualization.
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, silhouette_score

from unixcoder import UniXcoder

# To ensure determinism
seed = 1234
def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(seed)

# Check versions
print(torch.__version__)
print(torch.version.cuda)
print(platform.python_version())
print(torch_geometric.__version__)

2.2.1
11.8
3.12.2
2.5.0


# Get GNN Embeddings

## Get the Graph Data

In [2]:
from collections import OrderedDict

def get_nodes_edges(inTextFile, add_reverse_edges = False, api_name = None):
  # FD = 0, CD = 1
  # to support the hetero data object as suggested by the documentation 
  nodes_dict = OrderedDict()
  edge_indices_CD = []
  edge_indices_FD = []

  #to support the Data object as used by the Entities dat object as used in RGAT source code
  edge_indices = []
  edge_type = []
  
  # nodes_dict is an index_map
  node_count=0
  fp = open(inTextFile, "r")
    
  file_name = inTextFile.split("/")[-1].strip()
  Lines = fp.readlines()
  
  # Capture the API nodes first
  number_of_api_nodes = 0
  if api_name != None:
    api_name = "." + api_name.split(".")[-1].strip() + "("
    #api_name = api_name[api_name.find("."):] + "("
    for line in Lines:
      nodes = line.split('-->')
      nodes[0], nodes[1] = nodes[0].strip(), nodes[1].strip()
      
      src = nodes[0]  
      if src not in nodes_dict.keys() and api_name in src:
        nodes_dict[src] = node_count
        node_count += 1
        number_of_api_nodes += 1
        
      right_idx = nodes[1].rfind('[')
      dst = nodes[1][:right_idx].strip()
      if dst not in nodes_dict.keys() and api_name in dst:
        nodes_dict[dst] = node_count
        node_count += 1
        number_of_api_nodes += 1
    if number_of_api_nodes == 0:
      print("No API Nodes found!!!!")
    
  # Process each edge
  for line in Lines:

      N = line.split('-->')
      N[0], N[1] = N[0].strip(), N[1].strip()
      
      #t1 = N[0].split('$$')   
      src = N[0].strip()   
      if src not in nodes_dict.keys():
        nodes_dict[src] = node_count
        node_count+=1
        
      #t2 = N[1].split('$$')
      right_idx = N[1].rfind('[')
      dst = N[1][:right_idx].strip()
      if dst not in nodes_dict.keys():
        nodes_dict[dst] = node_count
        node_count+=1

      x = N[1].strip()[right_idx + 1 : -1].strip()
      if(x == 'FD'):
        y=0
        edge_type.append(y)
        edge_indices.append([nodes_dict[src], nodes_dict[dst]])
        if add_reverse_edges:
          edge_type.append(y)
          edge_indices.append([nodes_dict[dst], nodes_dict[src]])
        edge_indices_FD.append([nodes_dict[src], nodes_dict[dst]])
      elif(x == 'CD'): 
        y=1
        edge_type.append(y)
        edge_indices.append([nodes_dict[src], nodes_dict[dst]])
        if add_reverse_edges:
          edge_type.append(y)
          edge_indices.append([nodes_dict[dst], nodes_dict[src]])
        edge_indices_CD.append([nodes_dict[src], nodes_dict[dst]])
      elif(x == 'SE'):
        y=2
        edge_type.append(y)
        edge_indices.append([nodes_dict[src], nodes_dict[dst]])
        if add_reverse_edges:
          edge_type.append(y)
          edge_indices.append([nodes_dict[dst], nodes_dict[src]])
      else:
          print("Edge type not found!!!")
     
  return nodes_dict, edge_indices_FD, edge_indices_CD, edge_indices, edge_type, file_name, number_of_api_nodes

In [None]:
import gc

#Set GPU
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

# Initialize the models
codebert_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
codebert_model = AutoModel.from_pretrained("microsoft/codebert-base")
codebert_model = codebert_model.to(device)

def get_node_embedding_from_codebert(nodes):
    list_of_embeddings = []
    for code_line in nodes.keys():
        code_line = code_line.split("$$")[1].strip()
        code_tokens = codebert_tokenizer.tokenize(code_line, truncation=True, max_length=510)
        tokens = [codebert_tokenizer.cls_token]+code_tokens+[codebert_tokenizer.eos_token]
        tokens_ids = torch.tensor(codebert_tokenizer.convert_tokens_to_ids(tokens))
        tokens_ids = tokens_ids.to(device)
        context_embeddings = codebert_model(tokens_ids[None,:])
        cls_token_embedding = context_embeddings.last_hidden_state[0,0,:]
        list_of_embeddings.append(cls_token_embedding.to("cpu"))
        del tokens_ids
        del context_embeddings
        del cls_token_embedding
    gc.collect()
    torch.cuda.empty_cache()
    return torch.stack(list_of_embeddings)

In [3]:
import gc

#Set GPU
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')

# Initialize the models
checkpoint = "Salesforce/codet5p-110m-embedding"
codet5p_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
codet5p_model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)
codet5p_model.eval()

def get_node_embedding_from_codet5p(nodes):
    list_of_embeddings = []
    for code_line in nodes.keys():
        code_line = code_line.split("$$")[1].strip()
        inputs_tokens = codet5p_tokenizer.encode(code_line, return_tensors="pt").to(device)
        embedding = codet5p_model(inputs_tokens)[0]
        list_of_embeddings.append(embedding.to("cpu"))
        del inputs_tokens
        del embedding
    # gc.collect()
    # torch.cuda.empty_cache()
    return torch.stack(list_of_embeddings)

In [None]:
import gc

#Set GPU
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

# Initialize the models
unixcoder_model = UniXcoder("microsoft/unixcoder-base")
unixcoder_model = unixcoder_model.to(device)
max_source_length= 512

def get_node_embedding_from_unixcoder(nodes):
    list_of_embeddings = []
    for code_line in nodes.keys():
        code_line = code_line.split("$$")[1].strip()
        tokens_ids = unixcoder_model.tokenize([code_line], max_length=512, mode="<encoder-only>")
        source_ids = torch.tensor(tokens_ids).to(device)
        tokens_embeddings, code_embedding = unixcoder_model(source_ids)
        code_embedding = code_embedding.squeeze(0)
        list_of_embeddings.append(code_embedding.to("cpu"))
    return torch.stack(list_of_embeddings)

In [4]:
def create_graph_dataset(folders, add_reverse_edges = False, track_api_nodes = False, remove_small_graphs = False):
  dataset =[]
  MINIMUM_NO_OF_EDGES = 3
  for label, folder in tqdm.tqdm(enumerate(folders)):
    print("\nProcessing: {}\n".format(folder))
    files = glob.glob(os.path.join(folder, '*.txt'))
    print("\nNumber of files: {}\n".format(len(files)))
    random.shuffle(files)
    count = 0
    for file in files:
      if(count == 1500):
         break
          
      # Only process the files having atleast "MINIMUM_NO_OF_EDGES"
      if remove_small_graphs:
        file_fp = open(file, "r")
        file_content = file_fp.readlines()
        lines_count = len(file_content)
        file_fp.close()
        if lines_count < MINIMUM_NO_OF_EDGES:
            continue
    
      if(count % 5 == 0):
          print("\nAt file: {}\n".format(count))

      try:
          if track_api_nodes:
              api_name = folder.split("/")[-1].strip()
          else:
              api_name = None
          nodes_dict, edge_indices_FD, edge_indices_CD, edge_indices, edge_type, file_name, number_of_api_nodes = get_nodes_edges(file, add_reverse_edges = add_reverse_edges, api_name = api_name)
      except Exception as e:
          print("\nError: ", e)
          continue
                    
      if(len(nodes_dict) == 0):
          print("\nNo Data: ", file)
          continue
      #print(nodes_dict, edge_indices_CD, edge_indices_FD, edge_type)

      # Node feature matrix with shape [num_nodes, num_node_features]=(N, 768).
      try:
          with torch.no_grad():
            CodeEmbedding = get_node_embedding_from_codet5p(nodes_dict)
      except Exception as e :
          print("\nError: ", e)
          print(nodes_dict)
          continue
      #print(CodeEmbedding.shape)

      # FIXING DATA FOTMATS AND SHAPE
      x = torch.tensor(CodeEmbedding)
      # print(x.shape)
  
      # data.y: Target to train against (may have arbitrary shape),
      # graph-level targets of shape [1, *]
      label = 1
      y = torch.tensor([label], dtype=torch.long)
      #print(type(y))

      # edge_index (LongTensor, optional) – Graph connectivity in COO format with shape [2, num_edges]
      edge_index_CD = torch.tensor(edge_indices_CD, dtype=torch.long).t().contiguous()
      edge_index_FD = torch.tensor(edge_indices_FD, dtype=torch.long).t().contiguous()
      edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
      edge_attr = torch.tensor(edge_type, dtype=torch.long).t().contiguous()
      #print(edge_index_CD, edge_index_FD, edge_index, edge_type)
  
      data = Data(edge_index=edge_index, edge_attr=edge_attr, x=x)
      data.id = torch.tensor([count])
      data.y = y
      data.number_of_api_nodes = number_of_api_nodes
      data.api = file_name
      dataset.append(data)
      count += 1
    print("\nTotal files processed: ", count)
    
  return dataset

In [None]:
CODE2SEQ_PDG_FOLDER_LOCATION = "./API-Minsuse/Repository/Benchmarks/Code2Seq-Examples/after_pruning_mubench"
project_folders = [os.path.join(CODE2SEQ_PDG_FOLDER_LOCATION, name) for name in os.listdir(CODE2SEQ_PDG_FOLDER_LOCATION) if os.path.isdir(os.path.join(CODE2SEQ_PDG_FOLDER_LOCATION, name))]
print(project_folders)

gnn_dataset = create_graph_dataset(project_folders, add_reverse_edges = True, track_api_nodes = True, remove_small_graphs = True)
print("\nLength of the dataset: ", len(gnn_dataset))

## Build/Load The Model

### Generic GNN Model

In [None]:
from model_ng import CustomGeneraicGNN

#set up model
num_layer = 5
emb_dim = 256
gnn_type = "rgcn"
JK = "last"
dropout_ratio = 0
input_model_file = "./API-Minsuse/Repository/Graph-Models/AFGNN/output/saved_models/context-prediction/rgcn_1_5_7_e100_code2seq_1.5M_with_se_atleast_3_edges_CodeT5+_sub_model.pth"

gnn_model = CustomGeneraicGNN(num_layer, emb_dim, JK, dropout_ratio, gnn_type)
gnn_model.load_state_dict(torch.load(input_model_file))

print("Loaded the model!!")

Loaded the model!!


In [7]:
# Count the total number of parameters
total_params = sum(p.numel() for p in gnn_model.parameters())

# Count the number of trainable parameters
trainable_params = sum(p.numel() for p in gnn_model.parameters() if p.requires_grad)

print("Total parameters:", total_params)
print("Trainable parameters:", trainable_params)

Total parameters: 1314560
Trainable parameters: 1314560


## Get the Embeddings

### Pool over only API nodes

In [8]:
gnn_embeddings = {}
model_name = "custom-generic-gnn" # "clone-detection" or "custom-generic-gnn"
for i in tqdm.tqdm(range(len(gnn_dataset))):
    number_of_api_nodes = gnn_dataset[i].number_of_api_nodes
    if model_name == "clone-detection":
        node_representation = gnn_model(gnn_dataset[i].x, gnn_dataset[i].edge_index)
    elif model_name == "custom-generic-gnn":
        if gnn_type in ["rgat", "rgcn"]:
            node_representation = gnn_model(x = gnn_dataset[i].x, edge_index = gnn_dataset[i].edge_index, edge_type = gnn_dataset[i].edge_attr)
        else:
            node_representation = gnn_model(x = gnn_dataset[i].x, edge_index = gnn_dataset[i].edge_index)
    graph_representation = global_mean_pool(x = node_representation[:number_of_api_nodes], batch = torch.tensor([0]*(number_of_api_nodes)))[0]
    gnn_dataset[i].embedding = graph_representation.detach().numpy()
    sample_name = gnn_dataset[i].api.split("_")[-4].strip()
    api_name = gnn_dataset[i].api.split("_")[-3].strip()
    #print(api_name)
    try:
        gnn_embeddings[api_name].append([gnn_dataset[i].api, sample_name, gnn_dataset[i].embedding])
    except:
        gnn_embeddings[api_name] = [[gnn_dataset[i].api, sample_name, gnn_dataset[i].embedding]]

 25%|██▍       | 6337/25846 [00:35<01:05, 295.68it/s]

100%|██████████| 25846/25846 [02:32<00:00, 169.86it/s]


# Cluster the GNN Embeddings

In [9]:
import clustering_and_evaluation 
import importlib
importlib.reload(clustering_and_evaluation)

total_sh, total_db, total_ch, total_ri, total_mu = 0, 0, 0, 0, 0
project_count = 0
cluster_count_list = []

for folder in project_folders:
    folder_name = folder.strip().split("/")[-1]
    print("\nAnalyzing: ", folder_name)
    only_file_names, only_embeddings, ground_truth = [], [], []
    label_mapping = {}
    label = 0
    if folder_name not in gnn_embeddings:
        print(f"\n{folder_name} API not found in Code2Seq Data!!\n")
        continue
    for file_name, sample_name, embedding in gnn_embeddings[folder_name]:
        if sample_name in label_mapping:
            ground_truth.append(label_mapping[sample_name])
        else:
            label_mapping[sample_name] = label
            label += 1
            ground_truth.append(label_mapping[sample_name])
        only_embeddings.append(embedding)
        only_file_names.append(file_name)
        
    print("Total no of examples: {} and no of ground-truth clusters: {}\n".format(len(only_file_names), len(set(ground_truth))))
    silhouette_result, davies_bouldin_result, calinski_harabasz_result, adjusted_rand_result, fowlkes_mallows_result, adjusted_mutual_info_result, v_measure_result, avg_cluster_count, cluster_count  = clustering_and_evaluation.find_and_evaluate_best_clustering(only_embeddings, only_file_names, ground_truth, "Birch", "DB")
    cluster_count_list.append([folder_name, avg_cluster_count, cluster_count])
    if silhouette_result != None and davies_bouldin_result != None:
        project_count += 1
        total_sh += silhouette_result
        total_db += davies_bouldin_result
        total_ri += adjusted_rand_result
        total_mu += adjusted_mutual_info_result
    print("===========================================================================================")
    
print("\nAverage SH score: {} and DB score: {}".format(round(total_sh/project_count, 3), round(total_db/project_count, 3)))
print("Average RI score: {} and MI score: {}".format(round(total_ri/project_count, 3), round(total_mu/project_count, 3)))


Analyzing:  java.nio.ByteBuffer.get
Total no of examples: 575 and no of ground-truth clusters: 1

Silhouette, Davies-Bouldin and Calinski-Harabasz score is not possible with 1 or N clusters


Best silhouette score: 0.10300000011920929 for cluster count: 2
Best davies_bouldin score: 1.113 for cluster count: 65
Best calinski_harabasz score: 68.874 for cluster count: 2
Average best cluster number: 65 using DB score

Clustering for cluster number:  65
Cluster Counts:  {4: 193, 2: 87, 26: 78, 0: 25, 14: 20, 53: 15, 1: 15, 21: 10, 9: 10, 11: 10, 24: 10, 10: 8, 5: 8, 6: 6, 28: 5, 12: 4, 13: 4, 41: 3, 61: 3, 60: 3, 64: 3, 62: 3, 51: 2, 58: 2, 54: 2, 43: 2, 7: 2, 36: 2, 3: 2, 30: 2, 17: 2, 31: 1, 35: 1, 45: 1, 15: 1, 25: 1, 50: 1, 59: 1, 32: 1, 46: 1, 38: 1, 40: 1, 16: 1, 63: 1, 18: 1, 47: 1, 27: 1, 29: 1, 48: 1, 19: 1, 34: 1, 39: 1, 23: 1, 57: 1, 33: 1, 22: 1, 52: 1, 44: 1, 8: 1, 56: 1, 20: 1, 55: 1, 49: 1, 42: 1, 37: 1}
Cluster Mapping:  {4: ['handleEndElementStart_NA_java.nio.ByteBuffer.get_graph_dump.txt', 'getUnsignedInt32_NA_java.nio.ByteBuffer.get_graph_dump.txt', 'getUnsignedInt_NA_java.nio.ByteBuffer.get_graph_dump.txt', 'get32_NA_java.nio.ByteBuffer.get_graph_dump.txt', 'v

# Find Clusters for MuBench Examples

In [None]:
OUTPUT_FOLDER_LOCATION = "./API-Minsuse/Repository/Benchmarks/MuBench/after_pruning"
llm_project_folders = [os.path.join(OUTPUT_FOLDER_LOCATION, name) for name in os.listdir(OUTPUT_FOLDER_LOCATION) if os.path.isdir(os.path.join(OUTPUT_FOLDER_LOCATION, name))]

llm_gnn_dataset = create_graph_dataset(llm_project_folders, add_reverse_edges = True, track_api_nodes = True, remove_small_graphs = False)
print("\nLength of the dataset: ", len(llm_gnn_dataset))

In [11]:
llm_gnn_embeddings = {}
model_name = "custom-generic-gnn" # "clone-detection" or "custom-generic-gnn"
for i in tqdm.tqdm(range(len(llm_gnn_dataset))):
    number_of_api_nodes = llm_gnn_dataset[i].number_of_api_nodes
    if model_name == "clone-detection":
        node_representation = gnn_model(llm_gnn_dataset[i].x, llm_gnn_dataset[i].edge_index)
    elif model_name == "custom-generic-gnn":
        if gnn_type in ["rgat", "rgcn"]:
            node_representation = gnn_model(x = llm_gnn_dataset[i].x, edge_index = llm_gnn_dataset[i].edge_index, edge_type = llm_gnn_dataset[i].edge_attr)
        else:
            node_representation = gnn_model(x = llm_gnn_dataset[i].x, edge_index = llm_gnn_dataset[i].edge_index)
    graph_representation = global_mean_pool(x = node_representation[:number_of_api_nodes], batch = torch.tensor([0]*(number_of_api_nodes)))[0]
    llm_gnn_dataset[i].embedding = graph_representation.detach().numpy()
    sample_name = llm_gnn_dataset[i].api.split("_")[-4].strip()
    api_name = llm_gnn_dataset[i].api.split("_")[-3].strip()
    try:
        llm_gnn_embeddings[api_name].append([llm_gnn_dataset[i].api, sample_name, llm_gnn_dataset[i].embedding])
    except:
        llm_gnn_embeddings[api_name] = [[llm_gnn_dataset[i].api, sample_name, llm_gnn_dataset[i].embedding]]

  0%|          | 0/256 [00:00<?, ?it/s]

100%|██████████| 256/256 [00:00<00:00, 315.27it/s]


In [17]:
api_names = []
file_names = []
labels = []
num_data_points = []
max_examples = []
ground_truth_labels = []
minimum_required_points = [[],[],[],[],[],[],[],[]]
predicted_labels = [[],[],[],[],[],[],[],[]]

THRESHOLDS = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4] # 0.2 for 20% of largest cluster

def find_clusters_for_llm_data(api_name, cluster_no, cluster_count):
    only_file_names, only_embeddings, ground_truth = [], [], []
    label_mapping = {}
    label = 0
    for file_name, sample_name, embedding in gnn_embeddings[api_name]:
        #print(sample_name)
        if sample_name in label_mapping:
            ground_truth.append(label_mapping[sample_name])
        else:
            label_mapping[sample_name] = label
            label += 1
            ground_truth.append(label_mapping[sample_name])
        only_embeddings.append(embedding)
        only_file_names.append(file_name)
    
    print("\nTotal datapoints: {} and no of clusters: {}\n".format(len(only_embeddings), cluster_no))
    birch_model = Birch(n_clusters = cluster_no, threshold = 3.0)
    birch_model.fit(only_embeddings)

    mubench_api_name = None
    for name in llm_gnn_embeddings:
        if api_name.endswith("." + name):
            mubench_api_name = name
            break
    else:
        print("MuBench API name not found!!")
        return
    
    for file_name, sample_name, embedding in llm_gnn_embeddings[mubench_api_name]:
        cluster_label = birch_model.predict(embedding.reshape(1, -1))
        print("For eaxample: {}, cluster label: {}".format(file_name, cluster_label[0]))
        
        api_names.append(api_name)
        file_names.append(file_name)
        labels.append(cluster_label[0])
        num_data_points.append(cluster_count[cluster_label[0]])
        
        # Get the first cluster label and count as a single variable in the format "label: count"
        # first_label, first_count = next(iter(cluster_count.items()))
        # first_cluster = f"{first_label}: {first_count}"
        # max_examples.append(first_cluster)
        
        # Predict label
        ground_truth_label = int(file_name.strip().split("_")[-5])
        ground_truth_labels.append(ground_truth_label)
        total_data_points = 0
        for e in cluster_count:
            total_data_points += cluster_count[e]
        max_examples.append(total_data_points)
        
        for i in range(0, len(THRESHOLDS)):
            minimum_required_point = int(total_data_points * THRESHOLDS[i])
            if num_data_points[-1] >= minimum_required_point:
                predicted_label = 0
            else:
                predicted_label = 1
            minimum_required_points[i].append(minimum_required_point)
            predicted_labels[i].append(predicted_label)

In [18]:
# api_list = [["java.io.BufferedReader.readLine", 75],
#            ["java.lang.ClassLoader.loadClass", 63],
#            ["java.lang.Thread.start", 122],
#            ["java.net.URL.openConnection", 2],
#            ["java.sql.DriverManager.getConnection", 87],
#            ["java.sql.Statement.execute", 67],
#            ["java.lang.String.charAt", 225],
#            ["java.lang.String.getBytes", 265],
#            ["java.lang.String.substring", 217],
#            ["java.io.PrintWriter.write", 11],
#            ["java.security.MessageDigest.getInstance", 2]]

for api_name, cluster_no, cluster_count in cluster_count_list:
    print("\nProcessing: {}".format(api_name))
    find_clusters_for_llm_data(api_name, cluster_no, cluster_count)



Processing: java.nio.ByteBuffer.get

Total datapoints: 575 and no of clusters: 65

For eaxample: wrapBuffer_1_NA_ByteBuffer.get_graph_dump.txt, cluster label: 26
For eaxample: wrapBuffer_0_NA_ByteBuffer.get_graph_dump.txt, cluster label: 2
For eaxample: FlipBuffer_0_NA_ByteBuffer.get_graph_dump.txt, cluster label: 26
For eaxample: FlipBuffer_1_NA_ByteBuffer.get_graph_dump.txt, cluster label: 26

Processing: java.util.ArrayList.remove

Total datapoints: 250 and no of clusters: 73

For eaxample: CheckMarkersForNull_0_NA_ArrayList.remove_graph_dump.txt, cluster label: 2
For eaxample: CheckMarkersForNull_1_NA_ArrayList.remove_graph_dump.txt, cluster label: 5

Processing: java.util.stream.IntStream.range

Total datapoints: 28 and no of clusters: 10

For eaxample: WithTerminalOperation_0_NA_IntStream.range_graph_dump.txt, cluster label: 9
For eaxample: WithTerminalOperation_1_NA_IntStream.range_graph_dump.txt, cluster label: 0

Processing: java.util.HashMap.get

Total datapoints: 1500 and n

In [None]:
import pandas as pd
output_path = './API-Minsuse/Repository/Graph-Models/AFGNN/temp/results_mubench_afgnn(3.0).xlsx'

df = pd.DataFrame({
    'API name': api_names,
    'Class name': file_names,
    'Cluster Label': labels,
    'Number of Data-Points': num_data_points,
    'Total examples': max_examples,
    'Ground truth label': ground_truth_labels,
    'MR1 (5%)': minimum_required_points[0],
    'MR2 (10%)': minimum_required_points[1],
    'MR3 (15%)': minimum_required_points[2],
    'MR4 (20%)': minimum_required_points[3],
    'MR5 (25%)': minimum_required_points[4],
    'MR6 (30%)': minimum_required_points[5],
    'MR7 (35%)': minimum_required_points[6],
    'MR8 (40%)': minimum_required_points[7],
    'PL1 (5%)': predicted_labels[0],
    'PL2 (10%)': predicted_labels[1],
    'PL3 (15%)': predicted_labels[2],
    'PL4 (20%)': predicted_labels[3],
    'PL5 (25%)': predicted_labels[4],
    'PL6 (30%)': predicted_labels[5],
    'PL7 (35%)': predicted_labels[6],
    'PL8 (40%)': predicted_labels[7]
})

df.to_excel(output_path, index=False)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

output_path = './API-Minsuse/Repository/Graph-Models/AFGNN/temp/evaluation_mubench_afgnn(3.0).xlsx'
accuracies, precisions, recalls, f1_scores = [], [], [], []

# Calculate metrics
for i in range(0, len(THRESHOLDS)):
    accuracy = accuracy_score(ground_truth_labels, predicted_labels[i])
    precision = precision_score(ground_truth_labels, predicted_labels[i])
    recall = recall_score(ground_truth_labels, predicted_labels[i])
    f1 = f1_score(ground_truth_labels, predicted_labels[i])
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("================================")
    
df = pd.DataFrame({
    'Threshold': THRESHOLDS,
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'F1 Score': f1_scores
})
df.to_excel(output_path, index=False)

Accuracy: 0.55
Precision: 0.56
Recall: 0.49
F1 Score: 0.52
Accuracy: 0.58
Precision: 0.57
Recall: 0.69
F1 Score: 0.62
Accuracy: 0.57
Precision: 0.55
Recall: 0.71
F1 Score: 0.62
Accuracy: 0.53
Precision: 0.52
Recall: 0.84
F1 Score: 0.64
Accuracy: 0.53
Precision: 0.52
Recall: 0.85
F1 Score: 0.64
Accuracy: 0.53
Precision: 0.52
Recall: 0.86
F1 Score: 0.65
Accuracy: 0.52
Precision: 0.51
Recall: 0.88
F1 Score: 0.65
Accuracy: 0.52
Precision: 0.51
Recall: 0.88
F1 Score: 0.65
