In [859]:
# master.ipynb

#!pip install --user textacy

In [860]:
#%pip install -r requirements.txt
#import textacy
#print(textacy.__version__)

In [None]:
%load_ext autoreload
%autoreload 2
import nltk
nltk.download('punkt_tab')
from lib.data_loader import DataLoader
from lib.augmenter import Augmenter
import lib.configurator as configurator
from lib.classifier import Classifier
from lib.feature_extractor import FeatureExtractor
import lib.source_code_processing.methods_identifier_extractor as MethodsIdentifiersExtractor
from lib.method_classifier import MethodClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np
import datetime
import pandas as pd
import os
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from collections import Counter
import math
#from threadpoolctl import threadpool_limits

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\a258142\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [862]:
def encode_labels(y_train, y_eval_functions, y_eval_files):
    # Add "-1" class to labels
    y_eval_with_null = np.append(y_eval_functions, "~unknown")
    
    # Combine labels from both y_train and y_eval
    all_labels = np.concatenate((y_train, y_eval_with_null, y_eval_files))
    
    # Create a single LabelEncoder instance and fit it on all labels
    label_encoder = LabelEncoder()
    label_encoder.fit(all_labels)
    
    # Encode both y_train and y_eval using the same label encoder
    y_train_encoded = label_encoder.transform(y_train)
    y_eval_functions_encoded = label_encoder.transform(y_eval_functions)
    y_eval_files_encoded = label_encoder.transform(y_eval_files)

    return label_encoder, y_train_encoded, y_eval_functions_encoded, y_eval_files_encoded


In [863]:
def decode_labels(label_encoder, encoded_array):
    def recursive_decode(value):
        # If the value is a scalar, decode it
        if np.isscalar(value):
            try:
                return label_encoder.inverse_transform([value])[0]
            except ValueError:
                return "~unknown"
        # If the value is an array/list, decode each element recursively
        elif isinstance(value, (list, np.ndarray)):
            return [recursive_decode(v) for v in value]
        else:
            return "~unknown"

    # Start the decoding process
    return recursive_decode(encoded_array)
    """decoded_array = []
    for encoded_value in encoded_array:
        if encoded_value == -1:
            decoded_array.append("~unknown")
        else:
            decoded_array.append(label_encoder.inverse_transform([encoded_value])[0])
    return decoded_array"""

In [864]:

def save_results(now, n_o_classes, preprocessing_options, resampling_outlier_options,              
                 classifier, hyperparameters, metrics, training_directory, evaluation_directory, method_predictions, N, X_train_tokens,
                 apply_stemming, apply_lemmitization, do_clean_code,do_remove_stopwords,class_sizes, 
                 n_o_methods,n_LoC, n_LoC_mean, n_LoC_median, n_LoC_std, X_eval_mean_entropy, X_eval_mean_density, do_map_files,do_merge_training_data):
     # Initialize default values
    data_creation_method = 'Paragraphs'
    vectorization_model = 'None'
    do_resampling = 'None'
    n_neighbours = 'None'
    do_outlier_filtering = 'None'
    contamination = 'None'
    n_neighbours_lof = 'None'
    
    if preprocessing_options and preprocessing_options.get('data_creation', {}).get('data_creation_method'):
        data_creation_method = preprocessing_options['data_creation']['data_creation_method']
        vectorization_model = preprocessing_options.get('vectorization', {}).get('method', 'None')
    
    if resampling_outlier_options:
        do_resampling = resampling_outlier_options.get('resampling', {}).get('do_resample', 'None')
        n_neighbours = resampling_outlier_options.get('resampling', {}).get('n_neighbours', 'None')
        do_outlier_filtering = resampling_outlier_options.get('outlier_filtering', {}).get('do_filter', 'None')
        contamination = resampling_outlier_options.get('outlier_filtering', {}).get('contamination', 'None')
        n_neighbours_lof = resampling_outlier_options.get('outlier_filtering', {}).get('n_neighbours', 'None')
        
    # Check if the results file exists to determine if we need to create a new file or append to an existing one
    if not os.path.exists('results.csv'):
        max_id = 0
    else:
        results_df = pd.read_csv('results.csv')
        max_id = results_df['ID'].max() if len(results_df) > 0 else 0

    # Increment the max ID to use for new entries
    max_id += 1

    if classifier == 'kNN':
        k = hyperparameters['n_neighbors']
        classifier = f'{k}-NN'
    
    # Append new row to results.csv
    new_row = pd.DataFrame({
        'ID': [max_id],
        'Timestamp': [now],
        'data_creation_method': [data_creation_method],
        'vectorization_model': [vectorization_model],
        'do_resample': [do_resampling],
        'n_neighbours': [n_neighbours],
        'do_outlier_filtering': [do_outlier_filtering],
        'contamination': [contamination],
        'n_neighbours_lof': [n_neighbours_lof],
        'classifier': [classifier],
        'classifier_hyperparams': [hyperparameters],
        'Training_Directory': [training_directory],
        'Evaluation_Directory': [evaluation_directory],
        'Fraction of data after outlier filtering': [metrics['data_fraction_after_outlier_filtering']],
        'Fraction of data classified after outlier filtering': [metrics['fraction_classified']],
        'Accuracy': [metrics['accuracy']],
        'Precision': [metrics['precision']],
        'Recall': [metrics['recall']],
        'F1_Score': [metrics['f1_score']],
        'Number of classes': [n_o_classes],
        'Top-N': [N],  # Adding the new column for the number of top classes considered (N)
        'n_tokens': [X_train_tokens],
        'Stemming':[apply_stemming],
        'Lemmitize':[apply_lemmitization],
        'Clean code':[do_clean_code],
        'Remove stop words':[do_remove_stopwords],
        'Class sizes:': [class_sizes],
        '#codePoints:':[n_o_methods],
        'Mean nb density': [X_eval_mean_density],
        'Mean nb entropy': [X_eval_mean_entropy],
        'n_LoC': [n_LoC],
        'n_LoC_mean': [n_LoC_mean],
        'n_LoC_median': [n_LoC_median],
        'n_LoC_std': [n_LoC_std],
        'Mapped files': [do_map_files],
        'do_merge_training_data': [do_merge_training_data]
    })

    # Save the new row to results.csv
    new_row.to_csv('results.csv', mode='a', header=not os.path.exists('results.csv'), index=False)

    # Save to classifications_report.csv
    method_predictions['ID'] = max_id  # Set the ID for the current predictions
    # Reorder columns to have 'ID' as the first column
    method_predictions = method_predictions[['ID', 'Best Prediction','True Class','Entropy','Density','Lines of code','Non-generic terms','Unique non-generic terms','n_ext_includes','n_comments','n_NL_words']]# + [col for col in method_predictions.columns if col != 'ID']]
    method_predictions.to_csv('classifications_report.csv', mode='a', header=not os.path.exists('classifications_report.csv'), index=False)


In [865]:
# not used
def evaluate_predictions(y_true, y_pred, n_original_methods):
    # Filter out "Unknown" class from y_true and y_pred
    filtered_indices = [i for i, predicted_label in enumerate(y_pred) if predicted_label != "~unknown"]
    y_true_filtered = [y_true[i] for i in filtered_indices]
    y_pred_filtered = [y_pred[i] for i in filtered_indices]

    fractionClassified = len(y_true_filtered) / len(y_true)
    remainingDataAfterOutlierFiltering = len(y_true) / n_original_methods
    # Calculate metrics
    accuracy = accuracy_score(y_true_filtered, y_pred_filtered)
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_true_filtered, y_pred_filtered, average='macro') # wrt unknown predictions

    metrics = {
        'data_fraction_after_outlier_filtering': remainingDataAfterOutlierFiltering,
        'fraction_classified': fractionClassified,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }
    return metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def evaluate_topN_predictions(method_predictions_df, n_original_methods, N):
    """
    Evaluate the predictions for the top N classes.

    :param method_predictions_df: DataFrame containing method predictions and true classes.
    :param n_original_methods: The original number of methods.
    :param N: The number of top predictions to consider.
    :return: Dictionary of evaluation metrics.
    """
    # Identify the correct column for the true class labels
    #true_class_column = method_predictions_df.columns.intersection(['True Class 1', 'True Class']).tolist()
    #true_class_column = method_predictions_df['True Class'].tolist()

    #if not true_class_column:
       # raise ValueError("Could not find a column containing true class labels.")
    
    #true_class_column = true_class_column[0]
    
    # Filter out "Unknown" class from predictions
    method_predictions_filtered = method_predictions_df[method_predictions_df['Predicted Class 1'] != "~unknown"]
    
    fractionClassified = len(method_predictions_filtered) / len(method_predictions_df)
    remainingDataAfterOutlierFiltering = len(method_predictions_df) / n_original_methods

    # Create a new column for the best prediction
    def get_best_prediction(row):
        true_class = row['True Class']
        topN_predictions = row[[f'Predicted Class {i+1}' for i in range(N)]].values
        if true_class in topN_predictions:
            return true_class
        else:
            return row['Predicted Class 1']

    method_predictions_filtered['Best Prediction'] = method_predictions_filtered.apply(get_best_prediction, axis=1)
    method_predictions_df['Best Prediction'] = method_predictions_df.apply(get_best_prediction, axis=1)

    # Calculate metrics
    accuracy = accuracy_score(method_predictions_filtered['True Class'], method_predictions_filtered['Best Prediction'])
    precision, recall, f1_score, _ = precision_recall_fscore_support(method_predictions_filtered['True Class'],
                                                                     method_predictions_filtered['Best Prediction'],
                                                                     average='macro',
                                                                     zero_division=0)

    metrics = {
        'data_fraction_after_outlier_filtering': remainingDataAfterOutlierFiltering,
        'fraction_classified': fractionClassified,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }
    return metrics, method_predictions_df


In [867]:
# Function to save data into a dictionary
def save_data(X_train, y_train, X_train_embeddings,
                            X_eval_function_content, y_eval_function_content,
                            X_eval_function_content_embeddings, function_ids,
                            X_eval_files_content, y_eval_files_content,
                            X_eval_files_content_embeddings, file_ids):
    saved_data = {
        'X_train': X_train,
        'y_train': y_train,
        'X_train_embeddings': X_train_embeddings,
        'X_eval_function_content': X_eval_function_content,
        'y_eval_function_content': y_eval_function_content,
        'X_eval_function_content_embeddings': X_eval_function_content_embeddings,
        'function_ids': function_ids,
        'X_eval_files_content': X_eval_files_content,
        'y_eval_files_content': y_eval_files_content,
        'X_eval_files_content_embeddings': X_eval_files_content_embeddings,
        'file_ids': file_ids
    }
    return saved_data

In [868]:
# Function to load data from a dictionary
def load_data(saved_data):
    return (saved_data['X_train'], saved_data['y_train'], saved_data['X_train_embeddings'],
            saved_data['X_eval_function_content'], saved_data['y_eval_function_content'],
            saved_data['X_eval_function_content_embeddings'],saved_data['function_ids'],
            saved_data['X_eval_files_content'], saved_data['y_eval_files_content'],
            saved_data['X_eval_files_content_embeddings'],saved_data['file_ids'])


In [869]:
def find_parent_classes_using_list(directory_path, parent_paths):
    parent_classes = {}

    # Normalize the parent paths to absolute paths
    absolute_parent_paths = [os.path.abspath(os.path.join(directory_path, parent_path)) for parent_path in parent_paths]

    # Create a mapping from absolute parent paths to their unique parent class names
    parent_class_map = {}
    for parent_path in absolute_parent_paths:
        relative_parent_path = os.path.relpath(parent_path, directory_path)
        top_level_dir, sub_dir = os.path.split(relative_parent_path)
        parent_class_name = f"{os.path.basename(top_level_dir)}_{sub_dir}"
        parent_class_map[parent_path] = parent_class_name

    # Traverse the directory structure and map files to their parent classes
    for root, dirs, files in os.walk(directory_path):
        # Normalize root path
        normalized_root = os.path.abspath(root)

        # Check if the current root is under any of the specified parent paths
        for parent_path in absolute_parent_paths:
            if normalized_root.startswith(parent_path):
                parent_class = parent_class_map[parent_path]
                for file_name in files:
                    if file_name.endswith((".cpp", ".cc", ".java")):
                        file_key = os.path.splitext(file_name)[0]
                        parent_classes[file_key] = parent_class

    return parent_classes


In [870]:
def find_parent_classes(directory_path, parent_folder):
    parent_classes = {}
    for root, dirs, files in os.walk(directory_path):
        if (root.endswith(parent_folder)) and dirs:
            for dir_name in dirs:
                parent_class = dir_name
                for _, _, files in os.walk(os.path.join(root, dir_name)):
                    for file in files:
                        if file.endswith((".cpp", ".cc", ".java","c")):
                            file_key = os.path.splitext(file)[0]
                            parent_classes[file_key] = parent_class
    return parent_classes

In [871]:
def find_parent_classes(directory_path, parent_folder):
    parent_classes = {}
    for root, dirs, files in os.walk(directory_path):
        if root.endswith(parent_folder) and dirs:
            for dir_name in dirs:
                parent_class = dir_name
                for sub_root, _, files in os.walk(os.path.join(root, dir_name)):
                    for file in files:
                        if file.endswith((".cpp", ".cc", ".java", "c")):
                            # Use the full relative file path as the key
                            relative_file_path = os.path.relpath(os.path.join(sub_root, file), directory_path)
                            parent_classes[relative_file_path] = parent_class
    return parent_classes


In [872]:
def convert_labels_to_parent_classes(y_train, class_dict):
    return [class_dict[label] if label in class_dict else label for label in y_train]

def print_unmpaped_classes(y_train, class_dict):
    converted_labels = []
    unmapped_classes = set()
    
    for label in y_train:
        if label in class_dict:
            converted_labels.append(class_dict[label])
        else:
            #print('Unmapped: ', label)
            converted_labels.append(label)
            unmapped_classes.add(label)
    
    return

def update_class_names_with_parent_classes(df_src, class_dict):
    def map_class_name(class_name):
        if class_name not in class_dict:
            #print('Did not find class', class_name)
            return class_name
        else:
            #print('Found class', class_name)
            return class_dict[class_name]

    df_src['Original class'] = df_src['Class']
    df_src['Class'] = df_src['Class'].apply(map_class_name)
    return df_src

In [873]:
def filter_duplicate_data_points(X_train, y_train):
    # Initialize a dictionary to store the indices of X_train associated with each unique data point
    data_point_indices = {}

    # Iterate through the training data to populate the dictionary
    for idx, (data_point, label) in enumerate(zip(X_train, y_train)):
        data_point_str = str(data_point)  # Convert the data point to a string to use it as a dictionary key
        if data_point_str not in data_point_indices:
            data_point_indices[data_point_str] = [idx]  # Initialize a new list with the index
        else:
            data_point_indices[data_point_str].append(idx)  # Append the index to the existing list

    # Initialize lists to store filtered X_train and y_train
    filtered_X_train = []
    filtered_y_train = []

    # Iterate through the dictionary to filter out data points associated with multiple labels
    for data_point_str, indices in data_point_indices.items():
        if len(indices) == 1:
            # If the data point is associated with only one label, keep it
            filtered_X_train.append(X_train[indices[0]])
            filtered_y_train.append(y_train[indices[0]])
        #else:
            #print('Removed data point: ', data_point_str)

    # Convert the filtered lists back to arrays
    X_train_filtered = np.array(filtered_X_train)
    y_train_filtered = np.array(filtered_y_train)

    return X_train_filtered, y_train_filtered

In [874]:
def filter_generic_data_points(X_train, y_train):
    #Removes data points that appear in all classes
    # Initialize a dictionary to store the unique labels associated with each data point
    data_point_labels = {}
    unique_labels = set(y_train)  # Get all unique labels in y_train

    # Iterate through the training data to populate the dictionary
    for idx, (data_point, label) in enumerate(zip(X_train, y_train)):
        data_point_str = str(data_point)  # Convert the data point to a string to use it as a dictionary key
        if data_point_str not in data_point_labels:
            data_point_labels[data_point_str] = {'labels': {label}, 'indices': [idx]}  # Store labels and indices
        else:
            data_point_labels[data_point_str]['labels'].add(label)  # Add the label to the set
            data_point_labels[data_point_str]['indices'].append(idx)  # Store all indices

    # Initialize lists to store filtered X_train and y_train
    filtered_X_train = []
    filtered_y_train = []

    # Iterate through the dictionary to filter out data points associated with all labels
    for data_point_str, info in data_point_labels.items():
        if info['labels'] != unique_labels:
            # If the data point is not associated with all unique labels, keep it
            for idx in info['indices']:  # There may be multiple instances with the same label
                filtered_X_train.append(X_train[idx])
                filtered_y_train.append(y_train[idx])

    # Convert the filtered lists back to arrays
    X_train_filtered = np.array(filtered_X_train)
    y_train_filtered = np.array(filtered_y_train)

    return X_train_filtered, y_train_filtered

def filter_frequent_entries(X_train, y_train, max_occurrences):
    # Count occurrences of each entry in X_train
    from collections import Counter
    entry_counts = Counter(X_train)
    
    # Create a boolean mask to filter entries that occur more than max_occurrences
    mask = [entry_counts[x] <= max_occurrences for x in X_train]
    
    # Apply the mask to X_train and y_train
    X_train_filtered = [x for x, keep in zip(X_train, mask) if keep]
    y_train_filtered = [y for y, keep in zip(y_train, mask) if keep]
    
    return X_train_filtered, y_train_filtered

def remove_duplicates(X_train, y_train):
    """
    Removes duplicate entries in X_train and y_train based on unique (X_train, y_train) pairs.
    Keeps only one entry for each unique pair.

    Args:
        X_train (list): List of text data points.
        y_train (list): List of corresponding class labels.

    Returns:
        tuple: Cleaned X_train and y_train lists.
    """
    # Use a set to track unique (text, label) pairs
    seen = set()
    unique_X_train = []
    unique_y_train = []

    for x, y in zip(X_train, y_train):
        # Create a tuple for the current pair
        pair = (x, y)
        if pair not in seen:
            seen.add(pair)
            unique_X_train.append(x)
            unique_y_train.append(y)

    return unique_X_train, unique_y_train

def remove_non_strings(X_train, y_train):
    """
    Removes non-string entries from X_train and y_train based on X_train containing strings only.
    Removes any pair where X_train entry is not a string.

    Args:
        X_train (list): List of data points (potentially non-string).
        y_train (list): List of corresponding class labels.

    Returns:
        tuple: Cleaned X_train and y_train lists with only string data points.
    """
    cleaned_X_train = []
    cleaned_y_train = []

    for x, y in zip(X_train, y_train):
        # Only add the pair if the entry in X_train is a string
        if isinstance(x, str):
            cleaned_X_train.append(x)
            cleaned_y_train.append(y)

    return cleaned_X_train, cleaned_y_train


In [875]:
def filter_classes_not_in_training_data(df_src, X_train, y_train):
    # Get unique classes from y_train
    unique_classes_in_training = set(y_train)
    
    # Filter df_src to keep only rows where 'Class' exists in the training data (y_train)
    df_src_filtered = df_src[df_src['Class'].isin(unique_classes_in_training)]
    
    return df_src_filtered

In [876]:
def filter_empty_data_points(X_train, y_train):
    """
    Filters out empty rows where either an entry in X_train or y_train is empty.

    Args:
        X_train (list): List of input data points.
        y_train (list): List of corresponding labels.

    Returns:
        tuple: Filtered X_train and y_train lists.
    """
    filtered_X_train = []
    filtered_y_train = []
    
    for x, y in zip(X_train, y_train):
        if x and y:  # Checks if neither x nor y is empty
            filtered_X_train.append(x)
            filtered_y_train.append(y)
    
    return filtered_X_train, filtered_y_train

In [877]:
def merge_data_with_token_limit(X_train, y_train, max_tokens=1024):
    """
    Merges the text data into one or more data points per unique class, 
    using cells from X_train, controlled by a max tokens limit.
    
    Parameters:
    - X_train: List of text data points.
    - y_train: List of corresponding class labels.
    - max_tokens: Maximum token limit for each data point.
    
    Returns:
    - merged_X: List of merged text data points.
    - merged_y: List of corresponding class labels for the merged data points.
    """
    from collections import defaultdict

    # Initialize a dictionary to hold merged text for each class
    merged_X = []
    merged_y = []
    
    current_text = ""
    current_label = None
    current_token_count = 0

    for text, label in zip(X_train, y_train):
        # Split the current text into words (tokens)
        tokens = text.split()
        token_count = len(tokens)

        # If adding the current text exceeds the token limit, save the previous text as a datapoint
        if current_label is not None and label == current_label and current_token_count + token_count > max_tokens:
            merged_X.append(current_text.strip())  # Save the accumulated text
            merged_y.append(current_label)
            current_text = ""
            current_token_count = 0

        # Add the current text to the accumulated data
        current_text += text + " "
        current_token_count += token_count
        current_label = label

    # Don't forget to add the last accumulated data if it exists
    if current_text.strip():
        merged_X.append(current_text.strip())
        merged_y.append(current_label)

    return merged_X, merged_y


In [878]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

def camel_to_words(text):
    # Convert CamelCase to words (e.g., isDataReady -> is Data Ready)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    # Convert snake_case or snakyCamelCase to words (e.g., current_odometry -> current odometry)
    text = re.sub(r'_([a-zA-Z])', r' \1', text)
    return text


def replace_stopwords_with_punctuation(text):
    # Replace specific stopwords with punctuation
    stopwords_to_punctuation = {
        ';': '.', 
        '{': '.', 
        '}': '.'
    }
    
    for stopword, punctuation in stopwords_to_punctuation.items():
        text = text.replace(stopword, punctuation)

    return text

def replace_operators_with_natural_language(text):
    # Replace operators with natural language equivalents
    replacements = {
        '==': 'is equal to',
        '!=': 'is not equal to',
        '>=': 'is greater than or equal to',
        '<=': 'is less than or equal to',
        '>': 'is greater than',
        '<': 'is less than',
        '=': 'is defined as',
        '&&': 'and',
        '||': 'or',
        '+': 'plus',
        '-': 'minus',
        #'*': 'times',
        '/': 'divided by',
    }
    
    for symbol, replacement in replacements.items():
        text = text.replace(symbol, replacement)
    
    return text
"""
def clean_code_snippet(text, apply_stemming=True, apply_lemmatization=True):
    # Initialize the lemmatizer and stemmer
    lemmatizer = WordNetLemmatizer() if apply_lemmatization else None
    stemmer = PorterStemmer() if apply_stemming else None

    # Step 1: Convert CamelCase and snake_case to words
    text = camel_to_words(text)

    # Step 2: Handle cases where words are concatenated with numbers or special symbols
    text = re.sub(r'([a-zA-Z])(\d)', r'\1 \2', text)  # Separate words from numbers
    text = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', text)  # Separate numbers from words
    text = re.sub(r'([a-zA-Z])([^a-zA-Z\s])', r'\1 \2', text)  # Add space before non-letters
    text = re.sub(r'([^a-zA-Z\s])([a-zA-Z])', r'\1 \2', text)  # Add space after non-letters

    # Step 3: Add punctuation based on specific patterns (e.g., stopwords)
    text = replace_stopwords_with_punctuation(text)

    # Step 4: Replace operators and non-letter characters with meaningful words
    text = replace_operators_with_natural_language(text)

    # Step 5: Replace non-letter characters (except specified punctuation) with spaces
    text = re.sub(r'[^a-zA-Z\s.]+', ' ', text)

       # Step 6: Tokenize the text
    words = word_tokenize(text)

    # Step 7: Apply stemming if enabled
    if apply_stemming:
        words = [stemmer.stem(word) for word in words]

    # Step 8: Apply lemmatization if enabled
    if apply_lemmatization:
        words = [lemmatizer.lemmatize(word) for word in words]

    # Step 9: Join the words back into a single string
    text = ' '.join(words)

    # Step 10: Convert text to lowercase for uniformity
    text = text.lower()

    # Step 11: Normalize multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text"""

def clean_code_snippet(text):
    # Step 1: Replace underscores with spaces
    text = text.replace('_', ' ')
    
    # Step 2: Handle CamelCase (only when a lowercase letter is followed by an uppercase letter)
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    
    # Step 3: Remove special characters and symbols, except spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    
    # Step 4: Convert to lowercase
    text = text.lower()
    
    # Step 5: Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

  """


In [879]:
import re
nltk.download('words')
from nltk.corpus import words
common_keywords = {
    # C++ https://en.cppreference.com/w/cpp/keyword
    'alignas', 'alignof', 'and', 'and_eq', 'asm', 'atomic_cancel', 'atomic_commit', 
    'atomic_noexcept', 'auto', 'bitand', 'bitor', 'bool', 'break', 'case', 'catch', 
    'char', 'char8_t', 'char16_t', 'char32_t', 'class', 'compl', 'concept', 'const', 
    'consteval', 'constexpr', 'constinit', 'const_cast', 'continue', 'co_await', 
    'co_return', 'co_yield', 'decltype', 'default', 'delete', 'do', 'double', 
    'dynamic_cast', 'else', 'enum', 'explicit', 'export', 'extern', 'false', 'float', 
    'for', 'friend', 'goto', 'if', 'inline', 'int', 'long', 'mutable', 'namespace', 
    'new', 'noexcept', 'not', 'not_eq', 'nullptr', 'operator', 'or', 'or_eq', 'private', 
    'protected', 'public', 'reflexpr', 'register', 'reinterpret_cast', 'requires', 'return', 
    'short', 'signed', 'sizeof', 'static', 'static_assert', 'static_cast', 'struct', 'switch', 
    'synchronized', 'template', 'this', 'thread_local', 'throw', 'true', 'try', 'typedef', 
    'typeid', 'typename', 'union', 'unsigned', 'using', 'virtual', 'void', 'volatile', 
    'wchar_t', 'while', 'xor', 'xor_eq',
    # C https://en.cppreference.com/w/c/keyword
    'alignas', 'alignof', 'auto', 'bool', 'break', 'case', 'char', 'const', 'constexpr', 
    'continue', 'default', 'do', 'double', 'else', 'enum', 'extern', 'false', 'float', 
    'for', 'goto', 'if', 'inline', 'int', 'long', 'nullptr', 'register', 'restrict', 'return', 
    'short', 'signed', 'sizeof', 'static', 'static_assert', 'struct', 'switch', 'thread_local', 
    'true', 'typedef', 'typeof', 'typeof_unqual', 'union', 'unsigned', 'void', 'volatile', 'while',
    '_Alignas', '_Alignof', '_Atomic', '_BitInt', '_Bool', '_Complex', '_Decimal128', '_Decimal32', 
    '_Decimal64', '_Generic', '_Imaginary', '_Noreturn', '_Static_assert', '_Thread_local', 
    'if', 'elif', 'else', 'endif', 'ifdef', 'ifndef', 'elifdef', 'elifndef', 'define', 'undef', 
    'include', 'embed', 'line', 'error', 'warning', 'pragma', 'defined', '__has_include', 
    '__has_embed', '__has_c_attribute',

    # java https://docs.oracle.com/javase/tutorial/java/nutsandbolts/_keywords.html
    'abstract', 'continue', 'for', 'new', 'switch',
    'assert', 'default', 'goto', 'package', 'synchronized',
    'boolean', 'do', 'if', 'private', 'this',
    'break', 'double', 'implements', 'protected', 'throw',
    'byte', 'else', 'import', 'public', 'throws',
    'case', 'enum', 'instanceof', 'return', 'transient',
    'catch', 'extends', 'int', 'short', 'try',
    'char', 'final', 'interface', 'static', 'void',
    'class', 'finally', 'long', 'strictfp', 'volatile',
    'const', 'float', 'native', 'super', 'while','true', 'false','null'
 }

# Load English words from NLTK
word_set = set(words.words())

def count_natural_language_words(code_snippet):
    # Step 1: Tokenize the code snippet
    # Split by non-word characters (punctuation, whitespace, etc.)
    tokens = re.findall(r'[a-zA-Z]+', code_snippet)
    
    # Step 2: Check if tokens are valid English words
    natural_language_words = [token for token in tokens if token.lower() not in common_keywords]
    natural_language_words = [token for token in natural_language_words if token.lower() in word_set]
    
    # Step 3: Count the natural language words
    return len(natural_language_words)


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\a258142\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [881]:
import re

def clean_system_and_module_name(df_src_functions, df_src_files, X_train, y_train, system_name):
    """
    Clean the system name from text data and replace module names (labels) with 'module'.
    
    Args:
        df_src_functions (DataFrame): DataFrame containing functions with 'Content' column to clean.
        df_src_files (DataFrame): DataFrame containing files with 'Content' column to clean.
        X_train (list): List of text data points.
        y_train (list): List of corresponding class labels (module names).
        system_name (str): The system name to be replaced with "system".
    
    Returns:
        tuple: Cleaned DataFrames (df_src_functions, df_src_files) and cleaned X_train list.
    """
    # Prepare lowercase versions for matching
    system_name_lower = system_name.lower()
    labels_lower = set(str(label).lower() for label in set(y_train))

    def clean_text(text):
        if not isinstance(text, str):
            return text

        # Replace ' system_name ' (surrounded by spaces) with ' system '
        pattern_system = r'(?i)(?<=\s)' + re.escape(system_name) + r'(?=\s)'
        cleaned = re.sub(pattern_system, ' ', text) # re.sub(pattern_system, ' system ', text) # does not work of system name is first (no whitespace)

        # Replace each label with ' module ' (surrounded by word boundaries or spaces, case-insensitive)
        for label in labels_lower:
            pattern_label = r'(?i)(?<=\s)' + re.escape(label) + r'(?=\s)'
            cleaned = re.sub(pattern_label, ' ', cleaned) # re.sub(pattern_label, ' module ', cleaned)

        # Optional: clean up multiple spaces after replacements
        cleaned = re.sub(r'\s{2,}', ' ', cleaned)

        return cleaned.strip()

    # Apply to df_src_functions and df_src_files
    #df_src_functions['Content'] = df_src_functions['Content'].apply(clean_text)
    #df_src_files['Content'] = df_src_files['Content'].apply(clean_text)

    # Clean X_train
    X_train = [clean_text(text) for text in X_train]

    return df_src_functions, df_src_files, X_train


In [882]:
def remove_common_terms(snippet, method_contents, threshold=0.7):
    # Split the snippet into words
    words = snippet.split()
    
    # Count the frequency of each word across all methods
    all_words = ' '.join(method_contents).split()
    word_frequencies = Counter(all_words)
    total_methods = len(method_contents)
    
    # Calculate IDF (Inverse Document Frequency) for each word
    word_document_count = Counter([word for content in method_contents for word in set(content.split())])
    idf_scores = {word: math.log(total_methods / (1 + word_document_count[word])) for word in word_document_count}
    
    # Create a cleaned snippet by filtering out high-frequency words
    cleaned_words = [word for word in words if idf_scores.get(word, 0) > threshold]
    
    return ' '.join(cleaned_words)

In [883]:
import tiktoken
def count_openai_tokens(X_train, model_name="gpt-4o"):
    # Initialize the tokenizer for the specified model
    tokenizer = tiktoken.encoding_for_model(model_name)
    
    total_tokens = 0
    
    for text in X_train:
        # Encode the text to count tokens
        tokens = tokenizer.encode(text)
        total_tokens += len(tokens)
    
    return total_tokens

In [884]:
def count_tokens(X_train):
    total_tokens = 0
    
    for text in X_train:
        # Split the text into tokens using regex to handle various delimiters
        tokens = re.findall(r'\b\w+\b', text)
        total_tokens += len(tokens)
    
    return total_tokens

In [885]:
def count_words(strings_list):
    """
    Counts the total number of words in a list of strings.
    
    Parameters:
        strings_list (list of str): A list of strings to count words from.
    
    Returns:
        int: The total number of words across all strings in the list.
    """
    if not isinstance(strings_list, list) or not all(isinstance(s, str) for s in strings_list):
        raise ValueError("Input must be a list of strings.")
    
    total_words = sum(len(s.split()) for s in strings_list)
    return total_words

In [886]:
import pandas as pd
import os

def save_training_stats(n_src_functions, n_src_files, n_train_words,n_train_points, training_directory,n_Loc_files,n_Loc_functions, filename="training_stats.csv"):
    # Create a DataFrame with the new data
    new_data = pd.DataFrame({
        "Training Directory": [training_directory],
        "Number of Source Functions": [n_src_functions],
        "Number of Source Files": [n_src_files],
        "Number of Training Words": [n_train_words],
        "Number of Training Points": [n_train_points],
        "Number of LoC files": [n_Loc_files],
        "Number of LoC functions": [n_Loc_functions]
    })

    # Check if the file already exists
    if os.path.exists(filename):
        # Load existing data and append new row
        df_existing = pd.read_csv(filename)
        df_combined = pd.concat([df_existing, new_data], ignore_index=True)
    else:
        df_combined = new_data  # If file doesn't exist, use new data as the DataFrame

    # Save to CSV
    df_combined.to_csv(filename, index=False)

    print(f"Training stats appended to {filename}")

# Example usage:
#save_training_stats(n_src_functions, n_src_files, n_train_words, training_directory)


In [887]:
def calculate_class_centroids(X, y):
    # Convert to DataFrame and add the class labels as a new column
    df = pd.DataFrame(X)
    df['Class'] = y
    
    # Calculate mean embeddings per class
    centroids = df.groupby('Class').mean()  # No need to drop the 'Class' column here
    centroids.index.name = 'Class'
    return centroids

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import rbf_kernel, pairwise_distances
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy, gaussian_kde
import os
from sklearn.feature_extraction.text import CountVectorizer


def compute_jaccard_similarity(X_train, X_eval):
    """
    Computes the Jaccard similarity between the training and evaluation datasets based on their vocabularies.
    
    Parameters:
    - X_train: List of training content.
    - X_eval: List of evaluation content.
    
    Returns:
    - Jaccard similarity (float): Size of intersection divided by size of union of vocabularies.
    """
    # Use CountVectorizer to create vocabularies
    vectorizer_train = CountVectorizer()
    vectorizer_train.fit(X_train)
    vocab_train = set(vectorizer_train.vocabulary_.keys())
    
    vectorizer_eval = CountVectorizer()
    vectorizer_eval.fit(X_eval)
    vocab_eval = set(vectorizer_eval.vocabulary_.keys())
    
    # Calculate Jaccard similarity
    intersection = vocab_train.intersection(vocab_eval)
    union = vocab_train.union(vocab_eval)
    
    jaccard_similarity = len(intersection) / len(union) if len(union) > 0 else 0.0
    
    return jaccard_similarity

def compute_oov_rate(X_train, X_eval):
    """
    Computes vocabulary size and OOV rate for the given datasets using existing libraries.
    
    Parameters:
    - X_train: List of training content.
    - y_train: List of training labels.
    - X_eval: List of evaluation content.
    - training_directory: Directory to save the results.
    - file_name: Name of the output CSV file.
    """
    # Use CountVectorizer to create a vocabulary from the training data
    vectorizer = CountVectorizer()
    vectorizer.fit(X_train)
    vocabulary = vectorizer.vocabulary_
    
    # Prepare to calculate OOV rates functions
    total_words = 0
    oov_count = 0
    
    for content in X_eval:
        words = content.split()
        total_words += len(words)
        oov_count += sum(1 for word in words if word not in vocabulary)
    
    oov_rate = oov_count / total_words if total_words > 0 else 0
    
    return oov_rate

def save_oov_results(oov_results, vocabulary_size, training_directory,inputConfig, file_name):
    """
    Saves OOV results and vocabulary size to CSV files.
    
    Parameters:
    - oov_results: Dictionary containing OOV results.
    - vocabulary_size: Size of the vocabulary.
    - training_directory: Directory to save the results.
    - file_name: Name of the output CSV file.
    """
    # Prepare the results with the training directory
    oov_results['Directory'] = training_directory
    oov_results['Vocabulary Size'] = vocabulary_size
    oov_results["Data creation method"] = inputConfig

    # Convert results to DataFrame
    df_results = pd.DataFrame([oov_results])  # Convert to DataFrame
    
    if not os.path.exists(file_name):
        df_results.to_csv(file_name, index=False)
    else:
        df_results.to_csv(file_name, mode='a', header=False, index=False)

def compute_silhouette_score(embeddings, labels):
    """
    Computes the silhouette score for the embeddings and labels.
    """
    try:
        score = silhouette_score(embeddings, labels)
        return score
    except:
        return 1

def compute_fdr(embeddings, labels):
    """
    Computes the Fisher's Discriminant Ratio (FDR) for the embeddings and labels.
    """
    unique_classes = np.unique(labels)
    overall_mean = np.mean(embeddings, axis=0)
    
    between_class_variance = 0
    within_class_variance = 0
    
    for cls in unique_classes:
        class_embeddings = embeddings[labels == cls]
        class_mean = np.mean(class_embeddings, axis=0)
        n_cls = class_embeddings.shape[0]
        between_class_variance += n_cls * np.sum((class_mean - overall_mean) ** 2)
        within_class_variance += np.sum((class_embeddings - class_mean) ** 2)
    
    fdr = between_class_variance / within_class_variance if within_class_variance > 0 else 1
    return fdr


def compute_entropy(id, labels,training_directory,inputConfig):
    """
    Computes entropy for class distributions.
    """
    counter = Counter(labels)
    total = sum(counter.values())
    proportions = [count / total for count in counter.values()]
    entropy = -sum(p * np.log2(p) for p in proportions if p > 0)
    return entropy



def compute_mmd(X_train, X_test):
    """
    Computes the Maximum Mean Discrepancy (MMD) between two sets of embeddings.
    Uses RBF kernel.

    Args:
        X_train (np.ndarray): shape (n_samples_train, n_features)
        X_test (np.ndarray): shape (n_samples_test, n_features)
        gamma (float): kernel coefficient for RBF

    Returns:
        float: MMD score (lower = more similar)
    """
    # Combine data for pairwise distance calculation
    X = np.vstack([X_train, X_test])
     # Use pairwise Euclidean distances and compute median (excluding self-distances)
    dists = pairwise_distances(X, metric='euclidean')
    dists = dists[np.triu_indices_from(dists, k=1)]
    median_dist = np.median(dists)
    gamma = 1.0 / (2 * median_dist**2) if median_dist > 0 else 1.0
    

    K_XX = rbf_kernel(X_train, X_train, gamma=gamma) # remove gamma to use default
    K_YY = rbf_kernel(X_test, X_test, gamma=gamma)
    K_XY = rbf_kernel(X_train, X_test, gamma=gamma)

    m = X_train.shape[0]
    n = X_test.shape[0]

    mmd = (np.sum(K_XX) - np.trace(K_XX)) / (m * (m - 1)) + \
          (np.sum(K_YY) - np.trace(K_YY)) / (n * (n - 1)) - \
          2 * np.sum(K_XY) / (m * n)

    return mmd



def estimate_density_kde(data, sample_grid):
    """
    Estimate a probability density function using KDE and return
    the probabilities on a fixed sample grid.
    """
    mask = np.std(data, axis=0) > 1e-10
    data = data[:, mask]
    sample_grid = sample_grid[:, mask]
    kde = gaussian_kde(data.T)
    density = kde(sample_grid.T)
    return density / density.sum()  # Normalize to make it a probability distribution

from sklearn.decomposition import PCA

def compute_kl_js_divergence(X_train, X_test, n_grid=500, pca_dim=5):
    
    #min_samples = min(len(X_train), len(X_test))
    #X_train = X_train[np.random.choice(len(X_train), min_samples, replace=False)]
    #X_test = X_test[np.random.choice(len(X_test), min_samples, replace=False)]

    # Reduce dimensionality
    pca = PCA(n_components=pca_dim)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    n_grid = min(100, len(X_train) * 5)
    dim = X_train.shape[1]

    # Build grid
    min_vec = np.minimum(X_train.min(axis=0), X_test.min(axis=0))
    max_vec = np.maximum(X_train.max(axis=0), X_test.max(axis=0))
    grid = np.random.uniform(min_vec, max_vec, size=(n_grid, dim))

    # Estimate KDE
    p_train = estimate_density_kde(X_train, grid)
    p_test = estimate_density_kde(X_test, grid)

    epsilon = 1e-12
    p_train += epsilon
    p_test += epsilon
    p_train /= p_train.sum()
    p_test /= p_test.sum()

    js_div = jensenshannon(p_test, p_train) ** 2
    return js_div

    #kl_div = entropy(p_test, p_train)

    # JS Divergence (symmetrical)
    #js_div = jensenshannon(p_test, p_train) ** 2

    #return js_div # kl_div,

def compute_and_save_data_properties(id,
                                      X_train, 
                                      X_train_embeddings, 
                                      y_train, 
                                      X_eval,
                                      X_eval_embeddings,
                                      y_eval_encoded,
                                      directory,
                                      data_creation_method):
    
    fdr_train = compute_fdr(X_train_embeddings, y_train)
    fdr_eval = compute_fdr(X_eval_embeddings, y_eval_encoded)
    fdr_total = compute_fdr(np.vstack([X_eval_embeddings, X_train_embeddings]), np.hstack([y_eval_encoded, y_train]))

    entropy = compute_entropy(id, y_train, directory, data_creation_method)
    jaccard_similarity = compute_jaccard_similarity(X_train, X_eval)

    divergence = compute_kl_js_divergence(X_train_embeddings, X_eval_embeddings)

    #Covariate shift
    mmd = compute_mmd(X_train_embeddings, X_eval_embeddings)
    #mmd_functions = compute_mmd(X_train_embeddings, X_eval_function_content_embeddings)

    # Create a DataFrame with the entropy result
    df_results = pd.DataFrame({
        'ID': id,
        'Directory': [directory],  # Single value in a list
        'Data creation method': [data_creation_method],  # Single value in a list
        'Entropy': [entropy],  # Single value in a list
        'Jaccard similarity': [jaccard_similarity],
        'Divergence': [divergence],
        'MMD': [mmd],
        'FDR train': [fdr_train],
        'FDR eval': [fdr_eval],
        'FDR total': [fdr_total],
    })
    file_name = 'data_properties.csv'
    #Save results to CSV
    if not os.path.exists(file_name):
        df_results.to_csv(file_name, index=False)
    else:
        df_results.to_csv(file_name, mode='a', header=False, index=False)

    return

In [889]:

# ['paragraphs','sentence_shuffling', 'sentences','rake','yake','random_sentence_augment','synonym_augment','random_word_augment','sentence_spelling_augment','abstractive_summarization_augment']
""" 'data_creation_method': [['paragraphs'],
                                 ['paragraphs','sentences_sliding_window'],
                                 ['paragraphs','decompose_text_with_stopwords'], # not used as sw are removed
                                 ['paragraphs','sentence_back_translation_augment'],
                                 ['paragraphs','abstractive_summarization_augment'],
                                 ['paragraphs','sentences'],
                                 ['paragraphs','rake'],
                                 ['paragraphs','sentence_shuffling'], # same as random sentence aug
                                 ['paragraphs','random_sentence_augment'],
                                 ['paragraphs','synonym_augment'],
                                 ['paragraphs','sentence_spelling_augment'],
                                 ['paragraphs','random_word_augment']],
        # does not take a long time
        'data_creation_method': [['paragraphs'],
                                 ['paragraphs','sentences_sliding_window'],
                                 ['paragraphs','sentences'],
                                 ['paragraphs','rake'],
                                 ['paragraphs','random_sentence_augment'],
                                 ['paragraphs','synonym_augment'],
                                 ['paragraphs','sentence_spelling_augment'],
                                 ['paragraphs','random_word_augment']]
         # does take a long time
    'data_creation_method': [['paragraphs','abstractive_summarization_augment'],
                             ['paragraphs','sentence_back_translation_augment']],
'data_creation_method': [['paragraphs'],
                                ['paragraphs','sentences_sliding_window'],
                                ['paragraphs','sentences'],
                                ['paragraphs','rake'],
                                ['paragraphs','random_sentence_augment'],
                                ['paragraphs','synonym_augment'],
                                ['paragraphs','sentence_spelling_augment'],
                                ['paragraphs','random_word_augment']]"""

# This is constant
preprocessing_options = {
    'data_creation':
    {
        'data_creation_method': [['paragraphs','sentences']],
        'sliding_window_size': [[3]],
        'window_movement': ['non-overlapping']
    },
    'vectorization':
    {
        'method': ['openai-embedding'] # 'unixcoder-base-nine' 'openai-embedding' 'sentencetransformer' 'openai-code-embedding'
    },
}
# Variable
resampling_outlier_options = {
    'resampling':
    {
        'do_resample': [False],
        'n_neighbours': [5], # ['sqrt_min_class','min_class']
    },
    'outlier_filtering':
    {
        'do_filter': [False],
        'contamination': [0.1,'auto'], #dependent on density? - set to auto? ['auto', 0.05, 0.1, 0.2]
        'n_neighbours': [9], # should be dependent on smallest class n_points (max n_points of smallest class)# set to k * density , where k_0 = 30/density (or k * n_points since range is constant)#dependent on density - compare with all training data points. should we really differentiate? - maybe since id's are classified seperately
    }
}

classifier_options= ['kNN']

classifier_hyperparameters = {
    'kNN': {
        'n_neighbors': ['sqrt_min_class'], #['sqrt_min_class','min_class', 1]
        'threshold': [0.01],
        'weights': ['uniform'] #'distance','uniform'
    }
}

In [890]:
# Generate all possible configurations for machine learning
machine_learning_configurations = configurator.generate_machine_learning_configs(classifier_options, classifier_hyperparameters)

# Generate all possible configurations for loading and preprocessing data
data_loading_augmenting_configurations = configurator.generate_data_configurations(preprocessing_options)
resampling_outlier_configurations = configurator.generate_resampling_outlier_configurations(resampling_outlier_options)

print('Running ' + str(len(data_loading_augmenting_configurations) * len(resampling_outlier_configurations)) + ' configurations.')
for inputconfig in data_loading_augmenting_configurations:
    for config in resampling_outlier_configurations:
        print(str(inputconfig) + ' ' + str(config))

Running 1 configurations.
{'data_creation': {'data_creation_method': ['paragraphs', 'sentences'], 'sliding_window_size': [0], 'window_movement': 'default'}, 'vectorization': {'method': 'openai-embedding'}} {'outlier_filtering': {'contamination': 0, 'do_filter': False, 'n_neighbours': 0}, 'resampling': {'do_resample': False, 'n_neighbours': 0, 'resample_before_outlier_filtering': False}}


In [891]:
# Load text data for training
autoware_lv0 = ['./data/autoware/descriptions/level 1', "./data/autoware/code", '.cpp', ['txt'], False,False,'autoware', True, True, True]
autoware_lv1 = ['./data/autoware/descriptions/level 2', "./data/autoware/code", '.cpp', ['txt'], False,False,'autoware', True, True, True]
# True, True, False]

opencv_lv0 = ['./data/opencv/descriptions/level 1', "./data/opencv/code", '.cpp', ['csv'], False, False,'opencv', True, True, True]
opencv_lv1 = ['./data/opencv/descriptions/level 2', "./data/opencv/code", '.cpp', ['csv'], False,False,'opencv', True, True, True]
opencv_lv2 = ['./data/opencv/descriptions/level 3', "./data/opencv/code", '.cpp', ['csv'], False,False,'opencv', True, True, True]
# False, True, True]

rtems_lv0 = ['./data/rtems/descriptions/level 0', "./data/rtems/code", '.c', ['csv'], False,False,'rtems',True, True, True]
rtems_lv1 = ['./data/rtems/descriptions/level 1', "./data/rtems/code", '.c', ['csv'], False,False,'rtems',True, True, True]
rtems_lv2 = ['./data/rtems/descriptions/level 2', "./data/rtems/code", '.c', ['csv'], False,False,'rtems',True, True, True]
rtems_lv3 = ['./data/rtems/descriptions/level 3', "./data/rtems/code", '.c', ['csv'], False,False,'rtems',True, True, True]
# True, True, False]

teammates_lv0 = ['./data/teammates/descriptions/level 1', "./data/teammates/code", '.java', ['txt'], False,False,'teammates', True, True, True]
teammates_lv1 = ['./data/teammates/descriptions/level 2', "./data/teammates/code", '.java', ['csv','txt'], False,False,'teammates', True, True, True]
teammates_lv2 = ['./data/teammates/descriptions/level 3', "./data/teammates/code", '.java', ['csv','txt'], False,False,'teammates', True, True, True]

solr_lv0 = ['./data/solr/descriptions/level 0', "./data/solr/code", '.java', ['csv'], False,False,'solr', True, True, True]
solr_lv1 = ['./data/solr/descriptions/level 1', "./data/solr/code", '.java', ['csv'], False,False,'solr', True, True, True]
solr_lv2 = ['./data/solr/descriptions/level 2', "./data/solr/code", '.java', ['csv'], False,False,'solr', True, True, True]

log4j_lv0 = ['./data/log4j/descriptions/level 0', "./data/log4j/code", '.java', ['csv'], False,False,'solr', True, True, True]
log4j_lv1 = ['./data/log4j/descriptions/level 1', "./data/log4j/code", '.java', ['csv'], False,False,'solr', True, True, True]
log4j_lv2 = ['./data/log4j/descriptions/level 2', "./data/log4j/code", '.java', ['csv'], False,False,'solr', True, True, True]

# True, False, True]

In [None]:
system = autoware_lv1

training_directory = system[0]
src_directory = system[1]
file_extension = system[2] # '.py'#
loading_method = system[3]
do_merge_text = system[4]
do_merge_training_data = system[5]
system_name = system[6]

root = src_directory.split('/')[-1]
use_super_classes = True
use_prob_threshold = False

apply_stemming = system[7]
apply_lemmitization = system[8]
do_remove_stopwords = system[9]

apply_stemming = True
apply_lemmitization = True
do_remove_stopwords = True

do_clean_code = True

In [None]:
def run_classification(training_directory, src_directory, root, file_extension, use_super_classes, loading_method,
    apply_stemming, apply_lemmitization, do_remove_stopwords, do_clean_code, do_merge_text, system_name, do_merge_training_data):

    n_iterations_total = len(data_loading_augmenting_configurations) * len(resampling_outlier_configurations)
    it = 0

    for inputConfig in data_loading_augmenting_configurations:
        try:
            # Instantiation
            data_loader = DataLoader()
            augmenter = Augmenter()
            
            ## Load data
            df_src_functions = MethodsIdentifiersExtractor.extract_methods_identifiers_from_directory(src_directory,system_name, output_file_name=True,file_extension=file_extension,extract_whole_file=False)
            df_src_files = MethodsIdentifiersExtractor.extract_methods_identifiers_from_directory(src_directory,system_name, output_file_name=True,file_extension=file_extension,extract_whole_file=True)
            
            if do_clean_code:
                df_src_functions['Content'] = df_src_functions['Content'].apply(clean_code_snippet)
                df_src_files['Content'] = df_src_files['Content'].apply(clean_code_snippet)
            
            # Load module descriptions
            X_train, y_train = data_loader.load_module_descriptions(training_directory,loading_method,merge_text=do_merge_text)
            
            if do_merge_training_data:
                X_train, y_train = merge_data_with_token_limit(X_train, y_train, max_tokens=256)
            

            ## Pre-processing
            if use_super_classes: # convert labels of subclasses to superclasses
                class_dict = find_parent_classes(src_directory, root)
                #class_dict = find_parent_classes_using_list(full_src_directory, root)
                #print_unmpaped_classes(y_train, class_dict)
                df_src_functions = update_class_names_with_parent_classes(df_src_functions, class_dict)
                df_src_files = update_class_names_with_parent_classes(df_src_files, class_dict)
                y_train = convert_labels_to_parent_classes(y_train, class_dict)

            df_src_functions, df_src_files, X_train = clean_system_and_module_name(df_src_functions, df_src_files, X_train, y_train, system_name)
            data_loader.identify_frequent_tokens(X_train, y_train)

            X_train, y_train = data_loader.clean_text_data(X_train, y_train, remove_stopwords=False, rm_pct=False, remove_frequent=False)
            
            ## Augment data
            X_train, y_train = augmenter.augment_text(
                X_train, y_train, augmenting_methods=inputConfig['data_creation']['data_creation_method'],
                sliding_window_sizes=inputConfig["data_creation"]["sliding_window_size"],
                sliding_window_overlap=inputConfig["data_creation"]["window_movement"],use_resampling=False, upsample_largest_class=None)
            
            ## Final processing
            X_train, y_train = filter_empty_data_points(X_train, y_train)
            X_train, y_train = remove_duplicates(X_train, y_train)
            X_train, y_train = remove_non_strings(X_train, y_train)
            
            df_src_functions = filter_classes_not_in_training_data(df_src_functions, X_train, y_train)
            df_src_files = filter_classes_not_in_training_data(df_src_files, X_train, y_train)
            
            ## Data properties gathering
            df_src_functions['n_NL_words'] = df_src_functions['Content'].apply(count_natural_language_words)
            df_src_files['n_NL_words'] = df_src_files['Content'].apply(count_natural_language_words)

            X_train_tokens = count_openai_tokens(X_train) #tokens_after_augment old
            X_train_total_words = count_words(X_train)        
        
            unique_classes, counts = np.unique(y_train, return_counts=True)
            class_sizes = dict(zip(unique_classes, counts))

            original_n_o_functions = len(df_src_functions['Id'].to_numpy())
            original_n_o_files = len(df_src_files['Id'].to_numpy())

            # Convert the column to numeric (in case there are non-numeric values)
            lines_of_code_functions = pd.to_numeric(df_src_functions['Lines of code'], errors='coerce')
            lines_of_code_files = pd.to_numeric(df_src_files['Lines of code'], errors='coerce')

            n_LoC_functions = lines_of_code_functions.sum()
            n_LoC_mean_functions = lines_of_code_functions.mean()
            n_LoC_median_functions = lines_of_code_functions.median()
            n_LoC_std_functions = lines_of_code_functions.std()
            
            n_LoC_files = lines_of_code_files.sum()
            n_LoC_mean_files = lines_of_code_files.mean()
            n_LoC_median_files = lines_of_code_files.median()
            n_LoC_std_files = lines_of_code_files.std()  


            X_eval_function_content = df_src_functions['Content'].to_numpy() # consider preprocessing method content
            y_eval_function_content = df_src_functions['Class'].to_numpy()
            function_ids = df_src_functions['Id'].to_numpy()

            X_eval_files_content = df_src_files['Content'].to_numpy() # consider preprocessing method content
            y_eval_files_content = df_src_files['Class'].to_numpy()
            file_ids = df_src_files['Id'].to_numpy()

            X_eval_files_content, y_eval_files_content = remove_non_strings(X_eval_files_content, y_eval_files_content)
            X_eval_function_content, y_eval_function_content = remove_non_strings(X_eval_function_content, y_eval_function_content)
            
            X_eval_files_content, y_eval_files_content = data_loader.clean_text_data(X_eval_files_content, y_eval_files_content, remove_stopwords=True, rm_pct=True, remove_frequent=True)
            X_eval_function_content, y_eval_function_content = data_loader.clean_text_data(X_eval_function_content, y_eval_function_content, remove_stopwords=True,  rm_pct=True, remove_frequent=True)
            X_train, y_train = data_loader.clean_text_data(X_train, y_train, remove_stopwords=True, rm_pct=True, remove_frequent=True)

            X_eval_files_content = data_loader.filter_frequent_tokens_test_data(X_eval_files_content)
            X_eval_function_content = data_loader.filter_frequent_tokens_test_data(X_eval_function_content)

            X_train, y_train = filter_empty_data_points(X_train, y_train)

            ## Generate embeddings
            featureExtractor = FeatureExtractor(inputConfig['vectorization']['method'],useLocal=True)
            X_train_embeddings = featureExtractor.get_feature_vectors(X_train)

            if len(X_train_embeddings) != len(X_train):
                print('Lengths X_train and embeddings not equal:', len(X_train),' and ', len(X_train_embeddings),'. Stopping iteration.')
                continue
                #X_train_embeddings = X_train_embeddings[:len(X_train)]

            X_eval_function_content_embeddings = featureExtractor.get_feature_vectors(X_eval_function_content)
            X_eval_files_content_embeddings = featureExtractor.get_feature_vectors(X_eval_files_content)

            # Save data
            saved_data = save_data(X_train, y_train, X_train_embeddings,
                                X_eval_function_content, y_eval_function_content,
                                X_eval_function_content_embeddings, function_ids,
                                X_eval_files_content, y_eval_files_content,
                                X_eval_files_content_embeddings, file_ids)

            for resampling_outlier_config in resampling_outlier_configurations:
                # Load fresh unprocessed data
                (X_train, y_train, X_train_embeddings,
                X_eval_function_content, y_eval_function_content, 
                X_eval_function_content_embeddings, function_ids, 
                X_eval_files_content, y_eval_files_content,
                X_eval_files_content_embeddings, file_ids) = load_data(saved_data)

                #original_classes_of_methods = original_classes_of_methods_saved
                print('Performing iteration ', it+1, ' of ', n_iterations_total)
                it += 1


                if resampling_outlier_config['resampling']['do_resample'] == True:
                    X_train_embeddings, y_train = augmenter.smote_resample(X_train_embeddings, y_train, n_neighbors=resampling_outlier_config['resampling']['n_neighbours'])
                

                unique_classes, counts = np.unique(y_train, return_counts=True)
                class_sizes = dict(zip(unique_classes, counts))

                # Encode labels for training and evaluation
                label_encoder, y_train_encoded, y_eval_functions_encoded, y_eval_files_encoded = encode_labels(y_train, y_eval_function_content, y_eval_files_content)

                for config in machine_learning_configurations:
                    # Instantiate and train the classifier
                    clf = Classifier(method=config["classifier_method"], hyperparameters=config["hyperparameters"])
                    clf.train(X_train_embeddings, y_train_encoded)

                    method_classifier = MethodClassifier(clf)
                    X_eval_file_entropy, X_eval_file_mean_entropy = method_classifier.get_neighbor_entropy(X_train_embeddings, y_train_encoded, X_eval_files_content_embeddings)
                    X_eval_function_entropy, X_eval_function_mean_entropy = method_classifier.get_neighbor_entropy(X_train_embeddings, y_train_encoded, X_eval_function_content_embeddings)

                    X_eval_file_density, X_eval_file_mean_density = method_classifier.get_neighbor_density(X_train_embeddings, X_eval_files_content_embeddings)
                    X_eval_function_density, X_eval_function_mean_density = method_classifier.get_neighbor_density(X_train_embeddings, X_eval_function_content_embeddings)

                    X_eval_function_entropy = pd.DataFrame(X_eval_function_entropy, columns=['Entropy'])
                    X_eval_function_density = pd.DataFrame(X_eval_function_density, columns=['Density'])
                    
                    X_eval_file_entropy = pd.DataFrame(X_eval_file_entropy, columns=['Entropy'])
                    X_eval_file_density = pd.DataFrame(X_eval_file_density, columns=['Density'])
                    
                    if not os.path.exists('results.csv'):
                        max_id = 0
                    else:
                        results_df = pd.read_csv('results.csv')
                        max_id = results_df['ID'].max() if len(results_df) > 0 else 0
                    id = max_id + 1
                    top_n = 1
                    compute_and_save_data_properties(id, 
                                                    X_train,
                                                    X_train_embeddings,
                                                    y_train_encoded, 
                                                    X_eval_function_content, 
                                                    X_eval_function_content_embeddings, 
                                                    y_eval_functions_encoded,                                                 
                                                    training_directory, 
                                                    inputConfig['data_creation']['data_creation_method'])

                    compute_and_save_data_properties(id+top_n, 
                                                    X_train,
                                                    X_train_embeddings,
                                                    y_train_encoded, 
                                                    X_eval_files_content, 
                                                    X_eval_files_content_embeddings, 
                                                    y_eval_files_encoded,                                                 
                                                    training_directory, 
                                                    inputConfig['data_creation']['data_creation_method'])
                    
                    ## Predict functions
                    for N in range(1,top_n+1):
                        #with threadpool_limits(limits=1, user_api='blas'):
                            # Your scikit-learn model prediction code here
                        y_pred_topN_encoded, ids  = method_classifier.predict_method_content(X_eval_function_content_embeddings, function_ids, N=N)

                        #X_eval_function_entropy = pd.DataFrame(X_eval_function_entropy, columns=['Entropy'])
                        #X_eval_function_density = pd.DataFrame(X_eval_function_density, columns=['Density'])

                        # Decode the predicted classes (top N)
                        y_pred_topN = decode_labels(label_encoder, y_pred_topN_encoded)

                        y_eval_temp_list = []
                        
                        # Initialize lists to store metrics
                        lines_of_code_list = []
                        non_generic_terms_list = []
                        unique_non_generic_terms_list = []
                        n_ext_includes_list = []
                        n_comments_list = []
                        n_nl_words_list = []

                        # Loop over each method_id to gather metrics from df_src
                        for method_id in ids:
                            # Get the corresponding row in df_src based on method_id
                            method_row = df_src_functions[df_src_functions['Id'] == method_id]
                            
                            # Extract Lines of Code and number of non-generic terms from df_src
                            lines_of_code = method_row['Lines of code'].values[0]
                            non_generic_terms_count = method_row['n_non_generic_terms'].values[0]
                            unique_non_generic_terms_count = method_row['n_unique_non_generic_terms'].values[0]
                            n_ext_includes = method_row['n_ext_includes'].values[0]
                            n_comments = method_row['n_comments'].values[0]
                            n_nl_words = method_row['n_NL_words'].values[0]

                            # Append the values to the respective lists
                            lines_of_code_list.append(lines_of_code)
                            non_generic_terms_list.append(non_generic_terms_count)
                            unique_non_generic_terms_list.append(unique_non_generic_terms_count)
                            n_ext_includes_list.append(n_ext_includes)
                            n_comments_list.append(n_comments)
                            n_nl_words_list.append(n_nl_words)

                            # Get the true class name
                            class_name = method_row['Class'].values[0]
                            y_eval_temp_list.append(class_name)

                        # Convert lists to DataFrames (1D, since we only have one method name and one true class per method)
                        y_eval_temp = pd.DataFrame(y_eval_temp_list, columns=['True Class'])

                        # Decode the predicted classes (top N)
                        y_pred_topN_df = pd.DataFrame(y_pred_topN, columns=[f'Predicted Class {i+1}' for i in range(N)])

                        # Create DataFrames for the metrics
                        lines_of_code_df = pd.DataFrame(lines_of_code_list, columns=['Lines of code'])
                        non_generic_terms_df = pd.DataFrame(non_generic_terms_list, columns=['Non-generic terms'])
                        unique_non_generic_terms_df = pd.DataFrame(non_generic_terms_list, columns=['Unique non-generic terms'])
                        n_ext_includes_df = pd.DataFrame(n_ext_includes_list, columns =['n_ext_includes'])
                        n_comments_df = pd.DataFrame(n_comments_list, columns =['n_comments'])
                        n_nl_words_df = pd.DataFrame(n_nl_words_list, columns =['n_NL_words'])

                        # Concatenate to form the final DataFrame
                        predictions_top_N_df = pd.concat([y_pred_topN_df, y_eval_temp, X_eval_function_entropy, X_eval_function_density, lines_of_code_df, non_generic_terms_df, unique_non_generic_terms_df, n_ext_includes_df, n_comments_df,n_nl_words_df], axis=1)

                        # Evaluate the top N predictions
                        metrics_top_N, predictions_top_N_df = evaluate_topN_predictions(predictions_top_N_df, original_n_o_functions, N)

                        # Save results
                        save_results(datetime.datetime.now(), len(label_encoder.classes_)-1, inputConfig, resampling_outlier_config,
                                    config['classifier_method'], config['hyperparameters'], metrics_top_N, training_directory,
                                    src_directory, predictions_top_N_df, N, X_train_tokens, apply_stemming, apply_lemmitization, do_clean_code, do_remove_stopwords,
                                    class_sizes, original_n_o_functions,n_LoC_functions,n_LoC_mean_functions,n_LoC_median_functions,n_LoC_std_functions, X_eval_function_mean_entropy, X_eval_function_mean_density, do_map_files=False, do_merge_training_data=do_merge_training_data)
                        
                    ## Predict files
                    for N in range(1,top_n+1):
                        #with threadpool_limits(limits=1, user_api='blas'):
                            # Your scikit-learn model prediction code here
                        y_pred_topN_encoded, ids = method_classifier.predict_method_content(X_eval_files_content_embeddings, file_ids, N=N)
                        
                        # Decode the predicted classes (top N)
                        y_pred_topN = decode_labels(label_encoder, y_pred_topN_encoded)

                        y_eval_temp_list = []
                        
                        # Initialize lists to store metrics
                        lines_of_code_list = []
                        non_generic_terms_list = []
                        unique_non_generic_terms_list = []
                        n_ext_includes_list = []
                        n_comments_list = []
                        n_nl_words_list = []

                        # Loop over each method_id to gather metrics from df_src
                        for method_id in ids:
                            # Get the corresponding row in df_src based on method_id
                            method_row = df_src_files[df_src_files['Id'] == method_id]
                            
                            # Extract Lines of Code and number of non-generic terms from df_src
                            lines_of_code = method_row['Lines of code'].values[0]
                            non_generic_terms_count = method_row['n_non_generic_terms'].values[0]
                            unique_non_generic_terms_count = method_row['n_unique_non_generic_terms'].values[0]
                            n_ext_includes = method_row['n_ext_includes'].values[0]
                            n_comments = method_row['n_comments'].values[0]
                            n_nl_words = method_row['n_NL_words'].values[0]

                            # Append the values to the respective lists
                            lines_of_code_list.append(lines_of_code)
                            non_generic_terms_list.append(non_generic_terms_count)
                            unique_non_generic_terms_list.append(unique_non_generic_terms_count)
                            n_ext_includes_list.append(n_ext_includes)
                            n_comments_list.append(n_comments)
                            n_nl_words_list.append(n_nl_words)

                            # Get the true class name
                            class_name = method_row['Class'].values[0]
                            y_eval_temp_list.append(class_name)

                        # Convert lists to DataFrames (1D, since we only have one method name and one true class per method)
                        y_eval_temp = pd.DataFrame(y_eval_temp_list, columns=['True Class'])

                        # Decode the predicted classes (top N)
                        y_pred_topN_df = pd.DataFrame(y_pred_topN, columns=[f'Predicted Class {i+1}' for i in range(N)])

                        # Create DataFrames for the metrics
                        lines_of_code_df = pd.DataFrame(lines_of_code_list, columns=['Lines of code'])
                        non_generic_terms_df = pd.DataFrame(non_generic_terms_list, columns=['Non-generic terms'])
                        unique_non_generic_terms_df = pd.DataFrame(unique_non_generic_terms_list, columns=['Unique non-generic terms'])
                        n_ext_includes_df = pd.DataFrame(n_ext_includes_list, columns =['n_ext_includes'])
                        n_comments_df = pd.DataFrame(n_comments_list, columns =['n_comments'])
                        n_nl_words_df = pd.DataFrame(n_nl_words_list, columns =['n_NL_words'])

                        # Concatenate to form the final DataFrame
                        predictions_top_N_df = pd.concat([y_pred_topN_df, y_eval_temp, X_eval_file_entropy, X_eval_file_density, lines_of_code_df, non_generic_terms_df, unique_non_generic_terms_df, n_ext_includes_df, n_comments_df, n_nl_words_df], axis=1)

                        # Evaluate the top N predictions
                        metrics_top_N, predictions_top_N_df = evaluate_topN_predictions(predictions_top_N_df, original_n_o_files, N)

                        # Save results
                        save_results(datetime.datetime.now(), len(label_encoder.classes_)-1, inputConfig, resampling_outlier_config,
                                    config['classifier_method'], config['hyperparameters'], metrics_top_N, training_directory,
                                    src_directory, predictions_top_N_df, N, X_train_tokens, apply_stemming, apply_lemmitization, do_clean_code, do_remove_stopwords,
                                    class_sizes,  
                                    original_n_o_files,n_LoC_files,n_LoC_mean_files, n_LoC_median_files, n_LoC_std_files, X_eval_function_mean_entropy, X_eval_function_mean_density, do_map_files=True, do_merge_training_data=do_merge_training_data)

            del X_train, y_train, X_train_embeddings

        except MemoryError as e:
            print('MemoryError:','inputconfig:',inputConfig, '. Directory: ', training_directory,'. Error: ',e)
            continue
        except ValueError as e:
            print('ValueError:','inputconfig:',inputConfig, '. Directory: ', training_directory,'. Error: ',e)
            continue

In [None]:

"""
autoware_file = ['./data/autoware/descriptions/level 1', "./data/autoware/code", 'cpp', ['txt'], False,True,False,'autoware']
autoware_file_lv2 = ['./data/autoware/descriptions/level 2', "./data/autoware/code", 'cpp', ['txt'], False,True,False,'autoware']

opencv_file = ['./data/opencv/descriptions/level 1', "./data/opencv/code", 'cpp', ['csv'], True,True, False,'opencv']
opencv_file_lv2 = ['./data/opencv/descriptions/level 2', "./data/opencv/code", 'cpp', ['csv'], True,True,False,'opencv']
opencv_file_lv3 = ['./data/opencv/descriptions/level 3', "./data/opencv/code", 'cpp', ['csv'], True,True,False,'opencv']

rtems_file_lv0 = ['./data/rtems/descriptions/level 0', "./data/rtems/code", 'c', ['csv'], False,True,False,'rtems']
rtems_file = ['./data/rtems/descriptions/level 1', "./data/rtems/code", 'c', ['csv'], False,True,False,'rtems']
rtems_file_lv2 = ['./data/rtems/descriptions/level 2', "./data/rtems/code", 'c', ['csv'], False,True,False,'rtems']
rtems_file_lv3 = ['./data/rtems/descriptions/level 3', "./data/rtems/code", 'c', ['csv'], False,True,False,'rtems']

teammates_file = ['./data/teammates/descriptions/level 1', "./data/teammates/code", 'java', ['txt'], False,True,False,'teammates']
teammates_file_lv2 = ['./data/teammates/descriptions/level 2', "./data/teammates/code", 'java', ['csv','txt'], False,True,False,'teammates']
teammates_file_lv3 = ['./data/teammates/descriptions/level 3', "./data/teammates/code", 'java', ['csv','txt'], False,True,False,'teammates']
"""
targetSystems = [autoware_lv0, autoware_lv1, opencv_lv0, opencv_lv1, opencv_lv2, rtems_lv0, rtems_lv1, rtems_lv2, rtems_lv3, teammates_lv0, teammates_lv1, teammates_lv2, solr_lv0, solr_lv1, solr_lv2, log4j_lv0, log4j_lv1, log4j_lv2] #  
targetSystems = [log4j_lv0, log4j_lv1, log4j_lv2]

targetSystems_test = [autoware_lv0, opencv_lv0, rtems_lv0, teammates_lv0]
targetSystems_test_big = [opencv_lv2, rtems_lv3, teammates_lv2, autoware_lv1] #, autoware_lv1
use_super_classes = True


true_false_opts = [False, True]

for system in targetSystems:
    training_directory = system[0]
    src_directory = system[1]
    file_extension = system[2]
    loading_method = system[3]
    do_merge_text = system[4]
    do_merge_training_data = False #system[5]
    system_name = system[6]

    do_stem = system[7]
    do_lemmitize = system[8]
    do_remove_stopwords = system[9]
    do_clean_code = True

    root = src_directory.split('/')[-1]
    print('Running system: ', system)
    #run_prompt_classification(training_directory, src_directory, root, file_extension, use_super_classes, loading_method, do_stem, do_lemmitize, False, do_merge_text, system_name, fileMapping)
    run_classification(training_directory, src_directory, root, file_extension, use_super_classes, loading_method, 
                                do_stem, do_lemmitize, do_remove_stopwords, do_clean_code, do_merge_text, system_name, do_merge_training_data)


Running system:  ['./data/log4j/descriptions/level 0', './data/log4j/code', '.java', ['csv'], False, False, 'solr', True, True, True]


Fetching embeddings: 100%|██████████| 747/747 [00:11<00:00, 65.60it/s]
Fetching embeddings: 100%|██████████| 3331/3331 [00:45<00:00, 73.96it/s]
Fetching embeddings: 100%|██████████| 716/716 [01:00<00:00, 11.77it/s]


Performing iteration  1  of  1
Running system:  ['./data/log4j/descriptions/level 1', './data/log4j/code', '.java', ['csv'], False, False, 'solr', True, True, True]


Fetching embeddings: 100%|██████████| 1876/1876 [00:26<00:00, 70.54it/s]
Fetching embeddings: 100%|██████████| 3331/3331 [00:42<00:00, 78.88it/s]
Fetching embeddings: 100%|██████████| 716/716 [00:58<00:00, 12.25it/s]


Performing iteration  1  of  1
Running system:  ['./data/log4j/descriptions/level 2', './data/log4j/code', '.java', ['csv'], False, False, 'solr', True, True, True]


Fetching embeddings: 100%|██████████| 9961/9961 [02:07<00:00, 78.03it/s]
Fetching embeddings: 100%|██████████| 3331/3331 [00:40<00:00, 81.69it/s] 
Fetching embeddings: 100%|██████████| 716/716 [00:58<00:00, 12.17it/s]


Performing iteration  1  of  1
