In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

def select_elements(file_name, num_clusters, mutants):
    # Read the file and process lines in pairs
    with open(file_name, 'r') as file:
        lines = file.readlines()
        categories = []
        values = []
        for i in range(0, len(lines), 2):
            # Remove whitespace and split categories by space
            cats = lines[i].strip().split()
            # Add categories to the main list
            categories.extend(cats)
            # Add associated value if lines are available
            if i + 1 < len(lines):
                values.append(float(lines[i+1].strip()))

    # Get unique categories
    categories = list(set(categories))

    # Create a dictionary to store binary variables and 'Length'
    data = {cat: [] for cat in categories}
    data['Length'] = []

    # Process categories and create binary variables
    for i in range(len(values)):
        for cat in categories:
            if cat in lines[2*i]:
                data[cat].append(1)
            else:
                data[cat].append(0)
        data['Length'].append(values[i] if i < len(values) else None)
    
    for num in range(mutants+1):
        col_name = str(num)
        if col_name not in data:
            data[col_name] = [0] * len(values)
    
    

    # Create DataFrame
    df = pd.DataFrame(data)

    # Normalize data (except the 'Length' column)
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df.drop(columns=['Length']))

    # Apply hierarchical clustering
    cluster = AgglomerativeClustering(n_clusters=num_clusters, metric='euclidean', linkage='ward')
    clusters = cluster.fit_predict(df_scaled)

    # Create a new DataFrame with all variables and the cluster they belong to
    df_with_cluster = df.copy()
    df_with_cluster['Cluster'] = clusters

    # Calculate total length
    total_length = sum(df_with_cluster['Length'])

    # Initialize accumulated length and selected elements
    accumulated_length = 0
    selected_indices = []

    # Select one element from each cluster
    for cluster_id in range(num_clusters):
        cluster_elements = df_with_cluster[df_with_cluster['Cluster'] == cluster_id]
        if len(cluster_elements) > 0:
            smallest_element = cluster_elements['Length'].idxmin()
            accumulated_length += df_with_cluster.loc[smallest_element]['Length']
            # Add the smallest element
            selected_indices.append(smallest_element)

    # Filter the original DataFrame to get the selected elements
    selected_elements = df_with_cluster.loc[selected_indices]

    # Calculate the total number of mutants in all combined columns
    total_mutant_deaths = selected_elements.iloc[:, :-2].sum(axis=0)

    # Calculate the total number of mutants "killed" at least once in any test
    total_mutants_killed = (total_mutant_deaths > 0).sum()

    # Calculate the percentage of mutants killed by any test
    percentage_mutants_killed = (total_mutants_killed / len(total_mutant_deaths)) * 100

    print("Total number of mutants 'killed' at least once:", total_mutants_killed)
    print("Percentage of mutants 'killed' by any test:", percentage_mutants_killed)

    return 100-percentage_mutants_killed, accumulated_length 



In [2]:
# Example usage
for file in range(0,9):
    file_name = 'inputs/Elems_' + str(file) + '.txt'
    mutants_name = 'inputs/Muts_' + str(file) + '.txt'
    with open(mutants_name, 'r') as muts_file:
        mutants_size = int(muts_file.readline().strip('\n'))
    for num_clusters in range(2,31):        
        ms, length = select_elements(file_name, num_clusters, mutants_size)
        print(length, ";", ms)
    print()

Total number of mutants 'killed' at least once: 60
Percentage of mutants 'killed' by any test: 14.962593516209477
6.0 ; 85.03740648379052
Total number of mutants 'killed' at least once: 111
Percentage of mutants 'killed' by any test: 27.680798004987533
12.0 ; 72.31920199501246
Total number of mutants 'killed' at least once: 171
Percentage of mutants 'killed' by any test: 42.643391521197
22.0 ; 57.356608478803
Total number of mutants 'killed' at least once: 208
Percentage of mutants 'killed' by any test: 51.87032418952619
32.0 ; 48.12967581047381
Total number of mutants 'killed' at least once: 242
Percentage of mutants 'killed' by any test: 60.349127182044896
41.0 ; 39.650872817955104
Total number of mutants 'killed' at least once: 269
Percentage of mutants 'killed' by any test: 67.08229426433915
48.0 ; 32.917705735660846
Total number of mutants 'killed' at least once: 274
Percentage of mutants 'killed' by any test: 68.3291770573566
56.0 ; 31.670822942643397
Total number of mutants 'kil