# Compression Testing

In [None]:
!python3 -m pip install -q --upgrade pip

import numpy as np
import pandas as pd
import scipy.sparse
from scipy.sparse import coo_matrix
import pickle
!pip install zarr -q
import zarr
import gzip
import io
import os
import re
!pip install tables -q
!pip install pyarrow -q
import pyarrow as pa
import pyarrow.parquet as pq
import shutil
!pip install zstandard -q
import zstandard as zstd
import scipy.sparse as sp

**Initialization**

In [None]:
# Check if we are in Google Colab (google.colab will not be available locally)
try:
    import google.colab
    is_colab = True
except ImportError:
    is_colab = False

if is_colab:
    from google.colab import drive
    drive.mount('/content/drive')

    # Define source and destination paths
    source_path = '/content/drive/MyDrive/FH Oberösterreich/Master Thesis/Data'
    destination_path = '/content/Data'

    # Copy the entire directory
    shutil.copytree(source_path, destination_path, dirs_exist_ok=True)

In [None]:
def load_sparse_3d_array(filename, precision=None):
    # Load the saved sparse data
    data = np.loadtxt(filename)
    indices = data[:, 0].astype(int)
    values = data[:, 1]

    # Apply precision if specified
    if precision is not None:
        values = np.round(values, decimals=precision)

    # Extract shape from the file using regular expression
    with open(filename, 'r') as f:
        for line in f:
            match = re.search(r"# shape:\s*\((.*?)\)", line)
            if match:
                shape = tuple(map(int, match.group(1).split(",")))

    # Reconstruct the full flattened array with zeros
    flat_array = np.zeros(np.prod(shape), dtype=values.dtype)
    flat_array[indices] = values

    # Reshape back to original 3D shape
    return flat_array.reshape(shape)

# Convert sparse matrix to DataFrame
def sparse_array_to_dataframe(sparse_array):
    # Convert sparse CSR matrix to COO format for easy DataFrame creation
    sparse_coo = sparse_array.tocoo()

    # Create DataFrame with 'row', 'col', and 'value' columns
    df = pd.DataFrame({
        'row': sparse_coo.row,
        'col': sparse_coo.col,
        'value': sparse_coo.data
    })

    return df

# Convert dense array to DataFrame
def dense_array_to_dataframe(dense_array):
    df = pd.DataFrame(dense_array.flatten(), columns=['value']).reset_index()
    df['row'], df['col'] = np.divmod(df['index'].values, dense_array.shape[1])
    df = df.drop(columns=['index'])
    return df

# Add Zarr Saving and Size Calculation
def save_dataframe_to_zarr(df, filename):
    store = zarr.DirectoryStore(filename)
    group = zarr.group(store)

    # Check if the path 'data' exists and overwrite
    if 'data' in group:
        del group['data']  # Remove existing array to avoid error

    group.array('data',
                df.to_numpy(),
                compressor=zarr.Blosc(cname='zstd', clevel=5, shuffle=zarr.Blosc.SHUFFLE),
                overwrite=True)  # Allow overwriting

    return store

def compress_dataframe_parquet(df, compressionType):
    # Serialize DataFrame to Parquet format in memory with built-in compression
    buffer = io.BytesIO()
    pq.write_table(pa.Table.from_pandas(df), buffer, compression=compressionType)  # Parquet's own efficient compression

    return buffer.getvalue()

# Function to compress DataFrame using pickle and gzip
def compress_dataframe(df):
    df = df.astype(pd.SparseDtype("float32", np.nan))
    return gzip.compress(pickle.dumps(df))

def compress_dataframe_parquet_gzip(df, compressionType):
    # Serialize DataFrame to Parquet format in memory with built-in compression
    buffer = io.BytesIO()
    pq.write_table(pa.Table.from_pandas(df), buffer, compression=compressionType)  # Parquet's own efficient compression

    # Further compress with gzip
    return gzip.compress(buffer.getvalue())

def print_reduction(original_size, compressed_size, format_name):
    reduction = (original_size - compressed_size) / original_size * 100
    print(f"Reduction with {format_name}: {reduction:.2f}%")

# Test between Sparse and Dense Array Compression

In [None]:
# Convert a dense array for comparison
dense_array = load_sparse_3d_array('Data/Wiki_SparseArray_0.txt')
df_dense = dense_array_to_dataframe(dense_array)

# Reshape the dense array to 2D (if it's 3D, flatten it accordingly)
dense_array_2d = dense_array.reshape(-1, dense_array.shape[-1])  # Flatten to 2D if necessary

# Convert the 2D array to a sparse matrix using CSR format
sparse_array = scipy.sparse.csr_matrix(dense_array_2d)
df_sparse = sparse_array_to_dataframe(sparse_array)

for df, array_type in zip([df_sparse, df_dense], ['Sparse Array', 'Dense Array']):
    print(f"\n--- Processing {array_type} ---")

    output_dir = os.path.expanduser('~/Temp')
    os.makedirs(output_dir, exist_ok=True)

    # Calculate original DataFrame size in memory
    original_size = df.memory_usage(deep=True).sum()

    # Parquet
    parquet_gzip_filename = os.path.join(output_dir, f'data_{array_type.lower().replace(" ", "_")}_gzip.parquet')
    df.to_parquet(parquet_gzip_filename, compression='gzip')
    parquet_gzip_size = os.path.getsize(parquet_gzip_filename)
    os.remove(parquet_gzip_filename)

    # HDF5
    hdf5_filename = os.path.join(output_dir, f'data_{array_type.lower().replace(" ", "_")}.h5')
    with pd.HDFStore(hdf5_filename, mode='w', complib='blosc') as store:
        store.put('dataset', df, format='table', data_columns=True)
    hdf5_size = os.path.getsize(hdf5_filename)
    os.remove(hdf5_filename)

    # CSV
    csv_filename = os.path.join(output_dir, f'data_{array_type.lower().replace(" ", "_")}.csv.gz')
    df.to_csv(csv_filename, index=False, compression='gzip')
    csv_size = os.path.getsize(csv_filename)
    os.remove(csv_filename)

    # Zarr
    zarr_filename = os.path.join(output_dir, f'data_{array_type.lower().replace(" ", "_")}.zarr')
    save_dataframe_to_zarr(df, zarr_filename)
    zarr_size = sum(os.path.getsize(os.path.join(dirpath, file)) for dirpath, _, files in os.walk(zarr_filename) for file in files)
    shutil.rmtree(zarr_filename)

    shutil.rmtree(output_dir)

    # Pickle + Compression Sizes
    pickle_size = len(compress_dataframe(df))
    snappy_size = len(compress_dataframe_parquet(df, 'snappy'))
    zstd_size = len(compress_dataframe_parquet(df, 'zstd'))
    snappy_gzip_size = len(compress_dataframe_parquet_gzip(df, 'snappy'))
    zstd_gzip_size = len(compress_dataframe_parquet_gzip(df, 'zstd'))

    # Output Results
    print(f"Original size (bytes): {original_size}")
    print(f"Parquet (gzip) size (bytes): {parquet_gzip_size}")
    print(f"HDF5 size (bytes): {hdf5_size}")
    print(f"CSV size (bytes): {csv_size}")
    print(f"Zarr size (bytes): {zarr_size}")
    print(f"Parquet (Snappy) size (bytes): {snappy_size}")
    print(f"Parquet (ZSTD) size (bytes): {zstd_size}")
    print(f"Pickle size (bytes): {pickle_size}")
    print(f"Pickle with Parquet (Snappy) size (bytes): {snappy_gzip_size}")
    print(f"Pickle with Parquet (ZSTD) size (bytes): {zstd_gzip_size}")

    # Print Reductions
    print_reduction(original_size, parquet_gzip_size, 'Parquet (gzip)')
    print_reduction(original_size, hdf5_size, 'HDF5')
    print_reduction(original_size, csv_size, 'CSV')
    print_reduction(original_size, snappy_size, 'Snappy')
    print_reduction(original_size, zstd_size, 'ZSTD')
    print_reduction(original_size, zarr_size, 'Zarr')
    print_reduction(original_size, pickle_size, 'Pickle')
    print_reduction(original_size, snappy_gzip_size, 'Parquet & GZIP (Snappy)')
    print_reduction(original_size, zstd_gzip_size, 'Parquet & GZIP (ZSTD)')
    break


--- Processing Sparse Array ---
Original size (bytes): 3400896
Parquet (gzip) size (bytes): 1668160
HDF5 size (bytes): 6975322
CSV size (bytes): 1341197
Zarr size (bytes): 1492893
Parquet (Snappy) size (bytes): 2147925
Parquet (ZSTD) size (bytes): 1829172
Pickle size (bytes): 1775284
Pickle with Parquet (Snappy) size (bytes): 1652950
Pickle with Parquet (ZSTD) size (bytes): 1788891
Reduction with Parquet (gzip): 50.95%
Reduction with HDF5: -105.10%
Reduction with CSV: 60.56%
Reduction with Snappy: 36.84%
Reduction with ZSTD: 46.21%
Reduction with Zarr: 56.10%
Reduction with Pickle: 47.80%
Reduction with Parquet & GZIP (Snappy): 51.40%
Reduction with Parquet & GZIP (ZSTD): 47.40%


# Compression with different normalizations

**1. Integer Normalization**

In [None]:
def process_array_compression(df_dense, df_sparse, fixed=True):
    # Iterate over the dense and sparse arrays
    for df, array_type in zip([df_sparse, df_dense], ['Sparse Array', 'Dense Array']):
        print(f"\n--- Processing {array_type} ---")

        # Test with different integer types
        for int_type in ['uint8', 'int8', 'uint16', 'int16', 'uint32', 'int32']:
            print(f"\n--- Testing with {int_type} ---")

            # Output directory
            output_dir = os.path.expanduser('~/Temp')
            os.makedirs(output_dir, exist_ok=True)

            # Normalize the data
            if fixed:
                normalized_data, denormalized_data = normalize_to_integer(df.values, int_type)
            else:
                normalized_data, denormalized_data = normalize_dynamic_ranges(df.values, int_type)

            # Convert the normalized data back into a DataFrame to be saved
            df_normalized = pd.DataFrame(normalized_data, columns=df.columns)

            # Calculate the MAE and MSE
            mae, mse = calculate_loss(df.values, denormalized_data)

            # Output the MAE and MSE
            print(f"MAE: {mae}")
            print(f"MSE: {mse}")

            # Calculate original DataFrame size in memory
            original_size = df.memory_usage(deep=True).sum()

            # Step 1: Save DataFrame to Parquet with gzip compression
            parquet_gzip_filename = os.path.join(output_dir, f'data_{array_type.lower().replace(" ", "_")}_gzip.parquet')
            df_normalized.to_parquet(parquet_gzip_filename, compression='gzip')
            parquet_gzip_size = os.path.getsize(parquet_gzip_filename)

            # Remove the Parquet file after processing
            os.remove(parquet_gzip_filename)

            # Step 2: Save DataFrame to HDF5 with gzip compression
            hdf5_filename = os.path.join(output_dir, f'data_{array_type.lower().replace(" ", "_")}.h5')
            with pd.HDFStore(hdf5_filename, mode='w', complib='blosc') as store:
                store.put('dataset', df_normalized, format='table', data_columns=True)
            hdf5_size = os.path.getsize(hdf5_filename)

            # Remove the HDF5 file after processing
            os.remove(hdf5_filename)

            # Step 3: Save DataFrame to CSV (compressed)
            csv_filename = os.path.join(output_dir, f'data_{array_type.lower().replace(" ", "_")}.csv.gz')
            df_normalized.to_csv(csv_filename, index=False, compression='gzip')
            csv_size = os.path.getsize(csv_filename)

            # Remove the CSV file after processing
            os.remove(csv_filename)

            # Remove the output directory after processing
            shutil.rmtree(output_dir)

            # Pickle + Compression Sizes
            pickle_size = len(compress_dataframe(df_normalized))
            snappy_size = len(compress_dataframe_parquet(df_normalized, 'snappy'))
            zstd_size = len(compress_dataframe_parquet(df_normalized, 'zstd'))
            snappy_gzip_size = len(compress_dataframe_parquet_gzip(df_normalized, 'snappy'))
            zstd_gzip_size = len(compress_dataframe_parquet_gzip(df_normalized, 'zstd'))

            # Output Results
            print(f"Original size (bytes): {original_size}")
            print(f"Parquet (gzip) size (bytes): {parquet_gzip_size}")
            print(f"HDF5 size (bytes): {hdf5_size}")
            print(f"CSV size (bytes): {csv_size}")
            print(f"Zarr size (bytes): {zarr_size}")
            print(f"Parquet (Snappy) size (bytes): {snappy_size}")
            print(f"Parquet (ZSTD) size (bytes): {zstd_size}")
            print(f"Pickle size (bytes): {pickle_size}")
            print(f"Pickle with Parquet (Snappy) size (bytes): {snappy_gzip_size}")
            print(f"Pickle with Parquet (ZSTD) size (bytes): {zstd_gzip_size}")

            # Print Reductions
            print_reduction(original_size, parquet_gzip_size, 'Parquet (gzip)')
            print_reduction(original_size, hdf5_size, 'HDF5')
            print_reduction(original_size, csv_size, 'CSV')
            print_reduction(original_size, snappy_size, 'Snappy')
            print_reduction(original_size, zstd_size, 'ZSTD')
            print_reduction(original_size, zarr_size, 'Zarr')
            print_reduction(original_size, pickle_size, 'Pickle')
            print_reduction(original_size, snappy_gzip_size, 'Parquet & GZIP (Snappy)')
            print_reduction(original_size, zstd_gzip_size, 'Parquet & GZIP (ZSTD)')
        break

def calculate_loss(original_data, denormalized_data):
    # Calculate the absolute difference between the original and denormalized data
    loss = np.abs(original_data - denormalized_data)

    # Calculate the Mean Absolute Error (MAE)
    mae = np.mean(loss)

    # Calculate the percentage loss in terms of the range of the original data
    max_val = np.max(original_data)
    min_val = np.min(original_data)
    range_val = max_val - min_val
    loss_percentage = (mae / range_val) * 100

    return mae, loss_percentage

# Define integer ranges based on the selected type
int_ranges = {
    'uint8': (0, 255),
    'int8': (-128, 127),
    'uint16': (0, 65535),
    'int16': (-32768, 32767),
    'uint32': (0, 4294967295),
    'int32': (-2147483648, 2147483647)
}

**1.1 Fixed Ranges**

In [None]:
def normalize_to_integer(data, int_type='uint8'):
    if int_type not in int_ranges:
        raise ValueError("Unsupported integer type. Choose from 'uint8', 'int8', 'uint16', 'int16', 'uint32', 'int32'.")

    min_int, max_int = int_ranges[int_type]

    # Calculate min and max of the data
    min_val = np.min(data)
    max_val = np.max(data)

    # Normalize data to integer range
    normalized = np.round((data - min_val) * (max_int - min_int) / (max_val - min_val) + min_int).astype(int)

    # Denormalize back to the original float range for comparison
    denormalized = (normalized - min_int) * (max_val - min_val) / (max_int - min_int) + min_val

    return normalized, denormalized

process_array_compression(df_dense, df_sparse, fixed=True)


--- Processing Sparse Array ---

--- Testing with uint8 ---
MAE: 36.33142398571479
MSE: 0.072276918333444
Original size (bytes): 3400896
Parquet (gzip) size (bytes): 3836
HDF5 size (bytes): 7238937
CSV size (bytes): 4554
Zarr size (bytes): 1492893
Parquet (Snappy) size (bytes): 4811
Parquet (ZSTD) size (bytes): 3767
Pickle size (bytes): 892268
Pickle with Parquet (Snappy) size (bytes): 2553
Pickle with Parquet (ZSTD) size (bytes): 2388
Reduction with Parquet (gzip): 99.89%
Reduction with HDF5: -112.85%
Reduction with CSV: 99.87%
Reduction with Snappy: 99.86%
Reduction with ZSTD: 99.89%
Reduction with Zarr: 56.10%
Reduction with Pickle: 73.76%
Reduction with Parquet & GZIP (Snappy): 99.92%
Reduction with Parquet & GZIP (ZSTD): 99.93%

--- Testing with int8 ---
MAE: 36.33142398571479
MSE: 0.072276918333444
Original size (bytes): 3400896
Parquet (gzip) size (bytes): 3842
HDF5 size (bytes): 7239970
CSV size (bytes): 8222
Zarr size (bytes): 1492893
Parquet (Snappy) size (bytes): 4809
Parqu

**Dynamic Ranges**

In [None]:
def normalize_dynamic_ranges(data, int_type='uint8'):
    if int_type not in int_ranges:
        raise ValueError("Unsupported integer type. Choose from 'uint8', 'int8', 'uint16', 'int16', 'uint32', 'int32'.")

    min_int, max_int = int_ranges[int_type]

    # Get the dynamic ranges split points
    min_val = np.min(data)
    max_val = np.max(data)

    # Define split points based on your logic (-1, 0, 1)
    split_points = [-1, 0, 1]

    # Create a placeholder for normalized data
    normalized_data = np.zeros_like(data, dtype=int)

    def normalize_range(sub_data, start, end):
        # Normalize within the specific range
        return np.round((sub_data - start) * (max_int - min_int) / (end - start) + min_int).astype(int)

    # Apply normalization for each dynamic range
    for start, end in zip([min_val] + split_points, split_points + [max_val]):
        mask = (data >= start) & (data < end)
        normalized_data[mask] = normalize_range(data[mask], start, end)

    # Denormalize back to original range for comparison
    denormalized_data = np.zeros_like(data, dtype=float)
    for start, end in zip([min_val] + split_points, split_points + [max_val]):
        mask = (normalized_data >= min_int) & (normalized_data <= max_int)
        denormalized_data[mask] = (normalized_data[mask] - min_int) * (end - start) / (max_int - min_int) + start

    return normalized_data, denormalized_data

process_array_compression(df_dense, df_sparse, fixed=False)


--- Processing Sparse Array ---

--- Testing with uint8 ---
MAE: 8425.839175954501
MSE: 16.762175087071945
Original size (bytes): 3400896
Parquet (gzip) size (bytes): 180849
HDF5 size (bytes): 7399018
CSV size (bytes): 345732
Zarr size (bytes): 1492893
Parquet (Snappy) size (bytes): 216945
Parquet (ZSTD) size (bytes): 157202
Pickle size (bytes): 1143634
Pickle with Parquet (Snappy) size (bytes): 181209
Pickle with Parquet (ZSTD) size (bytes): 155714
Reduction with Parquet (gzip): 94.68%
Reduction with HDF5: -117.56%
Reduction with CSV: 89.83%
Reduction with Snappy: 93.62%
Reduction with ZSTD: 95.38%
Reduction with Zarr: 56.10%
Reduction with Pickle: 66.37%
Reduction with Parquet & GZIP (Snappy): 94.67%
Reduction with Parquet & GZIP (ZSTD): 95.42%

--- Testing with int8 ---
MAE: 8425.760053311724
MSE: 16.762017682264673
Original size (bytes): 3400896
Parquet (gzip) size (bytes): 180921
HDF5 size (bytes): 7400076
CSV size (bytes): 360959
Zarr size (bytes): 1492893
Parquet (Snappy) size 

# Evaluation Testing

In [None]:
import requests
import urllib.request
!pip install -q stanza
import stanza
# Suppress logging from stanza
nlp = stanza.Pipeline('en', verbose=False)
stanza.download('en', verbose=False)
# Compute cosine similarity
from numpy import dot
from numpy.linalg import norm
from collections import defaultdict
from scipy.stats import spearmanr
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

**Create train set**

In [None]:
def createTrainSet():
    global sentences, sentencesStructure

    file_path = "TheVerdict.txt"
    url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode('utf-8')
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    # Remove any newline characters and extra spaces
    text_data = text_data.replace("\n", " ").strip()

    # Process the text with Stanza to extract sentences
    doc = nlp(text_data)
    sentencesStructure = [[sentence.text for sentence in doc.sentences]]

    # Flatten the list of sentences correctly
    sentences = [sentence for sublist in sentencesStructure for sentence in sublist]

    print(f"Created a train set with {len(sentences)} sentences")
    return sentences

#sentences = createTrainSet()

In [None]:
# Wikipedia API endpoint for querying
WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"

def fetch_article_titles(category):
    """Fetch a list of article titles from a Wikipedia category."""
    titles = []
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmtitle': f'Category:{category}',
        'cmlimit': 'max'  # Get the maximum number of articles
    }
    response = requests.get(WIKIPEDIA_API_URL, params=params).json()
    for page in response.get('query', {}).get('categorymembers', []):
        titles.append(page['title'])
    return titles

def fetch_and_parse_content(title):
    """Fetch the content of a Wikipedia article and parse it."""
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'prop': 'extracts',
        'explaintext': True
    }
    response = requests.get(WIKIPEDIA_API_URL, params=params).json()
    pages = response.get('query', {}).get('pages', {})
    page = next(iter(pages.values()))
    content = page.get('extract', '')

    return content

def clean_wikipedia_content(sentences):
    cleaned_sentences = []
    for sentence in sentences:
        # Remove references, citations, and template placeholders
        sentence = re.sub(r'\[\[File:.*?\]\]', '', sentence)  # Remove file/image links (e.g., [[File:Image.jpg]])
        sentence = re.sub(r'\[\[.*?\|.*?\]\]', '', sentence)  # Remove internal links with aliases (e.g., [[Link|Alias]])
        sentence = re.sub(r'\[\[.*?\]\]', '', sentence)      # Remove internal links (e.g., [[Link]])

        sentence = re.sub(r'\{.*?\}', '', sentence)      # Remove templates (e.g., {{Citation needed}} or any template)
        sentence = re.sub(r'==+.*?==+', '', sentence)    # Remove headings (e.g., == Heading ==)
        sentence = re.sub(r'===+.*?===+', '', sentence)  # Remove subheadings (e.g., === Subheading ===)
        sentence = re.sub(r'<!--.*?-->', '', sentence, flags=re.DOTALL)  # Remove comments
        sentence = re.sub(r'\<.*?\>', '', sentence)      # Remove any HTML tags (just in case)

        # Remove irrelevant sections like "See also", "External Links", "References", "Further reading"
        sentence = re.sub(r'\n(See also|External links|References|Further reading)\n.*?(\n|$)', '', sentence, flags=re.DOTALL)

        # Remove anything inside curly brackets (often for templates or references)
        sentence = re.sub(r'\{.*?\}', '', sentence)

        # Clean up extra spaces
        sentence = re.sub(r'\s+', ' ', sentence).strip()

        # Remove short sentences or fragments (less than 3 words)
        if len(sentence.split()) > 2:
            cleaned_sentences.append(sentence.strip())

    return cleaned_sentences

# Function to split cleaned content into sentences
def split_sentences(content, nlp):
    # Process the content with Stanza
    doc = nlp(content)
    # Remove non-informative sentences (those with just punctuation or short sentences)
    sentences = [sentence.text for sentence in doc.sentences if len(sentence.text.split()) > 2]
    return sentences

def createWikiTrainSet(category):
    global sentences, sentencesStructure

    # Fetch a list of article titles from the specified category
    titles = fetch_article_titles(category)
    print(f"Number of titles fetched from category '{category}': {len(titles)}")

    sentencesStructure = []  # Store all sentences from fetched articles

    # Fetch content from each title and split into sentences
    for title in titles:
        content = fetch_and_parse_content(title)

        # Tokenize the paragraph into sentences
        sentence_data = split_sentences(content, nlp)

        # Clean the Wikipedia content
        sentence_data = clean_wikipedia_content(sentence_data)

        # Add the list of sentences for this paragraph
        sentencesStructure.append(sentence_data)

    sentences = [sentence for sublist in sentencesStructure for sentence in sublist]
    print("Created a training set with " + str(len(sentences)) + " sentences")
    return sentences

sentences = createWikiTrainSet("sports")

Number of titles fetched from category 'sports': 59
Created a training set with 444 sentences


In [None]:
generatedSourceSentences = ['meeting, sports meeting admitting a single, meeting this meeting meeting this meeting meeting this meeting meeting some cases meeting meeting meeting this meeting meeting this definition.', 'with major competitions sports this meeting meeting this and skills, generally., some are cases, some as system meeting meeting this meeting meeting.', ', sports meeting, format, sports meeting some format, producing meeting sports meeting sports participant athleticism major competitions sports sports meeting.', 'sports provide tie-breaking methods any ensure one winner., some with some sports, which done.', ', which different.', 'as sports, such, such, some individuals, some, such, regular as the, such some admitting admitting, some, some individuals, some, such as sports meeting sports, such- some, suchdraw sports, some meeting some the some and, some sports as sports, sports season.. as sports meeting, such as the some cases, such numbers individuals cases, some, such, such as sports as the, such as sports meeting some individuals cases,, some, some as sports meeting, sports.', 'draw one, such form and skills.', ', are others one.', 'a spectators, followed sports being sports.', '. a single skills individuals, which a single person person sports different meeting sports.']

In [None]:
layerAmount = 2

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": layerAmount,         # Number of attention heads
    "n_layers": layerAmount,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

LLM_Layers = [[('Embedding', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
         ('Embedding', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
         ('Dropout', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["drop_rate"])
         ], [('Sequential', GPT_CONFIG_124M["emb_dim"], 4 * GPT_CONFIG_124M["emb_dim"]),
         ('LayerNorm', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
         ('Linear', GPT_CONFIG_124M["vocab_size"], GPT_CONFIG_124M["emb_dim"])]]

TransformerBlockLayer = [('LayerNorm', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
('Linear', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
('Linear', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
('Linear', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
('Dropout', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["drop_rate"]),
('Linear', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
('MultiHeadAttention', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
('Dropout', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["drop_rate"]),
('LayerNorm', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
('Linear', GPT_CONFIG_124M["emb_dim"], 4 * GPT_CONFIG_124M["emb_dim"]),
('GELU', 4 * GPT_CONFIG_124M["emb_dim"], 4 * GPT_CONFIG_124M["emb_dim"]),
('Linear', 4 * GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"]),
('Sequential', GPT_CONFIG_124M["emb_dim"], 4 * GPT_CONFIG_124M["emb_dim"]),
('FeedForward', GPT_CONFIG_124M["emb_dim"], 4 * GPT_CONFIG_124M["emb_dim"]),
('Dropout', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["drop_rate"]),
('TransformerBlock', GPT_CONFIG_124M["emb_dim"], GPT_CONFIG_124M["emb_dim"])
]

hidden_sizes = []
hidden_sizes.append(LLM_Layers[0])
for _ in range(layerAmount):
    hidden_sizes.append(TransformerBlockLayer)
hidden_sizes.append(LLM_Layers[1])
hidden_sizes = [item for sublist in hidden_sizes for item in sublist]

**Reading the dataframes**

In [None]:
# Read the Parquet file into a DataFrame
evalDf = pq.read_table('./Data/identifiedClosestEvalSourcesWiki1010.parquet').to_pandas(safe=False)

# Display the DataFrame
print("Evaluation Sample")
print(evalDf.head(10), "\n") # View the first 10 rows
print(evalDf.tail(10), "\n") # View the last 10 rows
print(evalDf.info(), "\n")  # Show column types and non-null counts

# Read the Parquet file into a DataFrame
generatedEvalDf = pq.read_table('./Data/identifiedClosestGeneratedEvalSourcesWiki1010.parquet').to_pandas(safe=False)

# Display the DataFrame
#print("Generated Evaluation Sample")
#print(generatedEvalDf.head(10), "\n") # View the first 10 rows
#print(generatedEvalDf.tail(10), "\n") # View the last 10 rows
#print(generatedEvalDf.info())  # Show column types and non-null counts

Evaluation Sample
            layer  neuron source  eval_neuron_value  neuron_value  difference
evalSample                                                                   
0               0       0    0:3           0.828838     -0.787064   -1.054130
0               0       0    0:7           0.828838     -0.787064   -1.054130
0               0       0    0:4           0.828838     -0.719139   -0.922670
0               0       0    0:1           0.828838     -0.685918   -0.861161
0               0       0    0:2           0.828838     -0.051844   -0.037843
0               0       0    0:6           0.828838      0.969789    0.113296
0               0       0    0:0           0.828838      1.064033    0.207421
0               0       0    0:8           0.828838      1.064033    0.207421
0               0       1    0:6          -0.169635      0.600388   -0.078424
0               0       1    0:0          -0.169635      0.302192   -0.024187 

            layer  neuron source  eval_neuro

# Extracting Most Relevant Sources with different modes

In [None]:
evaluationSamples = len(evalDf.index.unique())
trainSamples = len(evalDf['source'].unique())

# Exponential decay with a capped value to avoid overflow
def exponential_decay(weights, decay_rate=0.05, max_val=1e100):
    decay_values = np.exp(-decay_rate * np.array(weights))
    decay_values[decay_values > max_val] = max_val  # Cap values
    return decay_values

# Function to handle mode calculation safely
def safe_mode(values):
    try:
        mode_value = stats.mode(values)[0][0]
        return mode_value
    except:
        # Return a fallback value (e.g., 0) if mode cannot be calculated
        return 0

# Function to handle percentile calculation safely
def safe_percentile(values, percentile=90):
    try:
        return np.percentile(values, percentile)
    except:
        # Return a fallback value (e.g., the median) if percentile cannot be calculated
        return np.median(values)

def getMostUsedFromDataFrame(df, evalSample, closestSources, weightedMode="", info=True):
    # Filter entries for the specific evalSample
    relevant_entries = df[df.index.get_level_values('evalSample') == evalSample]

    # Use value_counts to count occurrences of each source directly
    sources = relevant_entries['source']

    # Filter out invalid sources ('None')
    valid_entries = relevant_entries[sources != 'None']
    ascending_order = True  # Sort by ascending for lowest average weights

    if valid_entries.empty:
        # Handle cases where there are no valid entries
        print(f"No valid entries for evalSample {evalSample}")
        return 0, []

    # Initialize weighted_counts to None for error handling
    weighted_counts = None

    if weightedMode == "Sum":
        # Group by 'source' and sum the 'difference' column as weights
        weighted_counts = valid_entries.groupby('source')['difference'].sum()

    elif weightedMode == "Mean":
        # Group by 'source' and calculate the average of 'difference'
        weighted_counts = valid_entries.groupby('source')['difference'].mean()

    elif weightedMode == "Median":
        # Group by 'source' and calculate the median of 'difference'
        weighted_counts = valid_entries.groupby('source')['difference'].median()

    elif weightedMode == "Mode":
        # Get the most frequent source (mode)
        weighted_counts = valid_entries['source'].mode()
        if weighted_counts.empty:
            weighted_counts = pd.Series([0])  # Fallback if no mode found

    elif weightedMode == "ExponentialDecay":
        # Apply exponential decay to the 'difference' values
        valid_entries['weighted_difference'] = exponential_decay(valid_entries['difference'])
        weighted_counts = valid_entries.groupby('source')['weighted_difference'].sum()

    elif weightedMode == "Percentile":
        # Use percentile-based filtering (example: Top 75%)
        percentile_threshold = 0.75  # For example, top 25%
        cutoff_value = safe_percentile(valid_entries['difference'], percentile_threshold * 100)
        filtered_entries = valid_entries[valid_entries['difference'] >= cutoff_value]
        weighted_counts = filtered_entries['source'].value_counts()

    else:
        # Default behavior: Count occurrences
        weighted_counts = valid_entries['source'].value_counts()
        ascending_order = False  # Sort by descending for highest counts

    # Handle case when weighted_counts is None (for non-valid weighted modes)
    if weighted_counts is None or weighted_counts.empty:
        print(f"No valid weighted counts for evalSample {evalSample} with mode {weightedMode}")
        return 0, []

    # Sort weighted sources by the determined order
    sorted_sources = weighted_counts.sort_values(ascending=ascending_order).head(closestSources)
    # Total weight (sum, mean, median, or total count for closest sources)
    total_weight = sorted_sources.sum()

    # Convert to a Counter-like output (sorted already by the determined order)
    counter = []
    for source, weight in sorted_sources.items():
        if isinstance(source, str):  # If source is a string (e.g., "0:123")
            source = int(source.replace("0:", ""))
        counter.append((source, weight))

    if info:
        # Print the total weight (sum, mean, or total count depending on the mode)
        print(f"Total Weight for Weighted Mode={weightedMode}: {total_weight}")

    if info:
        print(f"Total closest Sources (Weighted Mode={weightedMode}):", total_weight,
              "|", closestSources, "closest Sources in format [SourceNumber, Weight]:", counter)

    # Return the sorted sources and the counter
    return sorted_sources, counter

# Example usage of the updated function
for evaluationSample in range(evaluationSamples):
    print(f"Most used for Evaluation Sample {evaluationSample+6}")
    getMostUsedFromDataFrame(evalDf, evaluationSample, trainSamples, "Mean", True)
    getMostUsedFromDataFrame(evalDf, evaluationSample, trainSamples, "Sum", True)
    getMostUsedFromDataFrame(evalDf, evaluationSample, trainSamples, "Median", True)
    #getMostUsedFromDataFrame(evalDf, evaluationSample, trainSamples, "Mode", True)
    getMostUsedFromDataFrame(evalDf, evaluationSample, trainSamples, "ExponentialDecay", True)
    getMostUsedFromDataFrame(evalDf, evaluationSample, trainSamples, "Percentile", True)
    print()

Most used for Evaluation Sample 6
Total Weight for Weighted Mode=Mean: 434.80067818436305
Total closest Sources (Weighted Mode=Mean): 434.80067818436305 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -213.31245858155924), (1, -38.445456779020056), (8, 28.93729636015812), (6, 49.959052998861395), (5, 52.5449816569437), (4, 88.44749899615151), (2, 110.4298298415942), (3, 168.83703597344913), (7, 187.40289771778424)]
Total Weight for Weighted Mode=Sum: 36580759.55538227
Total closest Sources (Weighted Mode=Sum): 36580759.55538227 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -18010824.127875373), (1, -3247795.297778056), (8, 2442944.4333172687), (6, 4204803.695649169), (5, 4397331.8799246475), (4, 7441530.328041207), (2, 9323369.673866116), (3, 14235832.362173311), (7, 15793566.608063985)]
Total Weight for Weighted Mode=Median: 302.53021362423897
Total closest Sources (Weighted Mode=Median): 302.53021362423897 | 10 closest Sources in format [SourceNumber, Weight

  decay_values = np.exp(-decay_rate * np.array(weights))


Total Weight for Weighted Mode=ExponentialDecay: 4.9112963525165944e+103
Total closest Sources (Weighted Mode=ExponentialDecay): 4.9112963525165944e+103 | 10 closest Sources in format [SourceNumber, Weight]: [(5, 2.1748912316192388e+102), (7, 3.357652786266127e+102), (3, 3.591775771053462e+102), (4, 4.525911972536088e+102), (2, 4.843545232443671e+102), (6, 6.077491574544518e+102), (8, 6.207282089642434e+102), (1, 7.453458360925516e+102), (0, 1.0880954506134885e+103)]
Total Weight for Weighted Mode=Percentile: 189586
Total closest Sources (Weighted Mode=Percentile): 189586 | 10 closest Sources in format [SourceNumber, Weight]: [(0, 4148), (5, 4746), (6, 6286), (4, 10407), (8, 16680), (3, 27258), (2, 30305), (1, 44014), (7, 45742)]

Most used for Evaluation Sample 7
Total Weight for Weighted Mode=Mean: 434.80067818436305
Total closest Sources (Weighted Mode=Mean): 434.80067818436305 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -213.31245858155924), (1, -38.445456779020056)

  decay_values = np.exp(-decay_rate * np.array(weights))


Total Weight for Weighted Mode=Percentile: 189586
Total closest Sources (Weighted Mode=Percentile): 189586 | 10 closest Sources in format [SourceNumber, Weight]: [(0, 4148), (5, 4746), (6, 6286), (4, 10407), (8, 16680), (3, 27258), (2, 30305), (1, 44014), (7, 45742)]

Most used for Evaluation Sample 8
Total Weight for Weighted Mode=Mean: 434.80067818436305
Total closest Sources (Weighted Mode=Mean): 434.80067818436305 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -213.31245858155924), (1, -38.445456779020056), (8, 28.93729636015812), (6, 49.959052998861395), (5, 52.5449816569437), (4, 88.44749899615151), (2, 110.4298298415942), (3, 168.83703597344913), (7, 187.40289771778424)]
Total Weight for Weighted Mode=Sum: 36580759.55538227
Total closest Sources (Weighted Mode=Sum): 36580759.55538227 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -18010824.127875373), (1, -3247795.297778056), (8, 2442944.4333172687), (6, 4204803.695649169), (5, 4397331.8799246475), (4, 

  decay_values = np.exp(-decay_rate * np.array(weights))


Total Weight for Weighted Mode=Mean: 434.80067818436305
Total closest Sources (Weighted Mode=Mean): 434.80067818436305 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -213.31245858155924), (1, -38.445456779020056), (8, 28.93729636015812), (6, 49.959052998861395), (5, 52.5449816569437), (4, 88.44749899615151), (2, 110.4298298415942), (3, 168.83703597344913), (7, 187.40289771778424)]
Total Weight for Weighted Mode=Sum: 36580759.55538227
Total closest Sources (Weighted Mode=Sum): 36580759.55538227 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -18010824.127875373), (1, -3247795.297778056), (8, 2442944.4333172687), (6, 4204803.695649169), (5, 4397331.8799246475), (4, 7441530.328041207), (2, 9323369.673866116), (3, 14235832.362173311), (7, 15793566.608063985)]
Total Weight for Weighted Mode=Median: 302.53021362423897
Total closest Sources (Weighted Mode=Median): 302.53021362423897 | 10 closest Sources in format [SourceNumber, Weight]: [(0, 7.23631243803689), (6, 9.8

  decay_values = np.exp(-decay_rate * np.array(weights))


Total Weight for Weighted Mode=ExponentialDecay: 4.9112963525165944e+103
Total closest Sources (Weighted Mode=ExponentialDecay): 4.9112963525165944e+103 | 10 closest Sources in format [SourceNumber, Weight]: [(5, 2.1748912316192388e+102), (7, 3.357652786266127e+102), (3, 3.591775771053462e+102), (4, 4.525911972536088e+102), (2, 4.843545232443671e+102), (6, 6.077491574544518e+102), (8, 6.207282089642434e+102), (1, 7.453458360925516e+102), (0, 1.0880954506134885e+103)]
Total Weight for Weighted Mode=Percentile: 189586
Total closest Sources (Weighted Mode=Percentile): 189586 | 10 closest Sources in format [SourceNumber, Weight]: [(0, 4148), (5, 4746), (6, 6286), (4, 10407), (8, 16680), (3, 27258), (2, 30305), (1, 44014), (7, 45742)]

Most used for Evaluation Sample 10
Total Weight for Weighted Mode=Mean: 166.7276917866429
Total closest Sources (Weighted Mode=Mean): 166.7276917866429 | 10 closest Sources in format [SourceNumber, Weight]: [(9, -277.9249410218093), (0, -210.29878299965097), 

  decay_values = np.exp(-decay_rate * np.array(weights))


Total Weight for Weighted Mode=Percentile: 189578
Total closest Sources (Weighted Mode=Percentile): 189578 | 10 closest Sources in format [SourceNumber, Weight]: [(9, 175), (0, 4104), (5, 4712), (6, 6268), (4, 10424), (8, 16532), (3, 27278), (2, 30360), (1, 44010), (7, 45715)]

Most used for Evaluation Sample 11
Total Weight for Weighted Mode=Mean: 432.89977109432687
Total closest Sources (Weighted Mode=Mean): 432.89977109432687 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -215.26585852798138), (1, -38.4454763724273), (9, 0.05185828168335879), (8, 28.936890992119885), (6, 49.958846066204195), (5, 52.54596996490704), (4, 88.44756395121007), (2, 110.42973798694625), (3, 168.8371914979333), (7, 187.40304725373142)]
Total Weight for Weighted Mode=Sum: 36580759.84980041
Total closest Sources (Weighted Mode=Sum): 36580759.84980041 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -18010863.851319145), (1, -3247796.9529899135), (9, 39.723443769452835), (8, 2442910.211

  decay_values = np.exp(-decay_rate * np.array(weights))


Total Weight for Weighted Mode=Mean: 434.80067818436305
Total closest Sources (Weighted Mode=Mean): 434.80067818436305 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -213.31245858155924), (1, -38.445456779020056), (8, 28.93729636015812), (6, 49.959052998861395), (5, 52.5449816569437), (4, 88.44749899615151), (2, 110.4298298415942), (3, 168.83703597344913), (7, 187.40289771778424)]
Total Weight for Weighted Mode=Sum: 36580759.55538227
Total closest Sources (Weighted Mode=Sum): 36580759.55538227 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -18010824.127875373), (1, -3247795.297778056), (8, 2442944.4333172687), (6, 4204803.695649169), (5, 4397331.8799246475), (4, 7441530.328041207), (2, 9323369.673866116), (3, 14235832.362173311), (7, 15793566.608063985)]
Total Weight for Weighted Mode=Median: 302.53021362423897
Total closest Sources (Weighted Mode=Median): 302.53021362423897 | 10 closest Sources in format [SourceNumber, Weight]: [(0, 7.23631243803689), (6, 9.8

  decay_values = np.exp(-decay_rate * np.array(weights))


Total Weight for Weighted Mode=ExponentialDecay: 4.9112963525165944e+103
Total closest Sources (Weighted Mode=ExponentialDecay): 4.9112963525165944e+103 | 10 closest Sources in format [SourceNumber, Weight]: [(5, 2.1748912316192388e+102), (7, 3.357652786266127e+102), (3, 3.591775771053462e+102), (4, 4.525911972536088e+102), (2, 4.843545232443671e+102), (6, 6.077491574544518e+102), (8, 6.207282089642434e+102), (1, 7.453458360925516e+102), (0, 1.0880954506134885e+103)]
Total Weight for Weighted Mode=Percentile: 189586
Total closest Sources (Weighted Mode=Percentile): 189586 | 10 closest Sources in format [SourceNumber, Weight]: [(0, 4148), (5, 4746), (6, 6286), (4, 10407), (8, 16680), (3, 27258), (2, 30305), (1, 44014), (7, 45742)]

Most used for Evaluation Sample 13
Total Weight for Weighted Mode=Mean: 401.5360922082798
Total closest Sources (Weighted Mode=Mean): 401.5360922082798 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -213.31245858155924), (1, -38.445456779020056),

  decay_values = np.exp(-decay_rate * np.array(weights))


Total Weight for Weighted Mode=Percentile: 189757
Total closest Sources (Weighted Mode=Percentile): 189757 | 10 closest Sources in format [SourceNumber, Weight]: [(0, 4148), (5, 4743), (6, 6285), (4, 10395), (8, 16659), (3, 27234), (2, 30593), (1, 43981), (7, 45719)]

Most used for Evaluation Sample 14
Total Weight for Weighted Mode=Mean: 434.80067818436305
Total closest Sources (Weighted Mode=Mean): 434.80067818436305 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -213.31245858155924), (1, -38.445456779020056), (8, 28.93729636015812), (6, 49.959052998861395), (5, 52.5449816569437), (4, 88.44749899615151), (2, 110.4298298415942), (3, 168.83703597344913), (7, 187.40289771778424)]
Total Weight for Weighted Mode=Sum: 36580759.55538227
Total closest Sources (Weighted Mode=Sum): 36580759.55538227 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -18010824.127875373), (1, -3247795.297778056), (8, 2442944.4333172687), (6, 4204803.695649169), (5, 4397331.8799246475), (4,

  decay_values = np.exp(-decay_rate * np.array(weights))


Total Weight for Weighted Mode=Mean: 434.80067818436305
Total closest Sources (Weighted Mode=Mean): 434.80067818436305 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -213.31245858155924), (1, -38.445456779020056), (8, 28.93729636015812), (6, 49.959052998861395), (5, 52.5449816569437), (4, 88.44749899615151), (2, 110.4298298415942), (3, 168.83703597344913), (7, 187.40289771778424)]
Total Weight for Weighted Mode=Sum: 36580759.55538227
Total closest Sources (Weighted Mode=Sum): 36580759.55538227 | 10 closest Sources in format [SourceNumber, Weight]: [(0, -18010824.127875373), (1, -3247795.297778056), (8, 2442944.4333172687), (6, 4204803.695649169), (5, 4397331.8799246475), (4, 7441530.328041207), (2, 9323369.673866116), (3, 14235832.362173311), (7, 15793566.608063985)]
Total Weight for Weighted Mode=Median: 302.53021362423897
Total closest Sources (Weighted Mode=Median): 302.53021362423897 | 10 closest Sources in format [SourceNumber, Weight]: [(0, 7.23631243803689), (6, 9.8

  decay_values = np.exp(-decay_rate * np.array(weights))


# Calculate Sentence Similarity with different metrics

In [None]:
# Install the sentence-transformers library if you haven't already
!pip install -q sentence-transformers

# Import the required library
from sentence_transformers import SentenceTransformer

# Load a pre-trained model (e.g., 'all-MiniLM-L6-v2')
model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_cosine_similarity(embedding1, embedding2):
    return dot(embedding1, embedding2) / (norm(embedding1) * norm(embedding2))

# Function to compute both cosine and Euclidean similarity
def compute_similarity(evalEmbedding, trainEmbedding, metrics=['cosine', 'euclidean']):
    similarities = []

    if 'cosine' in metrics:
        # Cosine similarity
        cosine_sim = cosine_similarity([evalEmbedding], [trainEmbedding])[0][0]
        similarities.append(('cosine', cosine_sim))

    if 'euclidean' in metrics:
        # Euclidean distance (we invert it to make it a similarity)
        euclidean_dist = euclidean_distances([evalEmbedding], [trainEmbedding])[0][0]
        euclidean_sim = -euclidean_dist  # Invert distance to create a similarity score
        similarities.append(('euclidean', euclidean_sim))

    return similarities

# Updated function to extract sentence similarity
def extractSentenceSimilarity(name, info=True, metrics=['cosine', 'euclidean']):
    if info:
        print(f"Checking Similarity for {name}-Sample")

    # Dictionary to store all similarities for evaluation samples
    similarities_dict = {}

    # Iterate over each evaluation sample
    for evaluationSample in range(evaluationSamples):  # Adjust range based on your dataset
        evalSample = sentences[evaluationSample + 6]  # Adjust index based on your dataset
        evalEmbedding = model.encode(evalSample)

        # List to store similarities for each training sample
        similarities = []

        # Iterate over each training sample
        for trainingSample in range(trainSamples):  # Adjust based on your dataset
            trainSample = sentences[trainingSample]
            trainEmbedding = model.encode(trainSample)

            # Compute both cosine and Euclidean similarity
            metric_similarities = compute_similarity(evalEmbedding, trainEmbedding, metrics)

            # Combine the metrics (here we average them for simplicity)
            combined_similarity = np.mean([sim for _, sim in metric_similarities])

            # Append the combined similarity score along with the training sample index
            similarities.append((trainingSample, combined_similarity))

        # Sort the training samples by their combined similarity to the evaluation sample (highest similarity first)
        sorted_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)

        # Save the sorted similarities to the dictionary
        similarities_dict[evaluationSample] = sorted_similarities

        if info:
            # Print the sorted train samples by their combined similarity to the evaluation sample
            print(f"Sorted relevance for evaluation sample {evaluationSample + 6}:")
            for trainingSample, similarity in sorted_similarities:
                print(f"Training sample: '{trainingSample}' - Combined Similarity: {similarity:.4f}")
            print()

    # Return the dictionary of similarities
    return similarities_dict

# Example function calls with dataframes
sentenceSimilarity = extractSentenceSimilarity("Evaluation", True, metrics=['cosine', 'euclidean'])
#extractSentenceSimilarity("Generated Evaluation")

2025-01-06 10:31:25.703534: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-06 10:31:26.056518: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-06 10:31:26.056595: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-06 10:31:26.058474: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-06 10:31:26.218928: I tensorflow/core/platform/cpu_feature_g

Checking Similarity for Evaluation-Sample
Sorted relevance for evaluation sample 6:
Training sample: '6' - Combined Similarity: 0.5000
Training sample: '7' - Combined Similarity: -0.1120
Training sample: '5' - Combined Similarity: -0.1586
Training sample: '3' - Combined Similarity: -0.2395
Training sample: '4' - Combined Similarity: -0.2580
Training sample: '1' - Combined Similarity: -0.3670
Training sample: '8' - Combined Similarity: -0.3923
Training sample: '2' - Combined Similarity: -0.4165
Training sample: '9' - Combined Similarity: -0.4429
Training sample: '0' - Combined Similarity: -0.4851

Sorted relevance for evaluation sample 7:
Training sample: '7' - Combined Similarity: 0.5000
Training sample: '6' - Combined Similarity: -0.1120
Training sample: '4' - Combined Similarity: -0.1435
Training sample: '5' - Combined Similarity: -0.2346
Training sample: '3' - Combined Similarity: -0.2691
Training sample: '8' - Combined Similarity: -0.2882
Training sample: '1' - Combined Similarity:

# Evaluating Mode Consistency to determine best Mode

In [None]:
def evaluate_mode_consistency(similarity_scores, relevance_scores_sum, relevance_scores_mean, relevance_scores_median, k=5):
    # Helper function to normalize
    def normalize(scores):
        # Normalize values to [0, 1] range
        values = [v for _, v in scores]
        min_val = min(values)
        max_val = max(values)
        return {k: (v - min_val) / (max_val - min_val) for k, v in scores}

    def spearman_rank_correlation(x, y):
        if len(x) != len(y):
            raise ValueError("Both lists must have the same length.")

        n = len(x)

        # Rank the values
        rank_x = [sorted(x).index(val) + 1 for val in x]
        rank_y = [sorted(y).index(val) + 1 for val in y]

        # Calculate the differences in ranks
        d_squared = [(rx - ry) ** 2 for rx, ry in zip(rank_x, rank_y)]

        # Sum of squared differences
        sum_d_squared = sum(d_squared)

        # Spearman rank correlation coefficient formula
        rho = 1 - (6 * sum_d_squared) / (n * (n**2 - 1))

        return rho

    # Helper function to calculate standard deviation
    def calculate_std(scores1, scores2):
        values1 = [v for _, v in scores1]
        values2 = [v for _, v in scores2]
        assert len(values1) == len(values2), "Scores must have the same length"
        return np.std(np.array(values1) - np.array(values2))

    # Helper function to calculate mean absolute error (MAE)
    def calculate_mae(scores1, scores2):
        values1 = [v for _, v in scores1]
        values2 = [v for _, v in scores2]
        assert len(values1) == len(values2), "Scores must have the same length"
        return np.mean(np.abs(np.array(values1) - np.array(values2)))

    # Helper function to calculate Pearson correlation
    def calculate_pearson(scores1, scores2):
        values1 = [v for _, v in scores1]
        values2 = [v for _, v in scores2]
        return pearsonr(values1, values2)[0]

    # Helper function to calculate median
    def calculate_median(scores1, scores2):
        values1 = [v for _, v in scores1]
        values2 = [v for _, v in scores2]
        return np.median(np.abs(np.array(values1) - np.array(values2)))

    # Normalize all scores (only inverted relevance scores)
    similarity_scores_normalized = normalize(similarity_scores)
    relevance_scores_sum_normalized = normalize(relevance_scores_sum)
    relevance_scores_mean_normalized = normalize(relevance_scores_mean)
    relevance_scores_median_normalized = normalize(relevance_scores_median)

    # Convert lists of tuples to dictionaries for easier processing
    similarity_scores = dict(similarity_scores_normalized)
    relevance_scores_sum = dict(relevance_scores_sum_normalized)
    relevance_scores_mean = dict(relevance_scores_mean_normalized)
    relevance_scores_median = dict(relevance_scores_median_normalized)

    # Invert relevance rankings since lower relevance is better
    inverted_relevance_sum = {k: 1 - v for k, v in relevance_scores_sum_normalized.items()}
    inverted_relevance_mean = {k: 1 - v for k, v in relevance_scores_mean_normalized.items()}
    inverted_relevance_median = {k: 1 - v for k, v in relevance_scores_median_normalized.items()}

    # Compute Spearman correlation for sum, mean, and median
    # Align keys between similarity scores and inverted relevance scores
    common_keys = set(similarity_scores.keys()) & set(inverted_relevance_sum.keys())
    similarity_scores = {k: similarity_scores[k] for k in common_keys}
    inverted_relevance_sum = {k: inverted_relevance_sum[k] for k in common_keys}
    inverted_relevance_mean = {k: inverted_relevance_mean[k] for k in common_keys}
    inverted_relevance_median = {k: inverted_relevance_median[k] for k in common_keys}

    # Compute Spearman correlation for sum, mean, and median
    spearman_inverted_sum, _ = spearmanr(list(similarity_scores.values()), list(inverted_relevance_sum.values()))
    spearman_inverted_mean, _ = spearmanr(list(similarity_scores.values()), list(inverted_relevance_mean.values()))
    spearman_inverted_median, _ = spearmanr(list(similarity_scores.values()), list(inverted_relevance_median.values()))

    # Calculate Pearson correlation
    pearson_inverted_sum = calculate_pearson(list(similarity_scores.items()), list(inverted_relevance_sum.items()))
    pearson_inverted_mean = calculate_pearson(list(similarity_scores.items()), list(inverted_relevance_mean.items()))
    pearson_inverted_median = calculate_pearson(list(similarity_scores.items()), list(inverted_relevance_median.items()))

    # Calculate standard deviation between similarity and inverted relevance scores
    std_inverted_sum = calculate_std(list(similarity_scores.items()), list(inverted_relevance_sum.items()))
    std_inverted_mean = calculate_std(list(similarity_scores.items()), list(inverted_relevance_mean.items()))
    std_inverted_median = calculate_std(list(similarity_scores.items()), list(inverted_relevance_median.items()))

    # Calculate Mean Absolute Error (MAE)
    mae_inverted_sum = calculate_mae(list(similarity_scores.items()), list(inverted_relevance_sum.items()))
    mae_inverted_mean = calculate_mae(list(similarity_scores.items()), list(inverted_relevance_mean.items()))
    mae_inverted_median = calculate_mae(list(similarity_scores.items()), list(inverted_relevance_median.items()))

    # Calculate Median
    median_inverted_sum = calculate_median(list(similarity_scores.items()), list(inverted_relevance_sum.items()))
    median_inverted_mean = calculate_median(list(similarity_scores.items()), list(inverted_relevance_mean.items()))
    median_inverted_median = calculate_median(list(similarity_scores.items()), list(inverted_relevance_median.items()))

    # Max and Min Difference
    max_diff_inverted_sum = np.max(np.abs(np.array([v for _, v in similarity_scores.items()]) - np.array([v for _, v in inverted_relevance_sum.items()])))
    min_diff_inverted_sum = np.min(np.abs(np.array([v for _, v in similarity_scores.items()]) - np.array([v for _, v in inverted_relevance_sum.items()])))

    max_diff_inverted_mean = np.max(np.abs(np.array([v for _, v in similarity_scores.items()]) - np.array([v for _, v in inverted_relevance_mean.items()])))
    min_diff_inverted_mean = np.min(np.abs(np.array([v for _, v in similarity_scores.items()]) - np.array([v for _, v in inverted_relevance_mean.items()])))

    max_diff_inverted_median = np.max(np.abs(np.array([v for _, v in similarity_scores.items()]) - np.array([v for _, v in inverted_relevance_median.items()])))
    min_diff_inverted_median = np.min(np.abs(np.array([v for _, v in similarity_scores.items()]) - np.array([v for _, v in inverted_relevance_median.items()])))

    # Store results for sum, mean, and median separately
    result_sum = {
        'spearman_inverted_sum': spearman_inverted_sum,
        'pearson_inverted_sum': pearson_inverted_sum,
        'std_inverted_sum': std_inverted_sum,
        'mae_inverted_sum': mae_inverted_sum,
        'median_inverted_sum': median_inverted_sum,
        'max_diff_inverted_sum': max_diff_inverted_sum,
        'min_diff_inverted_sum': min_diff_inverted_sum
    }

    result_mean = {
        'spearman_inverted_mean': spearman_inverted_mean,
        'pearson_inverted_mean': pearson_inverted_mean,
        'std_inverted_mean': std_inverted_mean,
        'mae_inverted_mean': mae_inverted_mean,
        'median_inverted_mean': median_inverted_mean,
        'max_diff_inverted_mean': max_diff_inverted_mean,
        'min_diff_inverted_mean': min_diff_inverted_mean
    }

    result_median = {
        'spearman_inverted_median': spearman_inverted_median,
        'pearson_inverted_median': pearson_inverted_median,
        'std_inverted_median': std_inverted_median,
        'mae_inverted_median': mae_inverted_median,
        'median_inverted_median': median_inverted_median,
        'max_diff_inverted_median': max_diff_inverted_median,
        'min_diff_inverted_median': min_diff_inverted_median
    }

    # Print the results for each mode
    print(f"Mode consistency evaluation results (Sum): {result_sum}")
    print(f"Mode consistency evaluation results (Mean): {result_mean}")
    print(f"Mode consistency evaluation results (Median): {result_median}")
    print()

     # Return the results in a dictionary
    result = {
        'spearman_inverted_sum': spearman_inverted_sum,
        'spearman_inverted_mean': spearman_inverted_mean,
        'spearman_inverted_median': spearman_inverted_median,
        'pearson_inverted_sum': pearson_inverted_sum,
        'pearson_inverted_mean': pearson_inverted_mean,
        'pearson_inverted_median': pearson_inverted_median,
        'std_inverted_sum': std_inverted_sum,
        'std_inverted_mean': std_inverted_mean,
        'std_inverted_median': std_inverted_median,
        'mae_inverted_sum': mae_inverted_sum,
        'mae_inverted_mean': mae_inverted_mean,
        'mae_inverted_median': mae_inverted_median,
        'median_inverted_sum': median_inverted_sum,
        'median_inverted_mean': median_inverted_mean,
        'median_inverted_median': median_inverted_median,
        'max_diff_inverted_sum': max_diff_inverted_sum,
        'min_diff_inverted_sum': min_diff_inverted_sum,
        'max_diff_inverted_mean': max_diff_inverted_mean,
        'min_diff_inverted_mean': min_diff_inverted_mean,
        'max_diff_inverted_median': max_diff_inverted_median,
        'min_diff_inverted_median': min_diff_inverted_median
    }

    return result

def getBestModeForSimilarityCheck():
    # Get cosine similarities
    similarities = extractSentenceSimilarity("Evaluation", False, metrics=['cosine', 'euclidean'])

    # Initialize dictionaries to store the aggregated scores for each mode
    mode_scores = {'sum': 0, 'mean': 0, 'median': 0}

    # Initialize dictionaries to store the best values for each metric (we use negative infinity or positive infinity based on metrics)
    best_values = {
        'spearman': {'sum': float('-inf'), 'mean': float('-inf'), 'median': float('-inf')},
        'pearson': {'sum': float('-inf'), 'mean': float('-inf'), 'median': float('-inf')},
        'std': {'sum': float('inf'), 'mean': float('inf'), 'median': float('inf')},
        'mae': {'sum': float('inf'), 'mean': float('inf'), 'median': float('inf')},
        'median': {'sum': float('inf'), 'mean': float('inf'), 'median': float('inf')},
        'max_diff': {'sum': float('-inf'), 'mean': float('-inf'), 'median': float('-inf')},
        'min_diff': {'sum': float('inf'), 'mean': float('inf'), 'median': float('inf')}
    }

    best_modes = {
        'spearman': {'sum': None, 'mean': None, 'median': None},
        'pearson': {'sum': None, 'mean': None, 'median': None},
        'std': {'sum': None, 'mean': None, 'median': None},
        'mae': {'sum': None, 'mean': None, 'median': None},
        'median': {'sum': None, 'mean': None, 'median': None},
        'max_diff': {'sum': None, 'mean': None, 'median': None},
        'min_diff': {'sum': None, 'mean': None, 'median': None}
    }

    # Loop through each evaluation sample
    for evaluationSample in range(evaluationSamples):
        print(f"Evaluation Sample {evaluationSample + 1}:")

        # Get the most relevant sources by sum, mean, and median
        total_weight_sum, counter_sum = getMostUsedFromDataFrame(evalDf, evaluationSample, trainSamples, "Sum", False)
        total_weight_mean, counter_mean = getMostUsedFromDataFrame(evalDf, evaluationSample, trainSamples, "Mean", False)
        total_weight_median, counter_median = getMostUsedFromDataFrame(evalDf, evaluationSample, trainSamples, "Median", False)

        sentenceSimilarity = similarities[evaluationSample]

        # Get the evaluation results for the current sample
        result = evaluate_mode_consistency(sentenceSimilarity, counter_sum, counter_mean, counter_median)

        # Update best values for each metric and mode in one go (minimize checks inside loops)
        for metric in best_values:
            for mode in best_values[metric]:
                result_key = f'{metric}_inverted_{mode}'

                # Update the best values based on the metric
                current_value = result[result_key]
                if metric in ['spearman', 'pearson', 'max_diff']:  # Maximizing
                    if current_value > best_values[metric][mode]:
                        best_values[metric][mode] = current_value
                        best_modes[metric][mode] = evaluationSample
                else:  # Minimizing (std, mae, median, min_diff)
                    if current_value < best_values[metric][mode]:
                        best_values[metric][mode] = current_value
                        best_modes[metric][mode] = evaluationSample

        # Aggregate the results for each mode
        for mode in ['sum', 'mean', 'median']:
            mode_score = 0
            # Sum all the metrics for the current mode to compute the score
            for metric in best_values:
                if metric in ['spearman', 'pearson', 'max_diff']:  # Maximizing correlation
                    mode_score += best_values[metric][mode]
                else:  # Minimizing error
                    mode_score -= best_values[metric][mode]
            mode_scores[mode] += mode_score

    # After all evaluation samples, determine the best mode based on the aggregated score
    best_mode = max(mode_scores, key=mode_scores.get)

    # Print the best mode and corresponding details
    print("\nBest Mode Evaluation Results:")
    print(f"Best Mode: {best_mode.capitalize()} with an aggregated score of {mode_scores[best_mode]}")

    # Print detailed results for each metric and best value for the best mode
    for metric in best_values:
        print(f"  - Best {metric.capitalize()} for {best_mode.capitalize()}: {best_values[metric][best_mode]}")

    # Ranking: Print the ranking of the modes based on aggregated scores
    print("\nRanking of Modes Based on Aggregated Scores:")
    ranked_modes = sorted(mode_scores.items(), key=lambda x: x[1], reverse=True)
    for rank, (mode, score) in enumerate(ranked_modes, 1):
        print(f"{rank}. {mode.capitalize()}: Score = {score}")

    # Pareto Analysis: Find top contributors based on the 80/20 rule
    pareto_cutoff = 0.8 * sum(mode_scores.values())
    pareto_modes = []
    cumulative_score = 0
    for mode, score in ranked_modes:
        cumulative_score += score
        pareto_modes.append((mode, score))
        if cumulative_score >= pareto_cutoff:
            break

    print("\nPareto Analysis - Top Contributing Modes:")
    for mode, score in pareto_modes:
        print(f"{mode.capitalize()}: Score = {score}")

getBestModeForSimilarityCheck()

Evaluation Sample 1:
Mode consistency evaluation results (Sum): {'spearman_inverted_sum': -0.4666666666666666, 'pearson_inverted_sum': -0.3022804544485473, 'std_inverted_sum': 0.4560715595417233, 'mae_inverted_sum': 0.34746280290315207, 'median_inverted_sum': 0.300657461344375, 'max_diff_inverted_sum': 1.0, 'min_diff_inverted_sum': 0.005645183043843205}
Mode consistency evaluation results (Mean): {'spearman_inverted_mean': -0.4666666666666666, 'pearson_inverted_mean': -0.3025750992513467, 'std_inverted_mean': 0.4561015884005892, 'mae_inverted_mean': 0.34750812600492537, 'median_inverted_mean': 0.30117671550594927, 'max_diff_inverted_mean': 1.0, 'min_diff_inverted_mean': 0.005065188441419566}
Mode consistency evaluation results (Median): {'spearman_inverted_median': -0.06666666666666667, 'pearson_inverted_median': 0.30735484466000906, 'std_inverted_median': 0.39497335000742984, 'mae_inverted_median': 0.3494028994682919, 'median_inverted_median': 0.37878867983818054, 'max_diff_inverted_m

# Extracting Relevant Source score for each layer

In [None]:
def extractRelevantSourceScores(df, dfName):
    print(f"Relevant source scores per layer and overall for {dfName}")

    # Dictionary to store the occurrence count of each source per layer
    source_occurrences_per_layer = defaultdict(lambda: defaultdict(int))
    # Dictionary to store the total occurrence count of each source across all layers
    total_source_occurrences = defaultdict(int)
    # Dictionary to track source occurrences for each evaluation sample
    source_occurrences_per_eval = defaultdict(lambda: defaultdict(int))

    # Iterate over each evaluation sample
    for evaluationSample in range(evaluationSamples):  # Adjust range based on your dataset
        eval_df = df[df.index == (evaluationSample + 6)]  # Filter rows for the evaluation sample

        # For each layer, track the source occurrences
        for layer in eval_df['layer'].unique():
            layer_data = eval_df[eval_df['layer'] == layer]  # Data for the current layer

            # Get the sources for the current layer
            sources = layer_data['source'].values

            # Count the occurrences of each source
            for source in sources:
                source_occurrences_per_layer[layer][source] += 1
                total_source_occurrences[source] += 1
                source_occurrences_per_eval[evaluationSample][source] += 1

    # Print the relevance of sources for each evaluation sample
    print("Relevant sources for each evaluation sample:")
    for eval_sample, source_counts in source_occurrences_per_eval.items():
        total_occurrences = sum(source_counts.values())  # Total occurrences for this evaluation sample
        print(f"Evaluation Sample {eval_sample}:")
        for source, count in source_counts.items():
            relevance_score = count / total_occurrences  # Relevance based on proportion for this eval sample
            print(f"  - Source {source}: Relevance = {relevance_score:.4f}")
        print()

    # Calculate and print total relevance scores across all layers
    print(f"Overall relevant source scores:")
    total_occurrences_in_all_layers = sum(total_source_occurrences.values())
    for source, count in total_source_occurrences.items():
        overall_relevance_score = count / total_occurrences_in_all_layers  # Relevance based on total occurrences across all layers
        print(f"  - Source {source} overall relevance score: {overall_relevance_score:.4f}")
    print()

    # Print relevant sources for each layer
    print("Relevant sources per layer:")
    for layer, source_counts in source_occurrences_per_layer.items():
        total_occurrences = sum(source_counts.values())
        print(f"Layer {layer}:")
        for source, count in source_counts.items():
            relevance_score = count / total_occurrences  # Relevance based on occurrence ratio
            print(f"  - Source {source} relevance score for this layer: {relevance_score:.4f}")
        print()

# Example function calls with dataframes
extractRelevantSourceScores(evalDf, "Evaluation")
#extractRelevantSourceScores(generatedEvalDf, "Generated Evaluation")

Relevant source scores per layer and overall for Evaluation
Relevant sources for each evaluation sample:
Evaluation Sample 0:
  - Source 0:3: Relevance = 0.1112
  - Source 0:7: Relevance = 0.1111
  - Source 0:4: Relevance = 0.1109
  - Source 0:1: Relevance = 0.1114
  - Source 0:2: Relevance = 0.1113
  - Source 0:6: Relevance = 0.1110
  - Source 0:0: Relevance = 0.1113
  - Source 0:8: Relevance = 0.1113
  - Source 0:5: Relevance = 0.1104

Evaluation Sample 1:
  - Source 0:3: Relevance = 0.1111
  - Source 0:7: Relevance = 0.1110
  - Source 0:4: Relevance = 0.1108
  - Source 0:1: Relevance = 0.1113
  - Source 0:2: Relevance = 0.1121
  - Source 0:6: Relevance = 0.1109
  - Source 0:0: Relevance = 0.1112
  - Source 0:8: Relevance = 0.1112
  - Source 0:5: Relevance = 0.1103

Evaluation Sample 2:
  - Source 0:3: Relevance = 0.1112
  - Source 0:7: Relevance = 0.1111
  - Source 0:4: Relevance = 0.1109
  - Source 0:1: Relevance = 0.1114
  - Source 0:2: Relevance = 0.1113
  - Source 0:6: Relevance

# Extracting most relevant layers based on both sentence and difference similarity

In [None]:
def extractRelevantLayersBasedOnTotalSimilarity(df, dfName):
    print(f"Most relevant layers for {dfName}")
    total_relevance_scores = {layer: [] for layer in df['layer'].unique()}

    # Iterate over each evaluation sample
    for evaluationSample in range(evaluationSamples):  # Adjust range based on your dataset
        evalSample = sentences[evaluationSample + 6]  # Adjust index based on your dataset
        evalEmbedding = model.encode(evalSample)

        # Iterate over each training sample
        for trainingSample in range(trainSamples):  # Adjust based on your dataset
            trainSample = sentences[trainingSample]
            trainEmbedding = model.encode(trainSample)

            # Compute cosine similarity between the evaluation and training sentence embeddings
            sentenceCS = compute_cosine_similarity(evalEmbedding, trainEmbedding)

            # Filter the DataFrame for the current evaluation sample and layer
            eval_df = df[df.index == (evaluationSample)]  # Filter rows for the evaluation sample

            # For each layer, calculate the relevance score
            for layer in eval_df['layer'].unique():
                layer_data = eval_df[eval_df['layer'] == layer]  # Data for the current layer

                # Get the layer difference (you already have this as 'difference')
                layer_difference = layer_data['difference'].values

                # Reduce layer_difference to a scalar by taking the mean if it's an array
                if len(layer_difference.shape) > 1:
                    layer_difference = np.mean(layer_difference)

                # Normalize the components
                sentenceCS_normalized = (sentenceCS + 1) / 2  # Rescale to 0–1 range
                layer_difference_normalized = (layer_difference - np.min(layer_difference)) / (np.max(layer_difference) - np.min(layer_difference))

                # Calculate the relevance score: combine sentenceCS and layer difference
                relevance_score = sentenceCS_normalized * (1 - layer_difference_normalized)  # Adjust this as needed

                # Debug print relevance score
                #print(f"Relevance score for evalSample {evaluationSample}, layer {layer}: {relevance_score}")

                # Ensure that the relevance score is a scalar before appending
                total_relevance_scores[layer].append(relevance_score)

        # Calculate total relevance scores and sort by relevance
    layer_relevance = {}
    for layer, relevance_scores in total_relevance_scores.items():
        # Flatten the list if there are multiple sub-lists for each layer
        flattened_relevance_scores = np.concatenate([np.array(relevance_scores[i]) for i in range(len(relevance_scores))])
        # Calculate the total relevance score by averaging the flattened list
        total_relevance = np.mean(flattened_relevance_scores)
        layer_relevance[layer] = total_relevance

    # Sort layers by their total relevance score in descending order
    sorted_layers = sorted(layer_relevance.items(), key=lambda x: x[1], reverse=True)

    # Print sorted relevance scores
    for layer, relevance_score in sorted_layers:
        print(f"Layer {layer}, Type={hidden_sizes[layer][0]} - Total Relevance Score: {relevance_score}")
    print()

# Example function calls with dataframes
extractRelevantLayersBasedOnTotalSimilarity(evalDf, "Evaluation")
#extractRelevantLayersBasedOnTotalSimilarity(generatedEvalDf, "Generated Evaluation")

Most relevant layers for Evaluation
Layer 13, Type=GELU - Total Relevance Score: 0.6886349474362854
Layer 4, Type=Linear - Total Relevance Score: 0.6617097377301974
Layer 23, Type=Dropout - Total Relevance Score: 0.6525621726547421
Layer 29, Type=GELU - Total Relevance Score: 0.5718572391069195
Layer 7, Type=Dropout - Total Relevance Score: 0.5501056762765494
Layer 6, Type=Linear - Total Relevance Score: 0.49088975551062075
Layer 21, Type=Linear - Total Relevance Score: 0.4503052048080021
Layer 30, Type=Linear - Total Relevance Score: 0.4152231611215884
Layer 31, Type=Sequential - Total Relevance Score: 0.4152231611215884
Layer 32, Type=FeedForward - Total Relevance Score: 0.4152231611215884
Layer 27, Type=LayerNorm - Total Relevance Score: 0.4098545452544785
Layer 14, Type=Linear - Total Relevance Score: 0.3274401048938315
Layer 15, Type=Sequential - Total Relevance Score: 0.3274401048938315
Layer 16, Type=FeedForward - Total Relevance Score: 0.3274401048938315
Layer 5, Type=Linear - 

# Relevant layers based on direct similarity between sentences and activation

In [None]:
def extractLayersBestMatchingSentenceSimilarities(df, dfName):
    print(f"Most relevant layers for {dfName}")
    total_relevance_scores = {layer: [] for layer in df['layer'].unique()}

    # Iterate over each evaluation sample
    for evaluationSample in range(evaluationSamples):  # Adjust range based on your dataset
        evalSample = sentences[evaluationSample + 6]  # Adjust index based on your dataset
        evalEmbedding = model.encode(evalSample)

        # Iterate over each training sample
        for trainingSample in range(trainSamples):  # Adjust based on your dataset
            trainSample = sentences[trainingSample]
            trainEmbedding = model.encode(trainSample)

            # Compute cosine similarity between the evaluation and training sentence embeddings
            sentenceCS = compute_cosine_similarity(evalEmbedding, trainEmbedding)

            # Filter the DataFrame for the current evaluation sample and layer
            eval_df = df[df.index == (evaluationSample)]  # Filter rows for the evaluation sample

            # For each layer, calculate the relevance score
            for layer in eval_df['layer'].unique():
                layer_data = eval_df[eval_df['layer'] == layer]  # Data for the current layer

                # Get the layer difference (you already have this as 'difference')
                layer_difference = layer_data['difference'].values

                # Reduce layer_difference to a scalar by taking the mean if it's an array
                if len(layer_difference.shape) > 1:
                    layer_difference = np.mean(layer_difference)

                # Min-Max Normalization for both sentenceCS and layer_difference
                sentenceCS_min_max = (sentenceCS + 1) / 2  # Rescale to 0–1 range
                layer_difference_min_max = (layer_difference - np.min(layer_difference)) / (np.max(layer_difference) - np.min(layer_difference))

                # Z-Score Normalization for both sentenceCS and layer_difference with checks
                if np.std(sentenceCS) != 0:  # Check if std is not zero
                    sentenceCS_z_score = (sentenceCS - np.mean(sentenceCS)) / np.std(sentenceCS)
                else:
                    sentenceCS_z_score = 0  # If std is zero, set to 0 or use a fallback

                if np.std(layer_difference) != 0:  # Check if std is not zero
                    layer_difference_z_score = (layer_difference - np.mean(layer_difference)) / np.std(layer_difference)
                else:
                    layer_difference_z_score = 0  # If std is zero, set to 0 or use a fallback

                # Calculate the relevance score using Min-Max normalization
                relevance_score_min_max = abs(layer_difference_min_max - sentenceCS_min_max)

                # Calculate the relevance score using Z-Score normalization
                relevance_score_z_score = abs(layer_difference_z_score - sentenceCS_z_score)

                # Combine both relevance scores by averaging them
                combined_relevance_score = (relevance_score_min_max + relevance_score_z_score) / 2

                # Ensure that the relevance score is a scalar before appending
                total_relevance_scores[layer].append(combined_relevance_score)

    # Calculate total relevance scores and sort by relevance
    layer_relevance = {}

    # Calculate total relevance scores using median, mean, and sum (normalized by hidden_sizes[layer][1])
    for layer, relevance_scores in total_relevance_scores.items():
        # Flatten the list if there are multiple sub-lists for each layer
        flattened_relevance_scores = np.concatenate([np.array(relevance_scores[i]) for i in range(len(relevance_scores))])

        # Calculate total relevance using median, mean, and sum
        total_relevance_median = np.median(flattened_relevance_scores)
        total_relevance_mean = np.mean(flattened_relevance_scores)

        # Normalize the sum by dividing by hidden_sizes[layer][1]
        total_relevance_sum = np.sum(flattened_relevance_scores) / hidden_sizes[layer][1]

        # Store all three relevance scores in a dictionary for each layer
        layer_relevance[layer] = {
            'median': total_relevance_median,
            'mean': total_relevance_mean,
            'sum': total_relevance_sum
        }

    # Sort layers by the chosen relevance metric (e.g., 'median', 'mean', or 'sum')
    sorted_layers = sorted(layer_relevance.items(),
                           key=lambda x: (x[1]['median'], x[1]['mean'], x[1]['sum']))

    # Print sorted relevance scores
    for layer, relevance_scores in sorted_layers:
        print(f"Layer {layer}, Type={hidden_sizes[layer][0]} - "
              f"Total Relevance Scores (Median: {relevance_scores['median']}, "
              f"Mean: {relevance_scores['mean']}, Sum (normalized): {relevance_scores['sum']})")

# Example function calls with dataframes
extractLayersBestMatchingSentenceSimilarities(evalDf, "Evaluation")

Most relevant layers for Evaluation
Layer 8, Type=Linear - Total Relevance Scores (Median: 0.0698969884843425, Mean: 0.21248351518467962, Sum (normalized): 190.76482255187057)
Layer 9, Type=MultiHeadAttention - Total Relevance Scores (Median: 0.0698969884843425, Mean: 0.21248351518467962, Sum (normalized): 190.76482255187057)
Layer 18, Type=TransformerBlock - Total Relevance Scores (Median: 0.08334401587582568, Mean: 0.22994817996214828, Sum (normalized): 206.23477390355174)
Layer 11, Type=LayerNorm - Total Relevance Scores (Median: 0.09164316676011014, Mean: 0.2460239701416154, Sum (normalized): 220.90902318965883)
Layer 34, Type=TransformerBlock - Total Relevance Scores (Median: 0.09337356207031636, Mean: 0.2565708885324674, Sum (normalized): 230.3124616592227)
Layer 35, Type=Sequential - Total Relevance Scores (Median: 0.09337356207031636, Mean: 0.2565708885324674, Sum (normalized): 230.3124616592227)
Layer 20, Type=Linear - Total Relevance Scores (Median: 0.10043148762108761, Mean:

In [None]:
from scipy.stats import spearmanr, kendalltau, pearsonr

def compute_cosine_similarity(image1, image2):
    """Compute cosine similarity between two images."""
    vec1 = image1.flatten().reshape(1, -1)
    vec2 = image2.flatten().reshape(1, -1)
    return cosine_similarity(vec1, vec2)[0][0]

def computeSimilarity(sample, train_sample):
    # Compute similarities
    cosine_similarity = compute_cosine_similarity(sample, train_sample)
    euclidean_distance = np.linalg.norm(sample - train_sample)  # Euclidean
    manhattan_distance = np.sum(np.abs(sample - train_sample))  # Manhattan
    jaccard_similarity = (
        np.sum(np.minimum(sample, train_sample)) / np.sum(np.maximum(sample, train_sample))
        if np.sum(np.maximum(sample, train_sample)) > 0 else None
    )
    hamming_distance = np.mean(sample != train_sample)  # Hamming
    try:
        pearson_correlation, _ = pearsonr(sample.flatten(), train_sample.flatten())  # Pearson
    except ValueError:
        pearson_correlation = None

    return cosine_similarity, euclidean_distance, manhattan_distance, jaccard_similarity, hamming_distance, pearson_correlation

def blendActivations(evaluationToCheck, name, eval_df):
    _, mostUsed = getMostUsedFromDataFrame(eval_df, evaluationToCheck, 5, info=False)
    totalSources = sum(count for _, count in mostUsed)

    # Slice to get the first two dimensions from hidden_sizes
    neurons_per_layer = np.array([layer[1] for layer in hidden_sizes])

    # Create a single zero-initialized array of shape (38, max_neurons)
    relevantParts = np.zeros((len(neurons_per_layer), int(neurons_per_layer.max())))
    blendedActivations = np.zeros_like(relevantParts)

    #print(relevantParts.shape)
    evalActivations = np.zeros_like(relevantParts)
    evalEntries = eval_df[eval_df.index == evaluationToCheck]
    for entry in evalEntries[['layer', 'neuron', 'eval_neuron_value']].drop_duplicates().itertuples():
        evalActivations[int(entry.layer)][int(entry.neuron)] = float(entry.eval_neuron_value)

    for source, count in mostUsed:
        sourceName = "0:" + str(source)
        sourceEntries = evalEntries[evalEntries['source'] == sourceName]
        for entry in sourceEntries[['layer', 'neuron', 'neuron_value', 'eval_neuron_value']].drop_duplicates().itertuples():
            #print(entry)
            # Access 'neuron_value' properly
            blendedActivations[int(entry.layer), int(entry.neuron)] += float(entry.neuron_value) * (count / totalSources)

    # Convert blendedActivations to dense array before similarity calculations
    evaluationActivations = np.asarray(evalActivations.flatten().reshape(1, -1), dtype=np.float64)
    blendedActivations_dense = blendedActivations.flatten().reshape(1, -1)

    cosine_similarity, euclidean_distance, manhattan_distance, jaccard_similarity, hamming_distance, pearson_correlation = computeSimilarity(evaluationActivations, blendedActivations_dense)

    kendall_tau, _ = kendalltau(evaluationActivations, blendedActivations_dense)
    spearman_rho, _ = spearmanr(evaluationActivations, blendedActivations_dense)

    # --- Print Results ---
    print(f"\n--- Blended Activation Similarity Scores for {name}-Sample{evaluationToCheck} ---")
    print(f"Kendall's Tau: {kendall_tau:.2f}")
    print(f"Spearman's Rho: {spearman_rho:.2f}")
    print(f"Cosine Similarity: {cosine_similarity:.4f}")
    print(f"Euclidean Distance: {euclidean_distance:.4f}")
    print(f"Manhattan Distance: {manhattan_distance:.4f}")
    print(f"Jaccard Similarity: {jaccard_similarity:.4f}" if jaccard_similarity is not None else "Jaccard Similarity: N/A")
    print(f"Hamming Distance: {hamming_distance:.4f}")
    print(f"Pearson Correlation: {pearson_correlation:.4f}" if pearson_correlation is not None else "Pearson Correlation: N/A")

for evaluation in range(evaluationSamples):
    blendActivations(evaluation, "Evaluation", evalDf)
    blendActivations(evaluation, "GeneratedEvaluation", generatedEvalDf)


--- Blended Activation Similarity Scores for Evaluation-Sample0 ---
Kendall's Tau: 0.77
Spearman's Rho: nan
Cosine Similarity: 0.7910
Euclidean Distance: 1479.7354
Manhattan Distance: 165754.6559
Jaccard Similarity: N/A
Hamming Distance: 0.0443
Pearson Correlation: 0.7877

--- Blended Activation Similarity Scores for GeneratedEvaluation-Sample0 ---
Kendall's Tau: 0.84
Spearman's Rho: nan
Cosine Similarity: 0.9540
Euclidean Distance: 725.0244
Manhattan Distance: 153259.4231
Jaccard Similarity: N/A
Hamming Distance: 0.0439
Pearson Correlation: 0.9529

--- Blended Activation Similarity Scores for Evaluation-Sample1 ---
Kendall's Tau: 0.77
Spearman's Rho: nan
Cosine Similarity: 0.7910
Euclidean Distance: 1479.7354
Manhattan Distance: 165754.6559
Jaccard Similarity: N/A
Hamming Distance: 0.0443
Pearson Correlation: 0.7877

--- Blended Activation Similarity Scores for GeneratedEvaluation-Sample1 ---
Kendall's Tau: 0.84
Spearman's Rho: nan
Cosine Similarity: 0.9546
Euclidean Distance: 712.65

In [None]:
from scipy.stats import spearmanr, kendalltau, pearsonr

def compute_cosine_similarity(image1, image2):
    """Compute cosine similarity between two images."""
    vec1 = image1.flatten().reshape(1, -1)
    vec2 = image2.flatten().reshape(1, -1)
    return cosine_similarity(vec1, vec2)[0][0]

def computeSimilarity(sample, train_sample, normalized=False):
    def normalize_vector(vec):
        norm = np.linalg.norm(vec)  # Calculate the L2 norm (Euclidean norm) of the vector
        if norm == 0:  # To prevent division by zero if the vector is all zeros
            return vec
        return vec / norm  # Divide the vector by its norm to normalize it

    if (normalized):
        # Normalize sample and train_sample
        sample = normalize_vector(sample)
        train_sample = normalize_vector(train_sample)

    # Compute similarities
    cosine_similarity = compute_cosine_similarity(sample, train_sample)
    euclidean_distance = np.linalg.norm(sample - train_sample)  # Euclidean
    manhattan_distance = np.sum(np.abs(sample - train_sample))  # Manhattan

    shift = abs(min(np.min(sample), np.min(train_sample))) + 1
    sample_shifted = sample + shift
    train_sample_shifted = train_sample + shift
    jaccard_similarity = (
        np.sum(np.minimum(sample_shifted, train_sample_shifted)) /
        np.sum(np.maximum(sample_shifted, train_sample_shifted))
        if np.sum(np.maximum(sample_shifted, train_sample_shifted)) > 0 else None
    )
    hamming_distance = np.mean(sample != train_sample)  # Hamming
    try:
        pearson_correlation, _ = pearsonr(sample.flatten(), train_sample.flatten())  # Pearson
    except ValueError:
        pearson_correlation = None

    return cosine_similarity, euclidean_distance, manhattan_distance, jaccard_similarity, hamming_distance, pearson_correlation

def blendActivations(evaluationToCheck, name, closestSources, eval_df, debug=True, difference=False):
    _, mostUsed = getMostUsedFromDataFrame(evalDf, evaluationToCheck, closestSources, info=False)
    totalSources = sum(count for _, count in mostUsed)

    min_count = min([count for _, count in mostUsed])  # Find the maximum count value

    if(difference):
        # Adjust each count by subtracting the minimum count
        for idx, (source, count) in enumerate(mostUsed):
            mostUsed[idx] = (source, count - min_count)

        totalSources = sum(count for _, count in mostUsed)

    # Slice to get the first two dimensions from hidden_sizes
    neurons_per_layer = np.array([layer[1] for layer in hidden_sizes])

    # Create a single zero-initialized array of shape (38, max_neurons)
    relevantParts = np.zeros((len(neurons_per_layer), int(neurons_per_layer.max())))
    blendedActivations = np.zeros_like(relevantParts)

    #print(relevantParts.shape)
    evalActivations = np.zeros_like(relevantParts)
    evalEntries = eval_df[eval_df.index == evaluationToCheck]
    for entry in evalEntries[['layer', 'neuron', 'eval_neuron_value']].drop_duplicates().itertuples():
        evalActivations[int(entry.layer)][int(entry.neuron)] = float(entry.eval_neuron_value)

    for source, count in mostUsed:
        sourceName = "0:" + str(source)
        sourceEntries = evalEntries[evalEntries['source'] == sourceName]
        for entry in sourceEntries[['layer', 'neuron', 'neuron_value', 'eval_neuron_value']].drop_duplicates().itertuples():
            #print(entry)
            # Access 'neuron_value' properly
            blendedActivations[int(entry.layer), int(entry.neuron)] += float(entry.neuron_value) * (count / totalSources)

    # Convert blendedActivations to dense array before similarity calculations
    evaluationActivations = np.asarray(evalActivations.flatten().reshape(1, -1), dtype=np.float64)
    blendedActivations_dense = blendedActivations.flatten().reshape(1, -1)

    cosine_similarity, euclidean_distance, manhattan_distance, jaccard_similarity, hamming_distance, pearson_correlation = computeSimilarity(evaluationActivations, blendedActivations_dense)

    kendall_tau, _ = kendalltau(evaluationActivations, blendedActivations_dense)
    spearman_rho, _ = spearmanr(evaluationActivations, blendedActivations_dense)

    if(debug):
        # --- Print Results ---
        print(f"\n--- Blended Activation Similarity Scores for {name}-Sample{evaluationToCheck} ---")
        print(f"Kendall's Tau: {kendall_tau:.2f}")
        print(f"Spearman's Rho: {spearman_rho:.2f}")
        print(f"Cosine Similarity: {cosine_similarity:.4f}")
        print(f"Euclidean Distance: {euclidean_distance:.4f}")
        print(f"Manhattan Distance: {manhattan_distance:.4f}")
        print(f"Jaccard Similarity: {jaccard_similarity:.4f}" if jaccard_similarity is not None else "Jaccard Similarity: N/A")
        print(f"Hamming Distance: {hamming_distance:.4f}")
        print(f"Pearson Correlation: {pearson_correlation:.4f}" if pearson_correlation is not None else "Pearson Correlation: N/A")

    # Store best layers
    layer_metrics = {
        "cosine_similarity": cosine_similarity,
        "kendall_tau": kendall_tau,
        "pearson_correlation": pearson_correlation,
        "jaccard_similarity": jaccard_similarity,
        "euclidean_distance": euclidean_distance,
        "manhattan_distance": manhattan_distance,
        "hamming_distance": hamming_distance,
    }

    return blendedActivations, evalActivations, layer_metrics

for evaluation in range(evaluationSamples):
    blendActivations(evaluation, "Evaluation", 5, evalDf)
    blendActivations(evaluation, "GeneratedEvaluation", 5, generatedEvalDf)


--- Blended Activation Similarity Scores for Evaluation-Sample0 ---
Kendall's Tau: 0.77
Spearman's Rho: nan
Cosine Similarity: 0.7910
Euclidean Distance: 1479.7354
Manhattan Distance: 165754.6559
Jaccard Similarity: 0.9987
Hamming Distance: 0.0443
Pearson Correlation: 0.7877

--- Blended Activation Similarity Scores for GeneratedEvaluation-Sample0 ---
Kendall's Tau: 0.84
Spearman's Rho: nan
Cosine Similarity: 0.9495
Euclidean Distance: 817.8663
Manhattan Distance: 186968.1171
Jaccard Similarity: 0.9974
Hamming Distance: 0.0439
Pearson Correlation: 0.9484

--- Blended Activation Similarity Scores for Evaluation-Sample1 ---
Kendall's Tau: 0.77
Spearman's Rho: nan
Cosine Similarity: 0.7910
Euclidean Distance: 1479.7354
Manhattan Distance: 165754.6559
Jaccard Similarity: 0.9987
Hamming Distance: 0.0443
Pearson Correlation: 0.7877

--- Blended Activation Similarity Scores for GeneratedEvaluation-Sample1 ---
Kendall's Tau: 0.82
Spearman's Rho: nan
Cosine Similarity: 0.9420
Euclidean Distanc

In [None]:
def findBestLayerByBlendedActivations(evaluationToCheck, name, closestSources, eval_df):
    blendedActivations, evalActivations, layer_metrics = blendActivations(evaluationToCheck, name, closestSources, eval_df, debug=False)

    # --- Compute Metrics ---
    best_layers = {metric: {"value": None, "layer": None} for metric in [
        "cosine_similarity", "kendall_tau", "pearson_correlation", "jaccard_similarity",
        "euclidean_distance", "manhattan_distance", "hamming_distance"
    ]}

    for layer_idx in range(len(blendedActivations)):
        for metric, optimize_fn in {
            "cosine_similarity": max,
            "kendall_tau": max,
            "pearson_correlation": max,
            "jaccard_similarity": max,
            "euclidean_distance": min,
            "manhattan_distance": min,
            "hamming_distance": min,
        }.items():
            if best_layers[metric]["value"] is None or optimize_fn(layer_metrics[metric], best_layers[metric]["value"]) == layer_metrics[metric]:
                best_layers[metric]["value"] = layer_metrics[metric]
                best_layers[metric]["layer"] = layer_idx

    # Return the blended and evaluation activations for further analysis
    return {
        "blended_activations": blendedActivations,
        "evaluation_activations": evalActivations,
        "best_layers": best_layers
    }

def aggregateLayerMetricsByAllMeans(name, closestSources, eval_df):
    # Metrics to compute and their optimization direction
    metrics_to_optimize = {
        "cosine_similarity": max,
        "kendall_tau": max,
        "pearson_correlation": max,
        "jaccard_similarity": max,
        "euclidean_distance": min,
        "manhattan_distance": min,
        "hamming_distance": min,
    }

    # Store metrics for all layers
    layer_metrics_aggregated = {layer: {metric: [] for metric in metrics_to_optimize} for layer in range(len(hidden_sizes))}

    for evaluation in range(evaluationSamples):
        # Run analysis for the current evaluation sample
        blendedActivations = findBestLayerByBlendedActivations(
            evaluation, name, closestSources, eval_df
        )

        # Aggregate metrics layer-wise
        for layer_idx in range(len(hidden_sizes)):
            evaluation_layer = blendedActivations["evaluation_activations"][layer_idx].flatten()
            blended_layer = blendedActivations["blended_activations"][layer_idx].flatten()

            cosine_similarity, euclidean_distance, manhattan_distance, jaccard_similarity, hamming_distance, pearson_correlation = computeSimilarity(
                evaluation_layer.reshape(1, -1), blended_layer.reshape(1, -1)
            )

            kendall_tau, _ = kendalltau(evaluation_layer, blended_layer)
            spearman_rho, _ = spearmanr(evaluation_layer, blended_layer)

            # Add layer metrics to aggregated results
            layer_metrics_aggregated[layer_idx]["cosine_similarity"].append(cosine_similarity)
            layer_metrics_aggregated[layer_idx]["kendall_tau"].append(kendall_tau)
            layer_metrics_aggregated[layer_idx]["pearson_correlation"].append(pearson_correlation)
            layer_metrics_aggregated[layer_idx]["jaccard_similarity"].append(jaccard_similarity)
            layer_metrics_aggregated[layer_idx]["euclidean_distance"].append(euclidean_distance)
            layer_metrics_aggregated[layer_idx]["manhattan_distance"].append(manhattan_distance)
            layer_metrics_aggregated[layer_idx]["hamming_distance"].append(hamming_distance)

    # Use np.mean to calculate the mean
    sorted_layers = sorted(
        layer_metrics_aggregated.items(),
        key=lambda x: (
            -np.mean(x[1].get("pearson_correlation", [0])),  # Mean of the list, default to 0 if empty
            -np.mean(x[1].get("cosine_similarity", [0])),    # Mean of the list, default to 0 if empty
            -np.mean(x[1].get("kendall_tau", [0])),          # Mean of the list, default to 0 if empty
            -np.mean(x[1].get("jaccard_similarity", [0])),    # Mean of the list, default to 0 if empty
            np.mean(x[1].get("manhattan_distance", [0])),    # Mean for ascending order
            np.mean(x[1].get("euclidean_distance", [0])),    # Mean for ascending order
            np.mean(x[1].get("hamming_distance", [0])),      # Mean for ascending order
        ),
        reverse=False  # Use the above tuple ordering directly
    )

    mostRelevantLayers = []
    # Print sorted relevance scores
    print(f"\n--- Sorted Layers by Custom Order of Metrics for {name} ---")
    for layer, metrics in sorted_layers:
        if(np.mean(metrics["pearson_correlation"]) >= 0.85 and np.mean(metrics["cosine_similarity"]) >= 0.85):
            mostRelevantLayers.append(layer)
        print(
            f"Layer {layer}, Type={hidden_sizes[layer][0]}\n" +
            "\n".join(
                f"    {metric.capitalize()}: {np.mean(metrics.get(metric, 0))}"
                for metric in ["pearson_correlation", "cosine_similarity", "jaccard_similarity","kendall_tau", "manhattan_distance", "euclidean_distance", "hamming_distance"]
                if metric in metrics
            )
        )

    parameters = 0
    print("Most Relevant Layers: ")
    for layer in mostRelevantLayers:
        if(hidden_sizes[layer][0] != "Sequential"):
            # Find the metrics for the current layer in sorted_layers
            _, metrics = next((l, m) for l, m in sorted_layers if l == layer)
            metricsSummary = ", ".join(f"{metric.capitalize()}: {np.mean(metrics.get(metric, 0))}" for metric in ["pearson_correlation", "cosine_similarity", "jaccard_similarity", "kendall_tau", "manhattan_distance", "euclidean_distance", "hamming_distance"] if metric in metrics)
            print(f"Layer {layer}, Type={hidden_sizes[layer][0]}, Size={hidden_sizes[layer][2]}", metricsSummary)
            parameters += hidden_sizes[layer][2]
    print("Total Parameters: ", parameters, "/", np.sum(np.array([layer[1] for layer in hidden_sizes])))

#    return summarized_metrics, sorted_layers

test = aggregateLayerMetricsByAllMeans("Evaluation", 5, evalDf)
test = aggregateLayerMetricsByAllMeans("GeneratedEvaluation", 5, generatedEvalDf)




--- Sorted Layers by Custom Order of Metrics for Evaluation ---
Layer 0, Type=Embedding
    Pearson_correlation: 0.06673792982433127
    Cosine_similarity: 0.06669346209916778
    Jaccard_similarity: 0.9968832419127169
    Kendall_tau: 0.016061861012238367
    Manhattan_distance: 526.5391238363135
    Euclidean_distance: 23.88650723877866
    Hamming_distance: 0.015261555604194444
Layer 1, Type=Embedding
    Pearson_correlation: nan
    Cosine_similarity: 0.0
    Jaccard_similarity: 1.0
    Kendall_tau: nan
    Manhattan_distance: 0.0
    Euclidean_distance: 0.0
    Hamming_distance: 0.0
Layer 30, Type=Linear
    Pearson_correlation: 0.9835901479209754
    Cosine_similarity: 0.9835901096558164
    Jaccard_similarity: 0.997205970326912
    Kendall_tau: 0.883199744561094
    Manhattan_distance: 1792.330246389165
    Euclidean_distance: 80.01409019319888
    Hamming_distance: 0.015261555604194444
Layer 31, Type=Sequential
    Pearson_correlation: 0.9835901479209754
    Cosine_similarity:




--- Sorted Layers by Custom Order of Metrics for GeneratedEvaluation ---
Layer 0, Type=Embedding
    Pearson_correlation: -0.005834112629211812
    Cosine_similarity: -0.005846338768775965
    Jaccard_similarity: 0.9966860230441312
    Kendall_tau: 0.017588534068189006
    Manhattan_distance: 553.0183856561443
    Euclidean_distance: 25.01737628093611
    Hamming_distance: 0.015261555604194444
Layer 1, Type=Embedding
    Pearson_correlation: nan
    Cosine_similarity: 0.0
    Jaccard_similarity: 1.0
    Kendall_tau: nan
    Manhattan_distance: 0.0
    Euclidean_distance: 0.0
    Hamming_distance: 0.0
Layer 28, Type=Linear
    Pearson_correlation: 0.973571716578931
    Cosine_similarity: 0.9749610186001
    Jaccard_similarity: 0.995422867355608
    Kendall_tau: 0.9745890960916215
    Manhattan_distance: 3092.3953343374496
    Euclidean_distance: 70.27570624184327
    Hamming_distance: 0.061105915593847614
Layer 30, Type=Linear
    Pearson_correlation: 0.9733009888322087
    Cosine_simi

# Testing Trainsamples vs. Testsamples

In [None]:
seed0TrainGeneratedEvaluationSentences = ['physical athleticism or physical dexterity, with major competitions admitting only sports meeting this definition.', 'classification as sports.', 'with others being done by hundreds.', 'this definition.', 'producing a champion.', ', producing a \\"tie\\" or \\"tie\\" or \\"tie\\", producing a \\"draw\\", in a single winner.', 'with others being done by hundreds.', 'this definition.', 'ensure one winner., with major competitions admitting only sports meeting this definition.', 'hundreds.']
seed0TestGeneratedEvaluationSentences = ['admitting only sports meeting this definition.', 'competitions admitting only sports meeting this definition.', 'by playoffs.', 'with others being done by hundreds.', 'others provide tie-breaking methods to ensure one winner.', 'competitions admitting only sports meeting this definition.', '., producing a single person with others being done by arranging games in a single person with others being done by hundreds., producing a single person with others provide enjoyment to spectators.', '. ensure one winner.', 'some cases, with different participant numbers, producing a tournament format, with different participant numbers, with others being done by arranging games in some cases, with others being done by arranging games in a tournament format in a single person with different participant numbers, with others provide tie-breaking methods to spectators... some cases, producing a single person with others provide.. provide.', 'classification as sports.']

In [None]:
trainDf = pq.read_table('./Data/identifiedClosestEvalSourcesTrainingSeed0.parquet').to_pandas(safe=False)
testDf = pq.read_table('./Data/identifiedClosestEvalSourcesTestSeed0.parquet').to_pandas(safe=False)

evaluationSamples = len(trainDf.index.unique())
trainSamples = len(trainDf['source'].unique())

In [None]:
for evaluation in range(evaluationSamples):
    blendActivations(evaluation, "Training", 5, trainDf)
    blendActivations(evaluation, "Test", 5, testDf)


--- Blended Activation Similarity Scores for Training-Sample0 ---
Kendall's Tau: 0.74
Spearman's Rho: nan
Cosine Similarity: 0.9776
Euclidean Distance: 225.1966
Manhattan Distance: 49999.2426
Jaccard Similarity: 0.9972
Hamming Distance: 0.0446
Pearson Correlation: 0.9771

--- Blended Activation Similarity Scores for Test-Sample0 ---
Kendall's Tau: 0.74
Spearman's Rho: nan
Cosine Similarity: 0.9777
Euclidean Distance: 225.0700
Manhattan Distance: 49970.2538
Jaccard Similarity: 0.9972
Hamming Distance: 0.0446
Pearson Correlation: 0.9771

--- Blended Activation Similarity Scores for Training-Sample1 ---
Kendall's Tau: 0.74
Spearman's Rho: nan
Cosine Similarity: 0.9776
Euclidean Distance: 225.1966
Manhattan Distance: 49999.2426
Jaccard Similarity: 0.9972
Hamming Distance: 0.0446
Pearson Correlation: 0.9771

--- Blended Activation Similarity Scores for Test-Sample1 ---
Kendall's Tau: 0.74
Spearman's Rho: nan
Cosine Similarity: 0.9776
Euclidean Distance: 225.1966
Manhattan Distance: 49999.2

In [None]:
test = aggregateLayerMetricsByAllMeans("Train", 5, trainDf)
test = aggregateLayerMetricsByAllMeans("Test", 5, testDf)




--- Sorted Layers by Custom Order of Metrics for Train ---
Layer 0, Type=Embedding
    Pearson_correlation: -0.013321016473944037
    Cosine_similarity: -0.013323179467305546
    Jaccard_similarity: 0.9967612986280171
    Kendall_tau: 0.024281272868548958
    Manhattan_distance: 719.1806125663627
    Euclidean_distance: 32.09258568130836
    Hamming_distance: 0.015261555604194444
Layer 1, Type=Embedding
    Pearson_correlation: nan
    Cosine_similarity: 0.0
    Jaccard_similarity: 1.0
    Kendall_tau: nan
    Manhattan_distance: 0.0
    Euclidean_distance: 0.0
    Hamming_distance: 0.0
Layer 30, Type=Linear
    Pearson_correlation: 0.9029472215466419
    Cosine_similarity: 0.9029472010687988
    Jaccard_similarity: 0.9983499040684969
    Kendall_tau: 0.7538914161396283
    Manhattan_distance: 292.3890816339978
    Euclidean_distance: 13.079640701274709
    Hamming_distance: 0.015261555604194444
Layer 31, Type=Sequential
    Pearson_correlation: 0.9029472215466419
    Cosine_similarit




--- Sorted Layers by Custom Order of Metrics for Test ---
Layer 0, Type=Embedding
    Pearson_correlation: -0.013321016473944037
    Cosine_similarity: -0.013323179467305546
    Jaccard_similarity: 0.9967612986280171
    Kendall_tau: 0.024281272868548958
    Manhattan_distance: 719.1806125663627
    Euclidean_distance: 32.09258568130836
    Hamming_distance: 0.015261555604194444
Layer 1, Type=Embedding
    Pearson_correlation: nan
    Cosine_similarity: 0.0
    Jaccard_similarity: 1.0
    Kendall_tau: nan
    Manhattan_distance: 0.0
    Euclidean_distance: 0.0
    Hamming_distance: 0.0
Layer 30, Type=Linear
    Pearson_correlation: 0.9029472215466419
    Cosine_similarity: 0.9029472010687988
    Jaccard_similarity: 0.9983499040684969
    Kendall_tau: 0.7538914161396283
    Manhattan_distance: 292.3890816339978
    Euclidean_distance: 13.079640701274709
    Hamming_distance: 0.015261555604194444
Layer 31, Type=Sequential
    Pearson_correlation: 0.9029472215466419
    Cosine_similarity

In [None]:
trainDf = pq.read_table('./Data/identifiedClosestGeneratedEvalSourcesTrainingSeed0.parquet').to_pandas(safe=False)
testDf = pq.read_table('./Data/identifiedClosestGeneratedEvalSourcesTestSeed0.parquet').to_pandas(safe=False)

evaluationSamples = len(trainDf.index.unique())
trainSamples = len(trainDf['source'].unique())

In [None]:
for evaluation in range(evaluationSamples):
    blendActivations(evaluation, "Training", 5, trainDf)
    blendActivations(evaluation, "Test", 5, testDf)


--- Blended Activation Similarity Scores for Training-Sample0 ---
Kendall's Tau: 0.78
Spearman's Rho: nan
Cosine Similarity: 0.9885
Euclidean Distance: 329.3123
Manhattan Distance: 81957.6245
Jaccard Similarity: 0.9945
Hamming Distance: 0.0447
Pearson Correlation: 0.9882

--- Blended Activation Similarity Scores for Test-Sample0 ---
Kendall's Tau: 0.79
Spearman's Rho: nan
Cosine Similarity: 0.9880
Euclidean Distance: 244.6394
Manhattan Distance: 59710.0419
Jaccard Similarity: 0.9960
Hamming Distance: 0.0447
Pearson Correlation: 0.9877

--- Blended Activation Similarity Scores for Training-Sample1 ---
Kendall's Tau: 0.77
Spearman's Rho: nan
Cosine Similarity: 0.9859
Euclidean Distance: 212.9317
Manhattan Distance: 50189.9565
Jaccard Similarity: 0.9963
Hamming Distance: 0.0447
Pearson Correlation: 0.9856

--- Blended Activation Similarity Scores for Test-Sample1 ---
Kendall's Tau: 0.78
Spearman's Rho: nan
Cosine Similarity: 0.9892
Euclidean Distance: 375.4850
Manhattan Distance: 93424.8

In [None]:
test = aggregateLayerMetricsByAllMeans("Train", 5, trainDf)
test = aggregateLayerMetricsByAllMeans("Test", 5, testDf)




--- Sorted Layers by Custom Order of Metrics for Train ---
Layer 0, Type=Embedding
    Pearson_correlation: -0.004689984193353045
    Cosine_similarity: -0.004696406564585741
    Jaccard_similarity: 0.9964562291038723
    Kendall_tau: 0.003336207996995548
    Manhattan_distance: 688.8366447921867
    Euclidean_distance: 31.13431802663941
    Hamming_distance: 0.015261555604194444
Layer 1, Type=Embedding
    Pearson_correlation: nan
    Cosine_similarity: 0.0
    Jaccard_similarity: 1.0
    Kendall_tau: nan
    Manhattan_distance: 0.0
    Euclidean_distance: 0.0
    Hamming_distance: 0.0
Layer 30, Type=Linear
    Pearson_correlation: 0.9372632784359972
    Cosine_similarity: 0.9372629978390595
    Jaccard_similarity: 0.9986894000223476
    Kendall_tau: 0.8027587578404332
    Manhattan_distance: 229.96830722969145
    Euclidean_distance: 10.397268456596214
    Hamming_distance: 0.015263545376763438
Layer 31, Type=Sequential
    Pearson_correlation: 0.9372632784359972
    Cosine_similari




--- Sorted Layers by Custom Order of Metrics for Test ---
Layer 0, Type=Embedding
    Pearson_correlation: -0.005905855675768805
    Cosine_similarity: -0.005912704143366351
    Jaccard_similarity: 0.9964509635520533
    Kendall_tau: -0.00586307306983533
    Manhattan_distance: 693.4668223240162
    Euclidean_distance: 31.114497691520018
    Hamming_distance: 0.015263545376763438
Layer 1, Type=Embedding
    Pearson_correlation: nan
    Cosine_similarity: 0.0
    Jaccard_similarity: 1.0
    Kendall_tau: nan
    Manhattan_distance: 0.0
    Euclidean_distance: 0.0
    Hamming_distance: 0.0
Layer 30, Type=Linear
    Pearson_correlation: 0.9508142093795439
    Cosine_similarity: 0.9508139224434581
    Jaccard_similarity: 0.9987528865011719
    Kendall_tau: 0.8358010127281867
    Manhattan_distance: 212.46560704695094
    Euclidean_distance: 9.541328343282302
    Hamming_distance: 0.015265535149332432
Layer 31, Type=Sequential
    Pearson_correlation: 0.9508142093795439
    Cosine_similarit

# Test for best sentence split

In [None]:
!pip install -q stanza
import stanza
# Suppress logging from stanza
nlp = stanza.Pipeline('en', verbose=False)
stanza.download('en', verbose=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import timeit

# Patterns for data cleaning
patterns = [
    (re.compile(r'\('), ''),  # Remove open brackets
    (re.compile(r'\)'), ''),  # Remove close brackets
    (re.compile(r"=.*="), ''),  # Remove headings
    (re.compile(r"<unk>"), ''),  # Remove unknown tokens
    (re.compile(r"-"), ' '),  # Exchange hyphens for spaces
    (re.compile(r"[^\w.' ]"), ''),  # Remove non-alphanumeric, except specific symbols
]

def load_data(filepath):
    try:
        with open(filepath, 'r') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: The file {filepath} was not found.")
        return ""
    except IOError as e:
        print(f"Error reading the file {filepath}: {e}")
        return ""

def create_sources(data):
    heading_pattern = r'(?<!\= )= [^=]+ =(?!= )'

    with open(data, 'r') as file:
        lines = file.readlines()

        titles, sources = [], []
        sourceStartLine = ""
        titleLine = ""
        source = []
        lastSource = ""
        count = 0
        title = False

        for line in lines:
            #if ((" = " in titleLine and bool(re.search(heading_pattern, titleLine)))):
            #   print(f"FOUND!!!\n.SourceStart.{sourceStartLine}.Title.{titleLine}.Line.{line}")
            if (sourceStartLine == "\" \n" and (" = " in titleLine and bool(re.search(heading_pattern, titleLine))) and line == " \n"):
                title = titleLine.replace('= ', '').replace(' =', '').replace(' <unk> ', '').replace('\n', '')
                titles.append(title)
                tempSource = ("").join(source)
                for pattern, replacement in patterns:
                    tempSource = pattern.sub(replacement, tempSource)
                if(lastSource != ""):
                    sources.append(lastSource)
                lastSource = tempSource
                source = []
                count += 1

            #title = (sourceStartLine == "\" \n" and (" = " in titleLine and bool(re.search(heading_pattern, titleLine))) and line == " \n")
            source.append(line)
            # if(count < 100):
            #     print(sourceStartLine == "\"", " = " in titleLine, bool(re.search(heading_pattern, titleLine)), line == "")
            #     print(count, title, sourceStartLine, titleLine, line)
            sourceStartLine = titleLine
            titleLine = line

    sources = sources[1:]

    print(titles)

    # Print the adjusted titles
    # for source, title in zip(sources, titles):
    #     print(f"Title: {title}")
    #     print(f"SourceStart: {source[:50]}")
    #     print(f"SourceEnd: {source[-50:]}\n")

    return titles, sources

def create_source_structure(data, name, titles, sources, sentenceChecker):
    source_structure = []
    found_sentences_count = 0

    # Write sources to the output file
    with open(f"{name}_sources.txt", 'w', encoding='utf-8') as f:
        for source_number, (title, source) in enumerate(zip(titles, sources)):
            print(f"Currently at source number: {source_number}/{len(titles)} ({title})")
            f.write(f"{title}:\n")  # Write the title
            source_structure.append([])

            # Split data into sentences and sequences
            sentences, found_sentences, _ = split_data(source, sentenceChecker)
            found_sentences_count += len(found_sentences)

            #_, sequences = create_sequences(sentences)

            for sentence_number, sentence in enumerate(sentences):
                # Append the sequence to the source structure
                source_structure[source_number].append(sentence)
                # Write to file
                f.write(f"[{source_number}:{sentence_number}] {sentence}\n")

            f.write('\n')  # Add a newline after each title for better readability

    return titles, sources, source_structure, found_sentences_count

def split_data(data, sentenceChecker, num_sentences=-1):
    found_sentences = sentenceChecker(data) if num_sentences == -1 else sentenceChecker(data)[:num_sentences]
    if sentenceChecker == nlp:
        found_sentences = [sentence.text for sentence in found_sentences.sentences]
    filtered_sentences = [sentence for sentence in found_sentences if bool(re.search(r'\.\s*$', sentence)) and len(sentence) > 3] #Make sure the sentence ends with a point
    words = sorted({word for sent in filtered_sentences for word in sent.split()})
    words.insert(0, "")  # Add an empty string for padding
    return filtered_sentences, found_sentences, words

train_text = load_data("./train.txt")
#print(train_text[:10000])
train_titles, train_sources = create_sources("./train.txt")
start_time = timeit.default_timer()
titles, sources, nlp_train_source_structure, nlp_train_found = create_source_structure(train_text, "train-nlp", train_titles, train_sources, nlp)
nlp_time = timeit.default_timer() - start_time
print(f"NLP Train Sentence Creation Time: {nlp_time:.2f} seconds\n")

test_text = load_data("./test.txt")
test_titles, test_sources = create_sources("./test.txt")
start_time = timeit.default_timer()
titles, sources, nlp_test_source_structure, nlp_test_found = create_source_structure(test_text, "test-nlp", test_titles, test_sources, nlp)
nlp_time = timeit.default_timer() - start_time
print(f"NLP Test Sentence Creation Time: {nlp_time:.2f} seconds")

[' 2013 – 14 York City F.C. season ', ' Big Boy ( song ) ', ' The Remix ( Lady Gaga album ) ', " New Year 's Eve ( Up All Night ) ", ' Geopyxis carbonaria ', ' Cyclone Graham ', ' M @-@ 108 ( Michigan highway ) ', ' Simon Bradstreet ', ' Ghost in the Shell : Stand Alone Complex - Solid State Society ', ' Tropical Storm Jose ( 2005 ) ', ' California State Route 243 ', ' Dave Sisler ', " Gambia women 's national football team ", ' Katherine Pulaski ', ' Thom Darden ', ' Vitamin D ( Glee ) ', ' Fastra II ', " Livin ' the Dream ", ' HMS Marlborough ( 1912 ) ', ' The Sixth Extinction ', ' Greens Ledge Light ', ' Hannah Primrose , Countess of Rosebery ', ' Mycena galericulata ', ' Chagas disease ', ' LiSA ( Japanese musician , born 1987 ) ', ' Sorraia ', ' Varanasi ', ' Missouri River ', ' Magadheera ', ' HMS Boreas () ', ' Cape lobster ', ' Kitsune ', ' Paul Thomas Anderson ', ' General aviation in the United Kingdom ', ' Gaboon viper ', ' Lloyd Mathews ', ' Jeremi Wiśniowiecki ', ' 1939 Pa

In [None]:
import sys
from subprocess import run
sys.path.append('/tf/.local/lib/python3.11/site-packages')
run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"], check=True)
run([sys.executable, "-m", "pip", "install", "-q", "nltk"], check=True)
import nltk
nltk.download('punkt_tab')
from nltk import word_tokenize, sent_tokenize

start_time = timeit.default_timer()
titles, sources, nltk_train_source_structure, nltk_train_found = create_source_structure(train_text, "train-nltk", train_titles, train_sources, nltk.sent_tokenize)
nltk_time = timeit.default_timer() - start_time
print(f"NLTK Train Sentence Creation Time: {nltk_time:.2f} seconds\n")

start_time = timeit.default_timer()
titles, sources, nltk_test_source_structure, nltk_test_found = create_source_structure(test_text, "test-nltk", test_titles, test_sources, nltk.sent_tokenize)
nltk_time = timeit.default_timer() - start_time
print(f"NLTK Test Sentence Creation Time: {nltk_time:.2f} seconds")

Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package punkt_tab to /tf/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Currently at source number: 0/591 ( 2013 – 14 York City F.C. season )
Currently at source number: 1/591 ( Big Boy ( song ) )
Currently at source number: 2/591 ( The Remix ( Lady Gaga album ) )
Currently at source number: 3/591 ( New Year 's Eve ( Up All Night ) )
Currently at source number: 4/591 ( Geopyxis carbonaria )
Currently at source number: 5/591 ( Cyclone Graham )
Currently at source number: 6/591 ( M @-@ 108 ( Michigan highway ) )
Currently at source number: 7/591 ( Simon Bradstreet )
Currently at source number: 8/591 ( Ghost in the Shell : Stand Alone Complex - Solid State Society )
Currently at source number: 9/591 ( Tropical Storm Jose ( 2005 ) )
Currently at source number: 10/591 ( California State Route 243 )
Currently at source number: 11/591 ( Dave Sisler )
Currently at source number: 12/591 ( Gambia women 's national football team )
Currently at source number: 13/591 ( Katherine Pulaski )
Currently at source number: 14/591 ( Thom Darden )
Currently at source number: 15

In [None]:
print("Train-NLP:", [sum(len(sentences) for sentences in nlp_train_source_structure)], "Found:", nlp_train_found)
print("Test-NLP:", [sum(len(sentences) for sentences in nlp_test_source_structure)], "Found:", nlp_test_found)
print("Train-NLTK:", [sum(len(sentences) for sentences in nltk_train_source_structure)], "Found:", nltk_train_found)
print("Test-NLTK:", [sum(len(sentences) for sentences in nltk_test_source_structure)], "Found:", nltk_test_found)

Train-NLP: [71412] Found: 73039
Test-NLP: [7156] Found: 7320
Train-NLTK: [76561] Found: 76900
Test-NLTK: [7834] Found: 7869
