In [1]:
import numpy as np
from scipy.spatial import distance
import random
import pandas as pd
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
import time
import os

In [25]:
def merge_lists(list1, list2):
    """Pair up the elements of two equally long sequences.

    Args:
        list1: First sequence; supplies the first component of every pair.
        list2: Second sequence; supplies the second component of every pair.

    Returns:
        A list of 2-tuples ``(list1[i], list2[i])``, one per index. Two empty
        inputs yield an empty list.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    # The previous guard only checked that list1 was non-empty, so a longer
    # list2 was silently truncated and a shorter one surfaced as IndexError.
    if len(list1) != len(list2):
        raise ValueError("All input lists must have the same length")
    return list(zip(list1, list2))


In [26]:
def generate_normal_data(num_of_points, miu=[], sigma=[]):
    """Draw normal samples per dimension and pair up the first two dimensions.

    For every entry of ``miu`` an array of ``num_of_points`` samples is drawn
    with ``np.random.normal`` using the matching entry of ``sigma``. Only the
    first two arrays are combined, via ``merge_lists``, into the returned list
    of 2-tuples; any further dimensions are drawn but not returned.
    """
    samples_per_dim = [
        np.random.normal(miu[dim], sigma[dim], num_of_points)
        for dim in range(len(miu))
    ]
    first_dim, second_dim = samples_per_dim[0], samples_per_dim[1]
    return merge_lists(first_dim, second_dim)

In [27]:
def generate_list_labled(num_of_systems, num_of_points, miu=[], sigma=[]):
    """Generate ``num_of_systems`` independent samples with shared parameters.

    Each entry of the returned list is the result of one call to
    ``generate_normal_data(num_of_points, miu, sigma)``.
    """
    return [
        generate_normal_data(num_of_points, miu, sigma)
        for _ in range(num_of_systems)
    ]

In [28]:
def merge_list(list_of_list):
    """Flatten a list of lists and drop duplicate elements.

    The elements must be hashable. The result goes through a ``set``, so the
    order of the returned list is not the order of first appearance.
    """
    flattened = [item for sublist in list_of_list for item in sublist]
    return list(set(flattened))

In [29]:
def density_calc(list_base, list_point):
    """Compute the relative frequency of every distinct base point.

    Args:
        list_base: Hashable candidate points; duplicates are collapsed with
            ``set`` so the iteration order is the set's order.
        list_point: Observed points whose frequencies are measured.

    Returns:
        A pair ``(points, densities)``:
        * ``points`` holds the distinct base points that occur in
          ``list_point``.
        * ``densities`` holds one value per distinct base point:
          ``count / len(list_point)`` for present points and ``0`` otherwise.
        The two lists therefore only have equal length when every base point
        occurs in ``list_point``.
    """
    from collections import Counter

    n = len(list_point)
    distinct_base = list(set(list_base))
    try:
        # Single pass over the observations instead of an `in` test plus a
        # `.count()` scan for every base point.
        occurrences = Counter(list_point)
    except TypeError:
        # Unhashable observations cannot be counted by hash; fall back to
        # the linear scan per base point.
        occurrences = {point: list_point.count(point) for point in distinct_base}

    list_point_final = []
    list_density = []
    for point in distinct_base:
        hits = occurrences.get(point, 0)
        if hits:
            list_density.append(hits / n)
            list_point_final.append(point)
        else:
            list_density.append(0)
    return list_point_final, list_density

In [30]:
def density_calc_list(list_of_list, list_base):
    """Build one density column vector per observation list.

    For every list in ``list_of_list`` the densities returned by
    ``density_calc(list_base, observations)`` are wrapped into a 2-D array and
    transposed, giving an array with a single column.
    """
    return [
        np.array([density_calc(list_base, observations)[1]]).transpose()
        for observations in list_of_list
    ]

In [31]:
def calculate_euclidean_distance_matrix(list1, list2):
    """Return the pairwise distance matrix between two point collections.

    Both inputs are converted to 2-D arrays (rows are points). The result of
    ``distance.cdist`` with its default metric has one row per point of
    ``list1`` and one column per point of ``list2``.

    Raises:
        ValueError: If the points of the two inputs differ in dimensionality.
    """
    first, second = np.array(list1), np.array(list2)
    dims_first, dims_second = first.shape[1], second.shape[1]
    if dims_first == dims_second:
        return distance.cdist(first, second)
    raise ValueError("Input arrays must have the same number of dimensions")


In [32]:
def calculate_exponential_matrix(distance_matrix, lamb):
    """Return ``exp(-distance_matrix / lamb)`` evaluated element-wise."""
    scaled_negative = -distance_matrix / lamb
    return np.exp(scaled_negative)


In [33]:
def create_blank_dataset_with_metadata(m):
    """Create an empty DataFrame with the bookkeeping columns for ``m`` systems.

    The columns are, in order: ``'system num'``, ``'data points'``, one column
    per system named ``'0'`` .. ``str(m - 1)``, and finally ``'label'``.
    """
    column_names = ['system num', 'data points']
    column_names.extend(f'{idx}' for idx in range(m))
    column_names.append(f'label')
    return pd.DataFrame({name: [] for name in column_names})


In [34]:
def fill_dataset_with_records(dataset, records):
    """Append ``records`` (an iterable of dicts) as new rows of ``dataset``.

    Args:
        dataset: DataFrame to extend; it is not modified.
        records: Iterable of dicts, one per new row. Keys that are not yet
            columns of ``dataset`` become new columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, or ``dataset`` itself when
        ``records`` is empty.
    """
    records = list(records)
    if not records:
        return dataset
    # Concatenate once: growing the frame row by row copies it on every
    # iteration and is quadratic in the number of records.
    return pd.concat([dataset, pd.DataFrame(records)], ignore_index=True)

In [35]:
def make_record(list_of_list, list_p):
    """Build one record dict per system.

    The record at position ``idx`` has the keys ``'system num'`` (the
    position), ``'data points'`` (``list_of_list[idx]``) and ``'p'``
    (``list_p[idx]``).
    """
    return [
        {'system num': idx, 'data points': data_points, 'p': list_p[idx]}
        for idx, data_points in enumerate(list_of_list)
    ]

In [36]:
def condensed_creator(arr):
    """Return the strict upper triangle of a square array as a flat list.

    Elements are taken row by row, excluding the diagonal (``k=1``), which is
    the layout of a condensed distance vector.
    """
    size = arr.shape[0]
    row_idx, col_idx = np.triu_indices(size, k=1)
    return arr[row_idx, col_idx].tolist()

In [37]:
def plot_dendrogram(df, save_file=False):
    """Draw a complete-linkage dendrogram from the distance columns of ``df``.

    The columns ``'0'`` .. ``str(len(df) - 1)`` are read as a matrix, missing
    entries are replaced by 0, and the matrix is symmetrised by adding its
    transpose before the diagonal is zeroed.

    Args:
        df: DataFrame holding the pairwise-distance columns.
        save_file: When true, the figure is also written to
            ``/results/plot_dendrogram.png``.
    """
    system_columns = list(map(str, range(len(df))))
    one_sided = df[system_columns].fillna(0).values
    symmetric = one_sided + one_sided.transpose()
    np.fill_diagonal(symmetric, 0)

    condensed = condensed_creator(symmetric)
    tree = linkage(condensed, method='complete')

    plt.figure(figsize=(10, 7))
    # A threshold of -inf draws every link in the single "above threshold" colour.
    dendrogram(tree, color_threshold=-np.inf, above_threshold_color='gray')
    plt.xlabel('Systems', fontsize=18, labelpad=20)
    plt.xticks([])  # hide the per-leaf tick labels
    plt.ylabel('Distance', fontsize=18)

    if save_file:
        plt.savefig('/results/plot_dendrogram.png', format='png', dpi=1000)
    plt.show()

In [38]:
def silhouette_score_agglomerative(df):
    """Silhouette score of the agglomerative clustering for each cluster count.

    The distance columns ``'0'`` .. ``str(len(df) - 1)`` are symmetrised
    (matrix plus transpose, NaN treated as 0), shifted by the minimum of the
    one-sided matrix, and given a zero diagonal. For every cluster count
    ``k`` in ``2 .. len(df) - 1`` the labels from ``cluster_list_creator`` are
    scored against that precomputed distance matrix.

    Returns:
        List of scores; entry ``0`` belongs to ``k = 2``.
    """
    n_systems = len(df)
    one_sided = df[[str(col) for col in range(n_systems)]].fillna(0).values
    shifted = (one_sided + one_sided.transpose()) - np.min(one_sided)
    np.fill_diagonal(shifted, 0)
    return [
        silhouette_score(shifted, cluster_list_creator(df, k), metric='precomputed')
        for k in range(2, n_systems)
    ]

In [39]:
def entropy(matrix):
    """Return ``-sum(x * log(x))`` over the strictly positive entries.

    Zero and negative entries are skipped, so they contribute nothing to the
    sum. The natural logarithm is used.
    """
    values = np.array(matrix)
    positive = values[values > 0]
    weighted_logs = positive * np.log(positive)
    return -np.sum(weighted_logs)

In [40]:
def cluster_list_creator(df, num_of_clusters):
    """Assign every system a cluster index from a complete-linkage hierarchy.

    The distance columns ``'0'`` .. ``str(len(df) - 1)`` are symmetrised,
    min-max scaled with the extrema of the one-sided matrix, and clustered
    with complete linkage. The hierarchy is then cut by discarding the
    ``num_of_clusters - 1`` most recent merges; each system is labelled with
    the position of the first remaining group that contains it.

    Returns:
        List of integer labels, one per system. The labels are positions in
        the internal group list and are not necessarily consecutive.
    """
    n_systems = len(df)
    system_columns = [str(col) for col in range(n_systems)]
    one_sided = df[system_columns].fillna(0).values
    symmetric = one_sided + one_sided.transpose()

    lowest, highest = np.min(one_sided), np.max(one_sided)
    rescaled = (symmetric - lowest) / (highest - lowest)
    np.fill_diagonal(rescaled, 0)
    tree = linkage(condensed_creator(rescaled), method='complete')

    # groups[k] lists the member systems of node k: singletons first, then one
    # entry per merge in the order the linkage matrix reports them.
    groups = [[system] for system in range(n_systems)]
    for merge in tree:
        left, right = int(merge[0]), int(merge[1])
        groups.append(groups[left] + groups[right])

    # Most recent merges first; dropping the leading entries cuts the tree.
    candidates = groups[::-1][num_of_clusters - 1:]

    labels = []
    for system in range(n_systems):
        for position, members in enumerate(candidates):
            if system in members:
                labels.append(position)
                break
    return labels

In [41]:

def calculate_OT_cost(p, q, reg, cost_matrix, num_iterations, stop_threshold):
    """Compute an entropy-regularised transport plan by Sinkhorn scaling.

    Despite its name, this function returns the transport *plan*; callers
    obtain the cost as ``(plan * cost_matrix).sum()``.

    Args:
        p: Source weights, one per row of ``cost_matrix``.
        q: Target weights, one per column of ``cost_matrix``.
        reg: Regularisation strength; the kernel is ``exp(-cost_matrix / reg)``.
        cost_matrix: 2-D array of pairwise costs.
        num_iterations: Maximum number of scaling iterations.
        stop_threshold: Iteration stops early once the column scaling vector
            changes by less than this amount (Euclidean norm).

    Returns:
        2-D array with the shape of ``cost_matrix``: ``diag(u) K diag(v)``.
    """
    source = np.array([p]).T
    target = np.array([q]).T
    kernel = np.exp(-cost_matrix / reg)

    v = np.ones((kernel.shape[1], 1))
    v_previous = v
    for _ in range(num_iterations):
        v = target / (kernel.T @ (source / (kernel @ v)))
        if np.linalg.norm(v - v_previous) < stop_threshold:
            break
        v_previous = v
    u = source / (kernel @ v)

    # diag(u) @ K @ diag(v) written with broadcasting: same entries without
    # materialising two dense diagonal matrices and two full matmuls.
    return u * kernel * v.T

In [42]:
def fill_ot_distance(df, num_of_iterations, lambda_pen, stop_threshold):
    """Fill the pairwise transport-cost columns of ``df`` in place.

    For every pair ``j <= i`` the cost between the point sets of rows ``i``
    and ``j`` is stored at ``df.at[j, str(i)]``, so only one triangle
    (including the diagonal) of the table is written.

    Args:
        df: DataFrame with a ``'data points'`` column (point collections), a
            ``'p'`` column (matching weights) and one column per row index
            named ``'0'``, ``'1'``, ...
        num_of_iterations: Iteration cap forwarded to ``calculate_OT_cost``.
        lambda_pen: Regularisation strength forwarded to ``calculate_OT_cost``.
        stop_threshold: Convergence threshold forwarded to
            ``calculate_OT_cost``.

    Returns:
        None; ``df`` is modified in place.
    """
    # Look the two input columns up once instead of on every inner iteration.
    point_sets = df['data points']
    weights = df['p']
    for i in range(len(df)):
        for j in range(i + 1):
            cost_matrix = distance.cdist(point_sets[i], point_sets[j])
            plan = calculate_OT_cost(weights[i], weights[j], lambda_pen,
                                     cost_matrix, num_of_iterations, stop_threshold)
            df.at[j, str(i)] = np.multiply(plan, cost_matrix).sum()

In [43]:
def normalize_tuples(list_of_lists, min_values=None, max_values=None):
    """Min-max scale every tuple with fixed per-dimension bounds.

    The bounds are NOT derived from the input: unless overridden they are the
    hard-coded constants below, which cover at most five dimensions.
    NOTE(review): presumably these are global extrema of the simulation
    outputs -- confirm their provenance.

    Args:
        list_of_lists: List of lists of equally long numeric tuples; the
            dimensionality is read from the first tuple of the first list.
        min_values: Optional per-dimension lower bounds.
        max_values: Optional per-dimension upper bounds.

    Returns:
        ``(normalized, mins, maxs)`` where ``normalized`` mirrors the nesting
        of the input with every component mapped to
        ``(x - min) / (max - min)``, and ``mins`` / ``maxs`` are the bounds as
        NumPy arrays.

    Raises:
        IndexError: If the input is empty, or the tuples have more dimensions
            than the bounds provide.
    """
    num_dimensions = len(list_of_lists[0][0])

    if min_values is None:
        min_values = [0, 0, 0, 0, 0]
    if max_values is None:
        max_values = [118.64663376283197, 207.74906132608058, 240.8497975717347,
                      193.50111580201818, 250.76994761777246]
    print(min_values)
    print(max_values)

    dims = range(num_dimensions)
    normalized_list_of_lists = [
        [
            tuple((t[j] - min_values[j]) / (max_values[j] - min_values[j]) for j in dims)
            for t in sublist
        ]
        for sublist in list_of_lists
    ]
    return normalized_list_of_lists, np.array(min_values), np.array(max_values)

### Simulation output analysis

In [44]:
from scipy.cluster import hierarchy

# Every line of the file is evaluated into one system's list of output tuples.
# SECURITY NOTE(review): eval() executes arbitrary code found in the file --
# only run this on trusted data. ast.literal_eval would be a safer parser if
# every line is a plain Python literal (confirm before switching).
with open(f'../../data/my_list_true_0.txt', 'r') as f:
    list_sim_outputs_raw = [eval(line.strip()) for line in f]

points = np.loadtxt('../../data/points.txt')
input_list = [tuple(point) for point in points]
list_base = merge_list(list_sim_outputs_raw)

# Distinct output points of every system and their relative frequencies.
# density_calc is evaluated once per system (it used to run twice).
list_sim_outputs = []
p_list = []
for raw_outputs in list_sim_outputs_raw:
    support, probabilities = density_calc(raw_outputs, raw_outputs)
    list_sim_outputs.append(support)
    p_list.append(probabilities)

# One call instead of three: the repeated calls redid the same work and
# printed the bounds three times.
normalized_list_sim_outputs, min_norm_values, max_norm_values = normalize_tuples(list_sim_outputs)

m = len(normalized_list_sim_outputs)
blank_df = create_blank_dataset_with_metadata(m)
df = fill_dataset_with_records(blank_df, make_record(normalized_list_sim_outputs, p_list))
df['data points real'] = list_sim_outputs
df['input points'] = input_list

# Pairwise transport costs between all systems (timed).
min_time = time.time()
lambda_value = 0.5
fill_ot_distance(df, 1000, lambda_value, stop_threshold=10**-9)
max_time = time.time()
# print(max_time - min_time)

# Silhouette score for every candidate cluster count; entry 0 belongs to k = 2.
y_values = silhouette_score_agglomerative(df)
chosen_cluster = y_values.index(max(y_values)) + 2
x_values = [i + 2 for i in range(len(y_values))]  # cluster counts matching y_values

# Rebuild the symmetric distance matrix from the one-sided columns of df.
columns_to_filter = [str(i) for i in range(len(df))]
df_filter = df[columns_to_filter]
filled_df = df_filter.fillna(0)
matrix = filled_df.values
diagonal = np.diagonal(matrix)
matrix_final = matrix + matrix.transpose()
print(f'chosen cluster is {chosen_cluster}')

# The diagonal was added twice above; a distance matrix needs zeros there.
np.fill_diagonal(matrix_final, 0)

# Condensed (strict upper triangle) form expected by scipy's linkage.
upper_triangle_flat = matrix_final[np.triu_indices_from(matrix_final, k=1)]
Z = hierarchy.linkage(upper_triangle_flat, method='complete')
n_clusters = chosen_cluster

# Cut the hierarchy into at most n_clusters flat clusters.
clusters = hierarchy.fcluster(Z, n_clusters, criterion='maxclust')

df['cluster'] = clusters
with open(f'../../data/clusters_output_true_0.txt', 'w') as f:
    for cluster in clusters:
        f.write(f"{cluster}\n")

KeyboardInterrupt: 

In [None]:
from scipy.cluster import hierarchy

for staff_lvl in range(100):
    # Every line is evaluated into one system's list of output tuples; only
    # the first 15 outputs of each system are kept.
    # SECURITY NOTE(review): eval() executes arbitrary code found in the file
    # -- only run this on trusted data.
    with open(f'../../data/my_list_{staff_lvl}.txt', 'r') as f:
        list_sim_outputs_raw = [eval(line.strip())[:15] for line in f]

    points = np.loadtxt('../../data/points.txt')
    input_list = [tuple(point) for point in points]
    list_base = merge_list(list_sim_outputs_raw)

    # Distinct output points per system and their relative frequencies;
    # density_calc is evaluated once per system (it used to run twice).
    list_sim_outputs = []
    p_list = []
    for raw_outputs in list_sim_outputs_raw:
        support, probabilities = density_calc(raw_outputs, raw_outputs)
        list_sim_outputs.append(support)
        p_list.append(probabilities)

    # One call instead of three (each call also prints the bounds).
    normalized_list_sim_outputs, min_norm_values, max_norm_values = normalize_tuples(list_sim_outputs)

    m = len(normalized_list_sim_outputs)
    blank_df = create_blank_dataset_with_metadata(m)
    df = fill_dataset_with_records(blank_df, make_record(normalized_list_sim_outputs, p_list))
    df['data points real'] = list_sim_outputs
    df['input points'] = input_list

    # Pairwise transport costs between all systems (timed).
    min_time = time.time()
    lambda_value = 0.5
    fill_ot_distance(df, 1000, lambda_value, stop_threshold=10**-9)
    max_time = time.time()
    # print(max_time - min_time)

    # Silhouette score per candidate cluster count; entry 0 belongs to k = 2.
    y_values = silhouette_score_agglomerative(df)
    chosen_cluster = y_values.index(max(y_values)) + 2
    x_values = [i + 2 for i in range(len(y_values))]  # cluster counts matching y_values

    # Rebuild the symmetric distance matrix from the one-sided columns of df.
    columns_to_filter = [str(i) for i in range(len(df))]
    df_filter = df[columns_to_filter]
    filled_df = df_filter.fillna(0)
    matrix = filled_df.values
    diagonal = np.diagonal(matrix)
    matrix_final = matrix + matrix.transpose()
    print(f'chosen cluster is {chosen_cluster}')

    # The diagonal was added twice above; a distance matrix needs zeros there.
    np.fill_diagonal(matrix_final, 0)

    # Condensed (strict upper triangle) form expected by scipy's linkage.
    upper_triangle_flat = matrix_final[np.triu_indices_from(matrix_final, k=1)]
    Z = hierarchy.linkage(upper_triangle_flat, method='complete')
    n_clusters = chosen_cluster

    # Cut the hierarchy into at most n_clusters flat clusters.
    clusters = hierarchy.fcluster(Z, n_clusters, criterion='maxclust')

    df['cluster'] = clusters
    with open(f'../../data/clusters_output_{staff_lvl}.txt', 'w') as f:
        for cluster in clusters:
            f.write(f"{cluster}\n")

In [None]:
from scipy.cluster import hierarchy

for staff_lvl in range(100):
    print(staff_lvl)
    # Every line is evaluated into one system's list of output tuples; only
    # the first 40 outputs of each system are kept.
    # SECURITY NOTE(review): eval() executes arbitrary code found in the file
    # -- only run this on trusted data.
    with open(f'../../data/my_list_independent_{staff_lvl}.txt', 'r') as f:
        list_sim_outputs_raw = [eval(line.strip())[:40] for line in f]

    points = np.loadtxt('../../data/points.txt')
    input_list = [tuple(point) for point in points]
    list_base = merge_list(list_sim_outputs_raw)

    # Distinct output points per system and their relative frequencies;
    # density_calc is evaluated once per system (it used to run twice).
    list_sim_outputs = []
    p_list = []
    for raw_outputs in list_sim_outputs_raw:
        support, probabilities = density_calc(raw_outputs, raw_outputs)
        list_sim_outputs.append(support)
        p_list.append(probabilities)

    # One call instead of three (each call also prints the bounds).
    normalized_list_sim_outputs, min_norm_values, max_norm_values = normalize_tuples(list_sim_outputs)

    m = len(normalized_list_sim_outputs)
    blank_df = create_blank_dataset_with_metadata(m)
    df = fill_dataset_with_records(blank_df, make_record(normalized_list_sim_outputs, p_list))
    df['data points real'] = list_sim_outputs
    df['input points'] = input_list

    # Pairwise transport costs between all systems (timed).
    min_time = time.time()
    lambda_value = 0.5
    fill_ot_distance(df, 1000, lambda_value, stop_threshold=10**-9)
    max_time = time.time()
    # print(max_time - min_time)

    # Silhouette score per candidate cluster count; entry 0 belongs to k = 2.
    y_values = silhouette_score_agglomerative(df)
    chosen_cluster = y_values.index(max(y_values)) + 2
    x_values = [i + 2 for i in range(len(y_values))]  # cluster counts matching y_values

    # Rebuild the symmetric distance matrix from the one-sided columns of df.
    columns_to_filter = [str(i) for i in range(len(df))]
    df_filter = df[columns_to_filter]
    filled_df = df_filter.fillna(0)
    matrix = filled_df.values
    diagonal = np.diagonal(matrix)
    matrix_final = matrix + matrix.transpose()
    print(f'chosen cluster is {chosen_cluster}')

    # The diagonal was added twice above; a distance matrix needs zeros there.
    np.fill_diagonal(matrix_final, 0)

    # Condensed (strict upper triangle) form expected by scipy's linkage.
    upper_triangle_flat = matrix_final[np.triu_indices_from(matrix_final, k=1)]
    Z = hierarchy.linkage(upper_triangle_flat, method='complete')
    n_clusters = chosen_cluster

    # Cut the hierarchy into at most n_clusters flat clusters.
    clusters = hierarchy.fcluster(Z, n_clusters, criterion='maxclust')

    df['cluster'] = clusters
    with open(f'../../data/clusters_output_independent_{staff_lvl}.txt', 'w') as f:
        for cluster in clusters:
            f.write(f"{cluster}\n")

In [45]:
# Load the numeric table from points.txt into the global `points`.
points = np.loadtxt('../../data/points.txt')

In [46]:
from scipy.cluster import hierarchy

# One transport cost per replication file, independent-sampling runs.
dist_ind = []
for staff_lvl in range(100):
    print(staff_lvl)
    # SECURITY NOTE(review): eval() executes arbitrary code found in the file
    # -- only run this on trusted data.
    with open(f'../../data/my_list_independent_{staff_lvl}.txt', 'r') as f:
        # Keep the first 40 outputs of every system, then only systems 6 and 72.
        list_sim_outputs_raw = [eval(line.strip())[:40] for line in f]
        list_sim_outputs_raw = [list_sim_outputs_raw[i] for i in [6, 72]]

    # NOTE(review): this takes the first two rows of points.txt, not rows 6
    # and 72 -- confirm that is intended.
    points = np.loadtxt('../../data/points.txt')[0:2]
    input_list = [tuple(point) for point in points]
    list_base = merge_list(list_sim_outputs_raw)

    # Distinct output points per system and their relative frequencies;
    # density_calc is evaluated once per system (it used to run twice).
    list_sim_outputs = []
    p_list = []
    for raw_outputs in list_sim_outputs_raw:
        support, probabilities = density_calc(raw_outputs, raw_outputs)
        list_sim_outputs.append(support)
        p_list.append(probabilities)

    # One call instead of three (each call also prints the bounds).
    normalized_list_sim_outputs, min_norm_values, max_norm_values = normalize_tuples(list_sim_outputs)

    m = len(normalized_list_sim_outputs)
    blank_df = create_blank_dataset_with_metadata(m)
    df = fill_dataset_with_records(blank_df, make_record(normalized_list_sim_outputs, p_list))
    df['data points real'] = list_sim_outputs
    df['input points'] = input_list

    min_time = time.time()
    lambda_value = 0.5
    fill_ot_distance(df, 1000, lambda_value, stop_threshold=10**-9)
    max_time = time.time()

    # NOTE(review): fill_ot_distance stores cost(i, j) at df.at[j, str(i)], so
    # df['0'][0] is the cost of the first selected system against itself; the
    # cost between the two selected systems is df['1'][0]. Confirm which one
    # this analysis is meant to collect.
    dist_ind.append(df['0'][0])



0
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
1
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
2
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.84979

[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
25
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
26
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.84979

[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
49
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
50
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.84979

[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
73
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
74
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.84979

[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
97
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
98
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.84979

In [47]:
from scipy.cluster import hierarchy

# One transport cost per replication file, runs stored as my_list_{k}.txt.
dist_crn = []
for staff_lvl in range(100):
    print(staff_lvl)
    # SECURITY NOTE(review): eval() executes arbitrary code found in the file
    # -- only run this on trusted data.
    with open(f'../../data/my_list_{staff_lvl}.txt', 'r') as f:
        # Keep the first 40 outputs of every system, then only systems 6 and 72.
        list_sim_outputs_raw = [eval(line.strip())[:40] for line in f]
        list_sim_outputs_raw = [list_sim_outputs_raw[i] for i in [6, 72]]

    # NOTE(review): this takes the first two rows of points.txt, not rows 6
    # and 72 -- confirm that is intended.
    points = np.loadtxt('../../data/points.txt')[0:2]
    input_list = [tuple(point) for point in points]
    list_base = merge_list(list_sim_outputs_raw)

    # Distinct output points per system and their relative frequencies;
    # density_calc is evaluated once per system (it used to run twice).
    list_sim_outputs = []
    p_list = []
    for raw_outputs in list_sim_outputs_raw:
        support, probabilities = density_calc(raw_outputs, raw_outputs)
        list_sim_outputs.append(support)
        p_list.append(probabilities)

    # One call instead of three (each call also prints the bounds).
    normalized_list_sim_outputs, min_norm_values, max_norm_values = normalize_tuples(list_sim_outputs)

    m = len(normalized_list_sim_outputs)
    blank_df = create_blank_dataset_with_metadata(m)
    df = fill_dataset_with_records(blank_df, make_record(normalized_list_sim_outputs, p_list))
    df['data points real'] = list_sim_outputs
    df['input points'] = input_list

    min_time = time.time()
    lambda_value = 0.5
    fill_ot_distance(df, 1000, lambda_value, stop_threshold=10**-9)
    max_time = time.time()

    # NOTE(review): fill_ot_distance stores cost(i, j) at df.at[j, str(i)], so
    # df['0'][0] is the cost of the first selected system against itself; the
    # cost between the two selected systems is df['1'][0]. Confirm which one
    # this analysis is meant to collect.
    dist_crn.append(df['0'][0])



0
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
1
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
2
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.84979

[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
25
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
26
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.84979

[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
49
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
50
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.84979

[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
73
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
74
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.84979

[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
97
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
98
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.8497975717347, 193.50111580201818, 250.76994761777246]
[0, 0, 0, 0, 0]
[118.64663376283197, 207.74906132608058, 240.84979

In [48]:
# Mean and (population) standard deviation of each sample, dist_ind first.
for sample in (dist_ind, dist_crn):
    sample_array = np.array(sample)
    print(np.mean(sample_array))
    print(np.std(sample_array))


0.06602404323590162
0.005681905195471589
0.06561631345001119
0.004614830876886161


In [54]:
# Mean and (population) variance of each sample, then the variance ratio
# dist_ind / dist_crn.
ind_array = np.array(dist_ind)
crn_array = np.array(dist_crn)
ind_variance = np.var(ind_array)
crn_variance = np.var(crn_array)

print(np.mean(ind_array))
print(ind_variance)
print(np.mean(crn_array))
print(crn_variance)
print(ind_variance / crn_variance)

0.06602404323590162
3.228404665032703e-05
0.06561631345001119
2.1296664022261897e-05
1.5159203627657256


In [55]:
import numpy as np
from scipy.stats import f


# Unbiased sample variances (ddof=1) of the two collections of costs.
var1 = np.var(dist_ind, ddof=1)
var2 = np.var(dist_crn, ddof=1)

# Folded F statistic: the larger variance goes in the numerator, and the
# degrees of freedom must follow the same order (they were previously fixed
# to (dist_ind, dist_crn) regardless of which variance was on top).
if var1 > var2:
    f_statistic = var1 / var2
    dof1, dof2 = len(dist_ind) - 1, len(dist_crn) - 1
else:
    f_statistic = var2 / var1
    dof1, dof2 = len(dist_crn) - 1, len(dist_ind) - 1

# Upper-tail probability; sf() avoids the cancellation in 1 - cdf().
# NOTE(review): this is a one-sided p-value on a folded statistic; a
# two-sided test of equal variances would double it -- confirm the intent.
p_value = f.sf(f_statistic, dof1, dof2)

f_statistic, p_value


(1.5159203627657258, 0.019837646592209324)