# RDS HW3 Template

Install the Data Synthesizer library.

# Section

In [1]:
!pip install DataSynthesizer

Defaulting to user installation because normal site-packages is not writeable
Collecting DataSynthesizer
  Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl.metadata (4.7 kB)
Downloading DataSynthesizer-0.1.13-py2.py3-none-any.whl (24 kB)
Installing collected packages: DataSynthesizer
Successfully installed DataSynthesizer-0.1.13


In [2]:
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import normalize_given_distribution, read_json_file, pairwise_attributes_mutual_information, mutual_information, set_random_seed

from IPython.display import clear_output
from scipy.stats import entropy, ks_2samp
from scipy.spatial.distance import euclidean
from sklearn.metrics import mutual_info_score
from random import randint
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
def generate_data_A(input_data_file, output_data_size, seed=None):
    """Generate a synthetic dataset in random mode (case A).

    Args:
        input_data_file: Path of the CSV file to describe.
        output_data_size: Number of synthetic rows to generate.
        seed: Seed passed to both the describer and the generator. When
            ``None`` a seed is drawn with ``randint(0, 2**30)``; an explicit
            ``0`` is used as given.

    Returns:
        The ``synthetic_dataset`` held by the generator.

    Side effects:
        Saves the dataset description to ``description_files['A']``.
    """
    # Test against None explicitly: `seed or randint(...)` would discard a
    # legitimate seed of 0 and make the run non-reproducible.
    if seed is None:
        seed = randint(0, 2**30)
    description_file = description_files['A']

    describer = DataDescriber()
    describer.describe_dataset_in_random_mode(input_data_file, seed=seed)
    describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_random_mode(output_data_size, description_file, seed=seed)
    return generator.synthetic_dataset

In [4]:
def generate_data_B(input_data_file, output_data_size, epsilon=0.1, seed=None):
    """Generate a synthetic dataset in independent attribute mode (case B).

    Args:
        input_data_file: Path of the CSV file to describe.
        output_data_size: Number of synthetic rows to generate.
        epsilon: Value passed to the describer as ``epsilon``.
        seed: Seed passed to both the describer and the generator. When
            ``None`` a seed is drawn with ``randint(0, 2**30)``; an explicit
            ``0`` is used as given.

    Returns:
        The ``synthetic_dataset`` held by the generator.

    Side effects:
        Saves the dataset description to ``description_files['B']``.
    """
    # Test against None explicitly: `seed or randint(...)` would discard a
    # legitimate seed of 0 and make the run non-reproducible.
    if seed is None:
        seed = randint(0, 2**30)
    description_file = description_files['B']

    describer = DataDescriber()
    describer.describe_dataset_in_independent_attribute_mode(input_data_file, epsilon=epsilon, seed=seed)
    describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_independent_mode(output_data_size, description_file, seed=seed)
    return generator.synthetic_dataset

In [5]:
def generate_data_CD(input_data_file, output_data_size, description_file, epsilon=0.5, k=1, seed=None):
    """Generate a synthetic dataset in correlated attribute mode (cases C and D).

    Args:
        input_data_file: Path of the CSV file to describe.
        output_data_size: Number of synthetic rows to generate.
        description_file: Path where the dataset description is saved and
            from which the generator reads it back.
        epsilon: Value passed to the describer as ``epsilon``.
        k: Value passed to the describer as ``k`` (the Bayes net degree in the
            assignment text).
        seed: Seed passed to both the describer and the generator. When
            ``None`` a seed is drawn with ``randint(0, 2**30)``; an explicit
            ``0`` is used as given.

    Returns:
        The ``synthetic_dataset`` held by the generator.
    """
    # Test against None explicitly: `seed or randint(...)` would discard a
    # legitimate seed of 0 and make the run non-reproducible.
    if seed is None:
        seed = randint(0, 2**30)

    describer = DataDescriber()
    describer.describe_dataset_in_correlated_attribute_mode(input_data_file, k=k, epsilon=epsilon, seed=seed)
    describer.save_dataset_description_to_file(description_file)

    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(output_data_size, description_file, seed=seed)
    return generator.synthetic_dataset

def generate_data_C(input_data_file, output_data_size, epsilon=0.1, seed=None):
    """Generate case C: correlated attribute mode with k=1.

    Delegates to ``generate_data_CD``, saving the description to
    ``description_files['C']``.
    """
    return generate_data_CD(
        input_data_file,
        output_data_size,
        description_file=description_files['C'],
        epsilon=epsilon,
        k=1,
        seed=seed,
    )

def generate_data_D(input_data_file, output_data_size, epsilon=0.1, seed=None):
    """Generate case D: correlated attribute mode with k=2.

    Delegates to ``generate_data_CD``, saving the description to
    ``description_files['D']``.
    """
    return generate_data_CD(
        input_data_file,
        output_data_size,
        description_file=description_files['D'],
        epsilon=epsilon,
        k=2,
        seed=seed,
    )

Start with a real dataset __hw_compas__ and a fake dataset __hw_fake__. Generate synthetic datasets of size __N=10000__ in the following four categories:
- A: random mode
- B: independent attribute mode with __epsilon = 0.1__
- C: correlated attribute mode with __epsilon = 0.1__, Bayes net degree k=1
- D: correlated attribute mode with __epsilon = 0.1__, Bayes net degree k=2

In [None]:
# Packages for reading CSV files into Colaboratory.
# NOTE(review): this cell imports `google.colab`, so it presumably only runs
# inside Google Colab — confirm before running it elsewhere.
!pip install -U -q PyDrive==1.3.1

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# Please follow the steps as instructed when you run the following commands.

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the download cell uses to fetch the datasets.
drive = GoogleDrive(gauth)

In [None]:
# Drive file id and local file name for the real (hw_compas) dataset.
fileid_compas = '1kgSIBkOM9y0nz_l8LI8ze9TAhF5gbb64'
real_data_file = 'hw_compas.csv'

# Drive file id and local file name for the fake dataset.
fileid_fake = '1b_T0SHQUMbVvZueZ6yXlxTKBHctMgM0k'
fake_data_file = 'hw_fake.csv'

# Fetch the real dataset through the PyDrive client, then load the local copy.
downloaded = drive.CreateFile({'id':fileid_compas})
downloaded.GetContentFile(real_data_file)
df_real = pd.read_csv(real_data_file)

# Same for the fake dataset; `downloaded` is reused.
downloaded = drive.CreateFile({'id':fileid_fake})
downloaded.GetContentFile(fake_data_file)
df_fake = pd.read_csv(fake_data_file)

In [None]:
# Number of rows per synthetic dataset, and the epsilon used for modes B, C and D.
output_data_size = 10000
epsilon = 0.1

seed = 0  # make the entire notebook reproducible

# Path where each mode's dataset description is saved; the generate_data_*
# helpers look these up by mode letter.
description_files = {'A': './descrption_A.json',
                     'B': './descrption_B.json',
                     'C': './descrption_C.json',
                     'D': './descrption_D.json'}

# NOTE(review): the calls below pass no seed, so each helper draws one with
# `randint`; reproducibility presumably relies on set_random_seed seeding
# Python's `random` module — confirm.
set_random_seed(seed)

# Synthetic versions of the real dataset, one per mode.
df_real_A = generate_data_A(real_data_file, output_data_size)
df_real_B = generate_data_B(real_data_file, output_data_size, epsilon=epsilon)
df_real_C = generate_data_C(real_data_file, output_data_size, epsilon=epsilon)
df_real_D = generate_data_D(real_data_file, output_data_size, epsilon=epsilon)

# Synthetic versions of the fake dataset, one per mode.
df_fake_A = generate_data_A(fake_data_file, output_data_size)
df_fake_B = generate_data_B(fake_data_file, output_data_size, epsilon=epsilon)
df_fake_C = generate_data_C(fake_data_file, output_data_size, epsilon=epsilon)
df_fake_D = generate_data_D(fake_data_file, output_data_size, epsilon=epsilon)

# Discard whatever the generation steps printed.
clear_output()

## Part (a)

### Q1

Compute the median, mean, min and max of __age__ and __score__ for each of A, B, C and D, compare them to the ground-truth values in __hw_compas__, and present the results in a table.

In [None]:
# Summarise age and score (median, mean, min, max) for the ground truth and
# for each synthetic dataset A-D.
comparison_dfs_a_q1 = [df_real, df_real_A, df_real_B, df_real_C, df_real_D]
stats_a_q1 = {}
for df_name, df in zip(['Ground Truth', 'A', 'B', 'C', 'D'], comparison_dfs_a_q1):
    # Keys come out as age_median, age_mean, age_min, age_max, score_median, ...
    stats_a_q1[df_name] = {
        f'{column}_{statistic}': getattr(df[column], statistic)()
        for column in ('age', 'score')
        for statistic in ('median', 'mean', 'min', 'max')
    }

# Transpose so each dataset is a row and each statistic a column.
stats_df_a_q1 = pd.DataFrame(stats_a_q1).T

# Show the table as the cell output.
stats_df_a_q1

### Q2

Apply more sophisticated statistical measures that are well suited to independent attribute mode: the __Two-sample Kolmogorov–Smirnov test__ and __KL-divergence__. Also show the difference visually, comparing real vs. synthetic data.

In [None]:
# Histograms of age and sex: the original data next to synthetic datasets A and B.
comparisons_a_q2 = [df_real, df_real_A, df_real_B]
age_titles_a_q2 = ['Original Distribution of Age', 'A - Age', 'B - Age']
sex_titles_a_q2 = ['Original Distribution of Sex', 'A - Sex', 'B - Sex']

# Age: one panel per dataset, shared y axis.
fig1, axs1 = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharey=True)
fig1.subplots_adjust(hspace=0.5, wspace=0.5)
fig1.suptitle('Distribution of Age (Real Data)')

for i, (ax, df) in enumerate(zip(axs1.flatten(), comparisons_a_q2)):
    ax.hist(df['age'], bins=20, alpha=0.5)
    ax.set_title(age_titles_a_q2[i])
    ax.set_xlabel('Age')
    ax.set_ylabel('Number of Defendants')
    ax.grid(True)

plt.tight_layout()

# Sex: same layout, two bins.
fig2, axs2 = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharey=True)
fig2.subplots_adjust(hspace=0.5, wspace=0.5)
fig2.suptitle('Distribution of Sex (Real Data)')

for i, (ax, df) in enumerate(zip(axs2.flatten(), comparisons_a_q2)):
    ax.hist(df['sex'], bins=2, rwidth=0.8)
    ax.set_title(sex_titles_a_q2[i])
    ax.set_xlabel('Sex')
    ax.set_ylabel('Number of Defendants')
    ax.grid(True)

plt.tight_layout()

#### Two-sample Kolmogorov–Smirnov test for continuous attributes

In [None]:
def ks_test(df_in: pd.DataFrame, df_out: pd.DataFrame, attr: str):
    """Return the two-sample Kolmogorov–Smirnov statistic for one attribute.

    df_in: the sensitive dataset
    df_out: the synthetic dataset
    attr: the column compared between the two datasets.

    Only the first element of the ``ks_2samp`` result (the statistic) is
    returned; the p-value is discarded.
    """
    sensitive_values = df_in[attr]
    synthetic_values = df_out[attr]
    result = ks_2samp(sensitive_values, synthetic_values)
    return result[0]

#### KL-divergence for categorical attributes

In [None]:
def get_distribution_of_categorical_attribute(attribute: pd.Series, indicies=None):
    """Return the relative frequency of each non-null value, sorted by value.

    attribute: the column to summarise; nulls are dropped before counting.
    indicies: optional collection of values that must appear in the result;
        any of them not observed in ``attribute`` gets a frequency of 0.
    """
    counts = attribute.dropna().value_counts()
    if indicies is not None:
        unobserved = set(indicies) - set(counts.index)
        for category in unobserved:
            counts.loc[category] = 0
    counts = counts.sort_index()
    total = sum(counts)
    return counts / total

def kl_test(df_in: pd.DataFrame, df_out: pd.DataFrame, attr: str):
    """Return the KL-divergence between the two datasets for one attribute.

    df_in: the sensitive dataset
    df_out: the synthetic dataset
    attr: the categorical column to compare.

    The synthetic distribution is padded with the sensitive distribution's
    categories, then passed to ``entropy`` as the first argument with the
    sensitive distribution as the second.
    """
    sensitive_dist = get_distribution_of_categorical_attribute(df_in[attr])
    synthetic_dist = get_distribution_of_categorical_attribute(df_out[attr], sensitive_dist.index)
    return entropy(synthetic_dist, sensitive_dist)

In [None]:
# Age is compared with the KS statistic and sex with KL-divergence, each for
# synthetic datasets A and B against the original data.
KS_A, KS_B = (ks_test(df_real, synthetic, 'age') for synthetic in (df_real_A, df_real_B))
KL_A, KL_B = (kl_test(df_real, synthetic, 'sex') for synthetic in (df_real_A, df_real_B))

print(f"KS test for age for original vs. A: {KS_A}")
print(f"KS test for age for original vs. B: {KS_B}")
print(f"KL test for sex for original vs. A: {KL_A}")
print(f"KL test for sex for original vs. B: {KL_B}")

### Q3

Use a more sophisticated statistical measure that is well suited to correlated attribute mode and should give better results with k=2 (case D) than with k=1 (case C), for example __some distance over pairwise correlation coefficients__. __Also show the difference visually, comparing results on real vs. synthetic datasets__.

In [None]:
# Use pairwise_attributes_mutual_information function.
# For function parameters, refer to https://github.com/DataResponsibly/DataSynthesizer/blob/master/DataSynthesizer/lib/utils.py#L31
# Pairwise mutual information for the original fake data and for its
# synthetic versions C and D; the heatmap cell and the later
# mutual-information study read these.
mi_fake = pairwise_attributes_mutual_information(df_fake)
mi_fake_C = pairwise_attributes_mutual_information(df_fake_C)
mi_fake_D = pairwise_attributes_mutual_information(df_fake_D)

In [None]:
# You can use sns.heatmap for pairwise mutual information.
# For examples, refer to https://seaborn.pydata.org/generated/seaborn.heatmap.html
# Three heatmaps side by side: the original fake data, then synthetic C and D.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharey=True)
fig.subplots_adjust(hspace=0.5, wspace=0.5)
fig.suptitle('Pairwise Mutual Information Matrix (Fake Data)')
heatmap_panels = [(mi_fake, 'Original (Fake Data)'), (mi_fake_C, 'C'), (mi_fake_D, 'D')]
for ax, (mi_matrix, panel_title) in zip(axs, heatmap_panels):
    sns.heatmap(mi_matrix, ax=ax)
    ax.set_title(panel_title)
plt.tight_layout()

## Part (b)

Study the variability in accuracy of answers to Q1 and Q2 for A, B and C: fix epsilon = 0.1, generate __10__ synthetic databases (by specifying different seeds).  Plot accuracy as a box-and-whiskers plot.  

In [None]:
# You can use sns.boxplot for the plots
# For examples, refer to https://seaborn.pydata.org/generated/seaborn.boxplot.html
# For each seed, regenerate datasets A, B and C from the real data and record
# the median, mean, min and max of age.
random_seeds = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
column_names_b = ['median_A', 'mean_A', 'min_A', 'max_A', 'median_B', 'mean_B', 'min_B', 'max_B', 'median_C', 'mean_C', 'min_C', 'max_C']
modes_b = ['A', 'B', 'C']

# Collect one record per seed and build the frame once at the end. Filling an
# empty frame cell by cell with .loc leaves every column as object dtype.
row_labels_b = []
rows_b = []
for seed in random_seeds:
    df_real_A_temp = generate_data_A(real_data_file, output_data_size, seed=seed)
    df_real_B_temp = generate_data_B(real_data_file, output_data_size, epsilon=epsilon, seed=seed)
    df_real_C_temp = generate_data_C(real_data_file, output_data_size, epsilon=epsilon, seed=seed)
    comparison_dfs_temp = [df_real_A_temp, df_real_B_temp, df_real_C_temp]
    row = {}
    for mode, df in zip(modes_b, comparison_dfs_temp):
        age = df['age']
        row[f'median_{mode}'] = age.median()
        row[f'mean_{mode}'] = age.mean()
        row[f'min_{mode}'] = age.min()
        row[f'max_{mode}'] = age.max()
    row_labels_b.append(f'Seed_{seed}')
    rows_b.append(row)
clear_output()

# Rows are labelled Seed_<seed>; the columns follow column_names_b and are numeric.
df_b = pd.DataFrame(rows_b, index=row_labels_b, columns=column_names_b).astype(float)
df_b

In [None]:
# One box plot per statistic, each comparing modes A, B and C across the seeds.
fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))
fig.subplots_adjust(hspace=0.5, wspace=0.5)
fig.suptitle('Age Variability for Different Modes')
statistic_panels = [('median', 'Median'), ('mean', 'Mean'), ('min', 'Min'), ('max', 'Max')]
for panel, (statistic, panel_title) in enumerate(statistic_panels):
    ax = axs[panel]
    sns.boxplot(data=df_b[[f'{statistic}_A', f'{statistic}_B', f'{statistic}_C']], ax=ax)
    ax.set_title(panel_title)
    if panel == 0:
        # Only the left-most panel carries the y-axis label.
        ax.set_ylabel('Age')
    ax.set_xlabel('Mode')
    ax.set_xticks(range(len(modes_b)))
    ax.set_xticklabels(modes_b)
plt.tight_layout()

## Part (c)

Study how accuracy of Q3 changes for case **C**, as you vary __epsilon__ = [0.01, 0.02, ..., 0.1].

In [None]:
epsilons_mst_kl = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
# NOTE(review): `iters` is not read by this cell, which loops over
# `random_seeds` instead — confirm which is intended.
iters = 5
modes_c = ['B', 'C', 'D']
# One frame per mode: rows are seeds, columns are epsilon values.
df_c_kl_b = pd.DataFrame(columns=epsilons_mst_kl)
df_c_kl_c = pd.DataFrame(columns=epsilons_mst_kl)
df_c_kl_d = pd.DataFrame(columns=epsilons_mst_kl)
for seed in random_seeds:
    row_label = f'Seed_{seed}'
    # The loop variable is `eps` so the notebook-level `epsilon` is not rebound.
    for eps in epsilons_mst_kl:
        df_real_B_temp = generate_data_B(real_data_file, output_data_size, epsilon=eps, seed=seed)
        df_real_C_temp = generate_data_C(real_data_file, output_data_size, epsilon=eps, seed=seed)
        df_real_D_temp = generate_data_D(real_data_file, output_data_size, epsilon=eps, seed=seed)
        KL_B_temp = kl_test(df_real, df_real_B_temp, 'race')
        KL_C_temp = kl_test(df_real, df_real_C_temp, 'race')
        KL_D_temp = kl_test(df_real, df_real_D_temp, 'race')
        df_c_kl_b.loc[row_label, eps] = KL_B_temp
        df_c_kl_c.loc[row_label, eps] = KL_C_temp
        # Mode D previously used the label 'Seed){seed}'; all three frames now
        # share the same 'Seed_<seed>' row labels.
        df_c_kl_d.loc[row_label, eps] = KL_D_temp
clear_output()

# Frames created empty and filled with .loc hold object columns; make them numeric.
df_c_kl_b = df_c_kl_b.astype(float)
df_c_kl_c = df_c_kl_c.astype(float)
df_c_kl_d = df_c_kl_d.astype(float)

In [None]:
df_c_kl_b

In [None]:
# One panel per mode: the spread of the race KL-divergence across seeds at each epsilon.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharey=True)
fig.subplots_adjust(hspace=0.5, wspace=0.5)
fig.suptitle('Race KL Divergence for Different Modes')
kl_panels = [(df_c_kl_b, 'B'), (df_c_kl_c, 'C'), (df_c_kl_d, 'D')]
for panel, (kl_frame, panel_title) in enumerate(kl_panels):
    ax = axs[panel]
    sns.boxplot(data=kl_frame, ax=ax)
    ax.set_title(panel_title)
    if panel == 0:
        # Only the left-most panel carries the y-axis label.
        ax.set_ylabel('KL Divergence')
    ax.set_xlabel('Epsilon')
    ax.set_xticks(range(len(epsilons_mst_kl)))
    ax.set_xticklabels(epsilons_mst_kl)
plt.tight_layout()

#### Mutual Information Plot for MST
Again, replicate your plots from Part (c) for MST, this time for mutual information, on both hw_compas and hw_fake (df_real and df_fake), varying epsilon = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]. Compare this plot to the plot of dataset **D** (df_real_D). Once again, you need only generate **5 datasets** per epsilon (i.e., run 5 iterations per epsilon).

In [None]:
# Epsilon grid for the mutual-information study.
epsilons_mst_mi = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
# NOTE(review): `iters` is not read by the visible cells, which loop over
# `random_seeds` instead — confirm which is intended.
iters = 5
# Pairwise mutual information of the real data, used as the reference for
# the synthetic datasets generated from it.
mi_real = pairwise_attributes_mutual_information(df_real)

In [None]:
# Distance between two pairwise mutual-information matrices: the sum of the
# absolute element-wise differences, halved.
def mi_diff_sum(mi1, mi2):
    """Return half the sum of absolute element-wise differences of mi1 and mi2.

    Elements are matched by position, and the iteration bounds come from the
    shape of ``mi1``.
    """
    n_rows, n_cols = mi1.shape[0], mi1.shape[1]
    total = sum(
        abs(mi1.iloc[row, col] - mi2.iloc[row, col])
        for row in range(n_rows)
        for col in range(n_cols)
    )
    return total / 2

In [None]:
# Column labels '<mode>_<epsilon>', grouped by mode (all B columns, then C,
# then D), which is the order the plotting cell slices by position.
# Building them in a comprehension keeps the loop variables local, so the
# notebook-level `epsilon` is not left bound to the last grid value.
mi_columns = [f'{mode}_{eps}' for mode in modes_c for eps in epsilons_mst_mi]

# Empty numeric frames; rows (one per seed) are added by the collection cell.
df_c_mi_real = pd.DataFrame(columns=mi_columns, dtype=float)
df_c_mi_fake = pd.DataFrame(columns=mi_columns, dtype=float)
df_c_mi_diff = pd.DataFrame(columns=mi_columns, dtype=float)

In [None]:
# For every seed and epsilon, generate B, C and D from the real and the fake
# data and record, per mode:
#   df_c_mi_real: distance between the real data's MI and its synthetic data's MI
#   df_c_mi_fake: distance between the fake data's MI and its synthetic data's MI
#   df_c_mi_diff: distance between the real-synthetic and fake-synthetic MI
for seed in random_seeds:
    row_label = f'Seed_{seed}'
    # The loop variable is `eps` so the notebook-level `epsilon` is not rebound.
    for eps in epsilons_mst_mi:
        for i, file in enumerate([real_data_file, fake_data_file]):
            df_B_temp = generate_data_B(file, output_data_size, epsilon=eps, seed=seed)
            df_C_temp = generate_data_C(file, output_data_size, epsilon=eps, seed=seed)
            df_D_temp = generate_data_D(file, output_data_size, epsilon=eps, seed=seed)
            mi_B_temp = pairwise_attributes_mutual_information(df_B_temp)
            mi_C_temp = pairwise_attributes_mutual_information(df_C_temp)
            mi_D_temp = pairwise_attributes_mutual_information(df_D_temp)
            if i == 0:
                df_c_mi_real.loc[row_label, f'B_{eps}'] = mi_diff_sum(mi_real, mi_B_temp)
                df_c_mi_real.loc[row_label, f'C_{eps}'] = mi_diff_sum(mi_real, mi_C_temp)
                df_c_mi_real.loc[row_label, f'D_{eps}'] = mi_diff_sum(mi_real, mi_D_temp)
                real_mi_temp = [mi_B_temp, mi_C_temp, mi_D_temp]
            else:
                df_c_mi_fake.loc[row_label, f'B_{eps}'] = mi_diff_sum(mi_fake, mi_B_temp)
                df_c_mi_fake.loc[row_label, f'C_{eps}'] = mi_diff_sum(mi_fake, mi_C_temp)
                df_c_mi_fake.loc[row_label, f'D_{eps}'] = mi_diff_sum(mi_fake, mi_D_temp)
                fake_mi_temp = [mi_B_temp, mi_C_temp, mi_D_temp]
        # real_mi_temp / fake_mi_temp are ordered B, C, D. Write each mode's
        # distance to that mode's column for the CURRENT epsilon. The previous
        # loop used the mode index to pick an epsilon and wrote one value to
        # all three mode columns, so only the first three epsilon columns were
        # ever filled, and with the wrong values.
        for mode, mi_real_synth, mi_fake_synth in zip(('B', 'C', 'D'), real_mi_temp, fake_mi_temp):
            df_c_mi_diff.loc[row_label, f'{mode}_{eps}'] = mi_diff_sum(mi_real_synth, mi_fake_synth)
clear_output()

# Make the collected distances numeric for plotting.
df_c_mi_real = df_c_mi_real.astype(float)
df_c_mi_fake = df_c_mi_fake.astype(float)
df_c_mi_diff = df_c_mi_diff.astype(float)

In [None]:
# Figure 1: distance to the source data's MI, real data on the top row and
# fake data on the bottom row, one column of panels per mode.
# Each mode occupies 7 consecutive columns of the frames (one per epsilon).
fig1, axs1 = plt.subplots(nrows=2, ncols=3, figsize=(15, 10), sharey=True)
fig1.subplots_adjust(hspace=0.5, wspace=0.5)
fig1.suptitle('Sum of Absolute MI Differences (Real and Fake Data)')
for grid_row, (mi_frame, source) in enumerate([(df_c_mi_real, 'Real'), (df_c_mi_fake, 'Fake')]):
    for grid_col, mode_label in enumerate(['B', 'C', 'D']):
        ax = axs1[grid_row, grid_col]
        sns.boxplot(data=mi_frame.iloc[:, 7 * grid_col:7 * (grid_col + 1)], ax=ax)
        ax.set_title(f'{mode_label} - {source}')
        if grid_col == 0:
            # Only the left-most panel of each row carries the y-axis label.
            ax.set_ylabel('Sum of Absolute MI Differences')
        ax.set_xlabel('Epsilon')
        ax.set_xticks(range(len(epsilons_mst_mi)))
        ax.set_xticklabels(epsilons_mst_mi)
plt.tight_layout()

# Figure 2: distance between the real-synthetic and fake-synthetic MI, per mode.
fig2, axs2 = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharey=True)
fig2.subplots_adjust(hspace=0.5, wspace=0.5)
fig2.suptitle('Sum of Absolute MI Differences (Real vs Fake Data)')
for grid_col, mode_label in enumerate(['B', 'C', 'D']):
    ax = axs2[grid_col]
    sns.boxplot(data=df_c_mi_diff.iloc[:, 7 * grid_col:7 * (grid_col + 1)], ax=ax)
    ax.set_title(f'{mode_label} - Real vs Fake')
    if grid_col == 0:
        ax.set_ylabel('Sum of Absolute MI Differences')
    ax.set_xlabel('Epsilon')
    ax.set_xticks(range(len(epsilons_mst_mi)))
    ax.set_xticklabels(epsilons_mst_mi)
plt.tight_layout()