In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the real dataset and of the synthetic dataset it is compared with
file_path_actual = 'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/Genedata1.csv'
file_path_generated = 'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/Synthetic_Genedata8.csv'

# Read both CSV files (actual first, generated second)
df_actual, df_generated = (
    pd.read_csv(csv_path) for csv_path in (file_path_actual, file_path_generated)
)

# Both frames must expose the same column names in the same order before stacking
assert list(df_actual.columns) == list(df_generated.columns), "Column names do not match between the datasets."

# Stack the rows (actual on top of generated) and tag every row with its origin
# in a trailing 'Type' column so the plots can split the two groups again.
combined_df = pd.concat(
    [df_actual.assign(Type='Actual'), df_generated.assign(Type='Generated')],
    ignore_index=True,
)

# Plot actual and generated data using KDE plots
def plot_distribution(df, column, title):
    """Overlay the KDE curves of `column` for the actual and generated rows.

    Args:
        df: DataFrame holding both datasets, with a 'Type' column whose value
            is 'Actual' or 'Generated' for each row.
        column: Name of the column whose distribution is drawn.
        title: Text appended to 'Distribution of ' in the figure title.

    A new 6x4 figure is created, one filled density curve is drawn per group
    (solid line for 'Actual', dashed for 'Generated'), and the figure is shown.
    """
    plt.figure(figsize=(6, 4))

    # (group label, line style) pairs, drawn in this order
    for group, line_style in (('Actual', '-'), ('Generated', '--')):
        group_values = df.loc[df['Type'] == group, column]
        sns.kdeplot(data=group_values, label=group, linestyle=line_style, fill=True)

    plt.title(f'Distribution of {title}')
    plt.legend()
    plt.show()

    # Save the plot as an image file
    #plt.savefig(f'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/{title}.png', dpi=300)
    #plt.close()
    
# Draw one actual-vs-generated comparison figure per column of the actual dataset;
# the column name doubles as the figure title.
for column in df_actual.columns:
    plot_distribution(combined_df, column=column, title=column)


In [1]:
import numpy as np
from scipy.stats import gaussian_kde

def kl_divergence(p, q, eps=1e-10):
    """Compute the discrete KL divergence D(p || q) = sum(p * log(p / q)), in nats.

    Both inputs are treated as non-negative weight vectors over the same
    support. Each one is normalised to sum to 1, floored at `eps` so that
    log(0) and division by zero cannot occur, and then normalised again so the
    smoothed vectors are still proper probability distributions.

    Args:
        p: Array-like of non-negative weights for the reference distribution.
        q: Array-like of non-negative weights for the comparison distribution;
            must have the same shape as `p`.
        eps: Floor applied to every probability after the first normalisation.

    Returns:
        The KL divergence as a float (>= 0 up to floating-point rounding).

    Raises:
        ValueError: If the shapes differ, if either input contains negative or
            non-finite values, or if either input has no mass (all zeros or
            empty), in which case normalisation is undefined.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    # Refuse silent broadcasting: both vectors must describe the same support.
    if p.shape != q.shape:
        raise ValueError(
            f"p and q must have the same shape, got {p.shape} and {q.shape}."
        )

    for label, weights in (('p', p), ('q', q)):
        if not np.all(np.isfinite(weights)) or np.any(weights < 0):
            raise ValueError(f"{label} must contain only finite, non-negative values.")
        # An all-zero (or empty) vector cannot be normalised; fail loudly
        # instead of propagating NaN into the result.
        if weights.sum() <= 0:
            raise ValueError(f"{label} has no probability mass to normalise.")

    def _smoothed_distribution(weights):
        probs = weights / weights.sum()
        probs = np.clip(probs, eps, None)
        # Clipping adds mass, so renormalise to keep a valid distribution.
        return probs / probs.sum()

    p = _smoothed_distribution(p)
    q = _smoothed_distribution(q)

    return np.sum(p * np.log(p / q))

def calculate_kl_divergence(df1, df2, column):
    """Estimate the KL divergence between `df1[column]` and `df2[column]`.

    Each column is smoothed with a Gaussian KDE (Scott's bandwidth rule, NaNs
    dropped), both densities are sampled on a shared grid of 1000 evenly
    spaced points spanning the combined value range, and the two sampled
    curves are passed to `kl_divergence` (df1 as p, df2 as q).
    """
    first, second = df1[column], df2[column]

    # Shared evaluation grid covering the smallest and largest value of either sample
    lower = min(first.min(), second.min())
    upper = max(first.max(), second.max())
    grid = np.linspace(lower, upper, 1000)

    # Density estimate of each sample, evaluated on the shared grid
    densities = [
        gaussian_kde(series.dropna(), bw_method='scott').evaluate(grid)
        for series in (first, second)
    ]

    return kl_divergence(*densities)

# Load your datasets
import pandas as pd

file_path_actual = 'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/Genedata1.csv'
file_path_generated = 'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/Synthetic_Genedata8.csv'

# Read the actual dataset first, then the generated one
df_actual, df_generated = (
    pd.read_csv(csv_path) for csv_path in (file_path_actual, file_path_generated)
)

# Report the KL divergence (actual as p, generated as q) for every column
for column in df_actual.columns:
    kl_div = calculate_kl_divergence(df1=df_actual, df2=df_generated, column=column)
    print(f'KL Divergence for {column}: {kl_div}')


KL Divergence for Adduct Code: 5.338603427610445
KL Divergence for Size Code: 5.710581690842418
KL Divergence for Sequence Code: 7.5047840519505815
KL Divergence for Polymerase Code: 7.514155159132615
KL Divergence for Outcome Code: 0.46969771912144154


In [3]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

def compute_mmd(X, Y, kernel='rbf', gamma=1.0):
    """Compute the Maximum Mean Discrepancy (MMD) between two samples.

    Uses the estimator MMD^2 = mean(k(X, X)) + mean(k(Y, Y)) - 2 * mean(k(X, Y))
    with the RBF kernel k(a, b) = exp(-gamma * ||a - b||^2), where the means run
    over all pairs (diagonal terms included), and returns its square root.

    Args:
        X: 2-D array-like of shape (n_samples_x, n_features).
        Y: 2-D array-like of shape (n_samples_y, n_features).
        kernel: Kernel name. Only 'rbf' is implemented.
        gamma: RBF kernel coefficient.

    Returns:
        The MMD as a non-negative float.

    Raises:
        ValueError: If `kernel` is not 'rbf', or if `X` or `Y` is empty.
    """
    # The argument used to be accepted and silently ignored; reject anything
    # that would otherwise be computed with the wrong kernel.
    if kernel != 'rbf':
        raise ValueError(f"Unsupported kernel {kernel!r}; only 'rbf' is implemented.")

    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    if X.size == 0 or Y.size == 0:
        raise ValueError("compute_mmd requires at least one sample in both X and Y.")

    def rbf_kernel(A, B):
        """Pairwise RBF kernel matrix between the rows of A and the rows of B."""
        return np.exp(-gamma * cdist(A, B, 'sqeuclidean'))

    # Mean kernel value within each sample and across the two samples
    mean_xx = rbf_kernel(X, X).mean()
    mean_yy = rbf_kernel(Y, Y).mean()
    mean_xy = rbf_kernel(X, Y).mean()

    mmd_squared = mean_xx + mean_yy - 2 * mean_xy

    # Mathematically MMD^2 >= 0, but rounding can leave a tiny negative value
    # for (near-)identical samples; clamp so the square root never yields NaN.
    return np.sqrt(max(mmd_squared, 0.0))

def calculate_mmd(df1, df2, column):
    """Return the MMD between `df1[column]` and `df2[column]`.

    NaN entries are dropped from each column, and the remaining values are
    reshaped into an (n, 1) array before being handed to `compute_mmd` with
    its default settings.
    """
    samples = [
        frame[column].dropna().values.reshape(-1, 1)
        for frame in (df1, df2)
    ]
    return compute_mmd(*samples)

# Load your datasets
file_path_actual = 'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/Genedata1.csv'
file_path_generated = 'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/Synthetic_Genedata8.csv'

# Read the actual dataset first, then the generated one
df_actual, df_generated = map(pd.read_csv, (file_path_actual, file_path_generated))

# Report the MMD between the two datasets, one column at a time
for column in df_actual.columns:
    mmd_value = calculate_mmd(df1=df_actual, df2=df_generated, column=column)
    print(f'MMD for {column}: {mmd_value}')


MMD for Adduct Code: 0.5078151592293395
MMD for Size Code: 0.5152503070598187
MMD for Sequence Code: 0.6534140274725361
MMD for Polymerase Code: 0.6050314785967523
MMD for Outcome Code: 0.18212257063201354


In [5]:
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance

def calculate_wasserstein_distance(df1, df2, column):
    """Return the Wasserstein distance between `df1[column]` and `df2[column]`.

    NaN entries are removed from each column before the two value arrays are
    passed to `scipy.stats.wasserstein_distance`.
    """
    # One NaN-free value array per frame, in (df1, df2) order
    samples = [frame[column].dropna().values for frame in (df1, df2)]
    return wasserstein_distance(*samples)

# Load your datasets
file_path_actual = 'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/Genedata1.csv'
file_path_generated = 'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/Synthetic_Genedata8.csv'

# Read the actual dataset first, then the generated one
df_actual, df_generated = [
    pd.read_csv(csv_path) for csv_path in (file_path_actual, file_path_generated)
]

# Report the Wasserstein distance between the two datasets, one column at a time
for column in df_actual.columns:
    wd_value = calculate_wasserstein_distance(df1=df_actual, df2=df_generated, column=column)
    print(f'Wasserstein Distance for {column}: {wd_value}')


Wasserstein Distance for Adduct Code: 3.462567901234568
Wasserstein Distance for Size Code: 3.090246913580247
Wasserstein Distance for Sequence Code: 3.94925925925926
Wasserstein Distance for Polymerase Code: 6.314987654320987
Wasserstein Distance for Outcome Code: 0.16197530864197535


In [7]:
import pandas as pd

# Load your datasets
file_path_actual = 'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/Genedata1.csv'
file_path_generated = 'E:/DELL-KAVI/Desktop/New folder/Genetics Classification/Synthetic_Genedata8.csv'

df_actual = pd.read_csv(file_path_actual)
df_generated = pd.read_csv(file_path_generated)


def _summarize_numeric(df):
    """Return {column: {'Mean', 'Standard Deviation', 'Median'}} for numeric columns.

    Numeric columns are picked with `select_dtypes(include='number')`, so every
    integer/float width is covered (not only int64/float64) and this cell does
    not need numpy to be imported. The standard deviation is pandas' default
    `Series.std()`.
    """
    numeric_columns = df.select_dtypes(include='number')
    return {
        name: {
            'Mean': values.mean(),
            'Standard Deviation': values.std(),
            'Median': values.median(),
        }
        for name, values in numeric_columns.items()
    }


# Per-column statistics for each dataset, keyed by column name
stats_actual = _summarize_numeric(df_actual)
stats_generated = _summarize_numeric(df_generated)

# Convert to DataFrames for better readability (one row per column)
stats_actual_df = pd.DataFrame(stats_actual).T
stats_generated_df = pd.DataFrame(stats_generated).T

# Print or save the statistics
print("Actual Data Statistics:")
print(stats_actual_df)
print("\nGenerated Data Statistics:")
print(stats_generated_df)


Actual Data Statistics:
                      Mean  Standard Deviation  Median
Adduct Code      15.111111            7.231874    16.0
Size Code        12.641975            5.360291    14.0
Sequence Code    13.530864            7.569819    12.0
Polymerase Code  18.086420           10.092321    16.0
Outcome Code      1.641975            0.482407     2.0

Generated Data Statistics:
                      Mean  Standard Deviation  Median
Adduct Code      17.772000            3.174282    16.0
Size Code        15.252667            2.304568    14.0
Sequence Code    15.356667            2.487351    14.0
Polymerase Code  20.994000            4.161271    18.0
Outcome Code      1.480000            0.499766     1.0
