In [None]:
!pip install imbalanced-learn

In [None]:
import pandas as pd

# Load the raw flood dataset from the CSV file.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive — confirm.
file_path = '/content/drive/MyDrive/floodProjectAssets/datasetFlood_toDrive.csv'
data = pd.read_csv(file_path)

# Preview the first rows to check the column layout
data.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Drop identifier, land-cover and geometry columns, then reorder so that the
# target 'classes' is the last column (the boxplot loop below relies on this).
data_for_analysis = data.drop(columns=['system:index', 'landcover_classification_2022', '.geo'])
column_order = ['distance', 'elevation', 'slope', 'soil_hydraulic_conductivity', 'hand', 'twi', 'classes']
data_for_analysis = data_for_analysis[column_order]

# Visualize distributions of numerical features
data_for_analysis.hist(bins=30, figsize=(15, 10))
plt.suptitle('Distribution of Numerical Features (before data balancing)', y=1.02)
plt.tight_layout()
plt.show()

# Visualize relationships between features using pairplot, coloured by class
sns.pairplot(data_for_analysis, hue='classes', diag_kind='kde')
plt.suptitle('Pairplot of Features by Class (before data balancing)', y=1.02)
plt.show()

# Visualize boxplots for each feature by class
plt.figure(figsize=(15, 10))
for i, col in enumerate(data_for_analysis.columns[:-1]): # Skip the last column ('classes'); it is the grouping variable
    plt.subplot(3, 3, i + 1) # 3x3 grid: enough panels for the six feature columns
    sns.boxplot(x='classes', y=col, data=data_for_analysis)
    plt.title(f'Boxplot of {col} by Class')
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pearson correlation between the features: whole dataset first, then one
# matrix per class. The target column is removed before correlating.
correlation_matrix_all = data_for_analysis.drop(columns=['classes']).corr(method='pearson')
correlation_matrix_class_0 = (
    data_for_analysis[data_for_analysis['classes'] == 0]
    .drop(columns=['classes'])
    .corr(method='pearson')
)
correlation_matrix_class_1 = (
    data_for_analysis[data_for_analysis['classes'] == 1]
    .drop(columns=['classes'])
    .corr(method='pearson')
)

# For each matrix: print its heading, show the table, then draw the heatmap.
for heading, matrix, heatmap_title in [
    ("Pearson Correlation Matrix for all classes:",
     correlation_matrix_all,
     'Pearson Correlation Matrix - All Classes (before data balancing)'),
    ("Pearson Correlation Matrix for Class 0:",
     correlation_matrix_class_0,
     'Pearson Correlation Matrix - Class 0 (before data balancing)'),
    ("\nPearson Correlation Matrix for Class 1:",
     correlation_matrix_class_1,
     'Pearson Correlation Matrix - Class 1 (before data balancing)'),
]:
    print(heading)
    display(matrix)

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(heatmap_title)
    plt.show()

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance Inflation Factor per feature, computed three times:
# on the whole dataset, on class 0 only and on class 1 only.

# All classes
X_vif = data_for_analysis.drop(columns=['classes'])
X_vif = sm.add_constant(X_vif)  # add an intercept column named "const" to the design matrix

vif_data = pd.DataFrame()
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i)
                   for i in range(X_vif.shape[1])]
vif_data = vif_data[vif_data["feature"] != "const"]  # remove constant from table
print("Variance Inflation Factor (VIF) for all features:")
display(vif_data)

# Class 0: same computation restricted to rows with classes == 0
X_vif_class_0 = data_for_analysis[data_for_analysis['classes'] == 0].drop(columns=['classes'])
X_vif_class_0 = sm.add_constant(X_vif_class_0)

vif_data_class_0 = pd.DataFrame()
vif_data_class_0["feature"] = X_vif_class_0.columns
vif_data_class_0["VIF"] = [variance_inflation_factor(X_vif_class_0.values, i)
                           for i in range(X_vif_class_0.shape[1])]
vif_data_class_0 = vif_data_class_0[vif_data_class_0["feature"] != "const"]
print("\nVariance Inflation Factor (VIF) for features in Class 0:")
display(vif_data_class_0)

# Class 1: same computation restricted to rows with classes == 1
X_vif_class_1 = data_for_analysis[data_for_analysis['classes'] == 1].drop(columns=['classes'])
X_vif_class_1 = sm.add_constant(X_vif_class_1)

vif_data_class_1 = pd.DataFrame()
vif_data_class_1["feature"] = X_vif_class_1.columns
vif_data_class_1["VIF"] = [variance_inflation_factor(X_vif_class_1.values, i)
                           for i in range(X_vif_class_1.shape[1])]
vif_data_class_1 = vif_data_class_1[vif_data_class_1["feature"] != "const"]
print("\nVariance Inflation Factor (VIF) for features in Class 1:")
display(vif_data_class_1)


# The string below is the cell's last expression, so it is also shown as the cell output.
'''VIF Value Interpretation
1       No correlation with other variables
1–5     Moderate correlation, generally acceptable
> 5     Concerning multicollinearity
> 10    Very high – should remove or combine variables'''

In [None]:
import pandas as pd

# Summary statistics for the whole dataset first ...
print("Basic Statistics for the entire dataset:")
display(data_for_analysis.describe())

# ... then the same table for each class in turn
for class_label in (0, 1):
    print(f"\nBasic Statistics for Class {class_label}:")
    class_rows = data_for_analysis[data_for_analysis['classes'] == class_label]
    display(class_rows.describe())

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Separate attributes and classes
X = data.drop(columns=['system:index', 'landcover_classification_2022', 'classes', '.geo'])  # Keep only the feature columns
y = data['classes']

# Oversample the minority class with SMOTE. A float sampling_strategy is the
# target minority/majority ratio after resampling, so 0.2 brings the minority
# class to 20% of the majority class size (it is not a "+50%" increase).
smote = SMOTE(sampling_strategy=0.2, random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Check class distribution before and after SMOTE
y.value_counts(), y_res.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Positive-unlabeled setup: class 1 rows are the labeled positives,
# class 0 rows are treated as unlabeled.
X_labeled = X_res[y_res == 1]
y_labeled = y_res[y_res == 1]

X_unlabeled = X_res[y_res == 0]
y_unlabeled = y_res[y_res == 0]

# Select 20% of labeled samples as "spies": with test_size=0.8 the first
# (train) split returned is the 20% part, the second is the remaining 80%.
X_spy, X_labeled_train, y_spy, y_labeled_train = train_test_split(
    X_labeled, y_labeled, test_size=0.8, random_state=42, stratify=y_labeled
)

print("Shape of y_spy:", y_spy.shape)
print("Shape of y_labeled_train:", y_labeled_train.shape)
print("Shape of y_unlabeled:", y_unlabeled.shape)

# Spies are positives that are given the label 0 so they hide among the unlabeled rows
y_spy_fake = pd.Series([0] * len(y_spy), index=y_spy.index)

# Mix "spies" with unlabeled data
X_train_spy = pd.concat([X_labeled_train, X_unlabeled, X_spy])
y_train_spy = pd.concat([y_labeled_train, y_unlabeled, y_spy_fake])

# Shuffle data with a seeded generator so that Restart & Run All reproduces
# the same row order (the same permutation is applied to X and y).
shuffle_index = np.random.default_rng(42).permutation(len(X_train_spy))
X_train_spy = X_train_spy.iloc[shuffle_index].reset_index(drop=True)
y_train_spy = y_train_spy.iloc[shuffle_index].reset_index(drop=True)

print('--------------')
print("Shape of X_train_spy:", X_train_spy.shape)
print("Shape of y_train_spy:", y_train_spy.shape)
print("\nValue counts of y_train_spy:")
print(y_train_spy.value_counts())

In [None]:
from sklearn.naive_bayes import GaussianNB

# Initialize Naive Bayes classifier
nb_classifier = GaussianNB()

# Train classifier with training data, including spies (which carry the fake label 0)
nb_classifier.fit(X_train_spy, y_train_spy)

# Calculate class probabilities for the spy samples only
proba_spy = nb_classifier.predict_proba(X_spy)

# Display first probabilities
print("Probabilities calculated by Naive Bayes:")
print(proba_spy[:5])

In [None]:
# Second column of predict_proba; presumably the probability of class 1
# (assumes the classifier's classes are ordered [0, 1] — TODO confirm via nb_classifier.classes_)
proba_positive = proba_spy[:, 1]
print("Probabilities of positive class:")
print(proba_positive[:5])

In [None]:
import numpy as np

# Summary statistics of the positive-class probabilities assigned to the spies
mean = np.mean(proba_positive)
std_dev = np.std(proba_positive)
median = np.median(proba_positive)
quartiles = np.percentile(proba_positive, [25, 50, 75])  # Q1, median, Q3

print(f"Mean: {mean:.2f}")
print(f"Standard deviation: {std_dev:.2f}")
print(f"Median: {median:.2f}")
print(f"First quartile (Q1): {quartiles[0]:.2f}")

# Cutoff used later to relabel unlabeled samples: one standard deviation below the spy mean
threshold_cut = mean - std_dev
print(f"Cutoff threshold defined as Mean - Standard Deviation: {threshold_cut:.2f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Histogram (with KDE overlay) of the spies' positive-class probabilities
plt.figure(figsize=(10, 6))
sns.histplot(proba_positive, bins=50, kde=True)
plt.title('Histogram of Positive Class Probabilities')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Boxplot of the same probabilities
plt.figure(figsize=(8, 6))
sns.boxplot(y=proba_positive)
plt.title('Boxplot of Positive Class Probabilities predicted by Naive Bayes')
plt.ylabel('Predicted Probability')
plt.grid(True)
plt.show()

# Single point at the mean with +/- one standard deviation as the error bar
plt.figure(figsize=(6, 8))
plt.errorbar(['Positive Probability'], [mean], yerr=[std_dev], fmt='o', capsize=5, color='skyblue', markersize=10)
plt.title('Mean and Standard Deviation of Positive Class Probabilities')
plt.ylabel('Probability')
plt.ylim(0, 1)  # Set y-axis limits between 0 and 1
plt.grid(axis='y')
plt.show()

In [None]:
# Count spy samples whose positive-class probability is below threshold_cut
count_smaller = (proba_positive < threshold_cut).sum()

# Count spy samples at or above threshold_cut
count_larger = (proba_positive >= threshold_cut).sum()

print(f"Number of samples with probability less than {threshold_cut:.2f}: {count_smaller}")
print(f"Number of samples with probability greater than or equal to {threshold_cut:.2f}: {count_larger}")

In [None]:
# Predict classes and probabilities for unlabeled data
predicted_proba = nb_classifier.predict_proba(X_unlabeled)
predicted_classes = nb_classifier.predict(X_unlabeled)


# Create DataFrame with the unlabeled features plus the predicted class and
# both probability columns (column 0 and column 1 of predict_proba)
unlabeled_predictions_df = X_unlabeled.copy()
unlabeled_predictions_df['predicted_classes'] = predicted_classes
unlabeled_predictions_df['proba_class_0'] = predicted_proba[:, 0]
unlabeled_predictions_df['proba_class_1'] = predicted_proba[:, 1]


# Display first rows of new DataFrame and count of predicted classes
print("DataFrame with unlabeled data, predicted classes and probabilities:")
display(unlabeled_predictions_df.head(10))

print("\nCount of predicted classes in unlabeled data:")
print(unlabeled_predictions_df['predicted_classes'].value_counts())

In [None]:
# Add new column 'new_class': 1 when the class-1 probability reaches the
# spy-derived cutoff threshold_cut, otherwise 0
unlabeled_predictions_df['new_class'] = (unlabeled_predictions_df['proba_class_1'] >= threshold_cut).astype(int)

# Display the leading rows of the DataFrame with the new column
# NOTE(review): head(1000) requests up to 1000 rows; a smaller preview may be enough.
print("DataFrame with new column 'new_class':")
display(unlabeled_predictions_df.head(1000))

# Display value counts in new column
print("\nValue counts in new column 'new_class':")
print(unlabeled_predictions_df['new_class'].value_counts())

In [None]:
# Remove the intermediate prediction columns so only the features and 'new_class' remain.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
unlabeled_predictions_df = unlabeled_predictions_df.drop(
    columns=['predicted_classes', 'proba_class_0', 'proba_class_1'],
    errors='ignore',
)
display(unlabeled_predictions_df.head())
print("\nColumns after dropping:")
print(unlabeled_predictions_df.columns)

In [None]:
# Create copy of X_labeled DataFrame and reset its index
labeled_df = X_labeled.copy().reset_index(drop=True)

# Add 'new_class' column from y_labeled (index reset the same way so rows stay aligned)
labeled_df['new_class'] = y_labeled.reset_index(drop=True)

# Class counts of the labeled part and of the relabeled unlabeled part
print(labeled_df['new_class'].value_counts())
print(unlabeled_predictions_df['new_class'].value_counts())

In [None]:
# Concatenate labeled and unlabeled dataframes into one frame with a fresh 0..n-1 index
combined_df = pd.concat([labeled_df, unlabeled_predictions_df], ignore_index=True)

print("Combined DataFrame:")
display(combined_df.head())
print("\nValue counts of new_class in combined DataFrame:")
print(combined_df['new_class'].value_counts())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Select relevant columns for analysis from combined_df: drop the identifier,
# land-cover and geometry columns if any of them are present
combined_data_for_analysis = combined_df.drop(columns=[col for col in combined_df.columns if col in ['system:index', 'landcover_classification_2022', '.geo']])

# Visualize distributions of numerical features
combined_data_for_analysis.hist(bins=30, figsize=(15, 10))
plt.suptitle('Distribution of Numerical Features (Combined data with new classes)', y=1.02)
plt.tight_layout()
plt.show()

# Visualize relationships between features using pairplot, coloured by the new class
sns.pairplot(combined_data_for_analysis, hue='new_class', diag_kind='kde')
plt.suptitle('Pairplot of Features by New Classes (Combined data with new classes)', y=1.02)
plt.show()

# Visualize boxplots for each feature by class
plt.figure(figsize=(15, 10))
for i, col in enumerate(combined_data_for_analysis.columns[:-1]): # Skip the last column; assumes 'new_class' is last — TODO confirm
    plt.subplot(3, 3, i + 1) # 3x3 grid of panels, one per feature column
    sns.boxplot(x='new_class', y=col, data=combined_data_for_analysis)
    plt.title(f'Boxplot of {col} by New Classes')
plt.tight_layout()
plt.show()

# Pearson correlation between the features of the combined data: all rows
# first, then one matrix per new class. 'new_class' is removed before correlating.
combined_correlation_matrix_all = combined_data_for_analysis.drop(columns=['new_class']).corr(method='pearson')
combined_data_correlation_matrix_class_0 = (
    combined_data_for_analysis[combined_data_for_analysis['new_class'] == 0]
    .drop(columns=['new_class'])
    .corr(method='pearson')
)
combined_data_correlation_matrix_class_1 = (
    combined_data_for_analysis[combined_data_for_analysis['new_class'] == 1]
    .drop(columns=['new_class'])
    .corr(method='pearson')
)

# For each matrix: print its heading, show the table, then draw the heatmap.
for heading, matrix, heatmap_title in [
    ("Pearson Correlation Matrix for all classes (Combined Data):",
     combined_correlation_matrix_all,
     'Pearson Correlation Matrix - All Classes (Combined Data)'),
    ("Pearson Correlation Matrix for Class 0:",
     combined_data_correlation_matrix_class_0,
     'Pearson Correlation Matrix - New Class 0 (Combined Data)'),
    ("\nPearson Correlation Matrix for Class 1:",
     combined_data_correlation_matrix_class_1,
     'Pearson Correlation Matrix - New Class 1 (Combined Data)'),
]:
    print(heading)
    display(matrix)

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(heatmap_title)
    plt.show()

# Summary statistics for the whole combined dataset first ...
print("Basic Statistics for the entire dataset:")
display(combined_data_for_analysis.describe())

# ... then the same table for each new class in turn
for class_label in (0, 1):
    print(f"\nBasic Statistics for Class {class_label}:")
    class_rows = combined_data_for_analysis[combined_data_for_analysis['new_class'] == class_label]
    display(class_rows.describe())


# --- Variance Inflation Factor (VIF) Calculation ---
# Computed three times on the combined data: all rows, new class 0 only, new class 1 only.

# All classes
# Select features for VIF calculation (exclude the target variable 'new_class')
X_vif_combined = combined_data_for_analysis.drop(columns=['new_class'])
# Add an intercept column named "const" to the design matrix
X_vif_combined = sm.add_constant(X_vif_combined)

# Calculate VIF for each column of the design matrix
vif_data_combined = pd.DataFrame()
vif_data_combined["feature"] = X_vif_combined.columns
vif_data_combined["VIF"] = [variance_inflation_factor(X_vif_combined.values, i)
                           for i in range(X_vif_combined.shape[1])]
# Remove the constant from the VIF table
vif_data_combined = vif_data_combined[vif_data_combined["feature"] != "const"]
print("Variance Inflation Factor (VIF) for all features (Combined Data):")
display(vif_data_combined)

# Class 0
# Select features for VIF calculation for new class 0
X_vif_combined_class_0 = combined_data_for_analysis[combined_data_for_analysis['new_class'] == 0].drop(columns=['new_class'])
# Add an intercept column named "const" to the design matrix
X_vif_combined_class_0 = sm.add_constant(X_vif_combined_class_0)

# Calculate VIF for each column in new Class 0
vif_data_combined_class_0 = pd.DataFrame()
vif_data_combined_class_0["feature"] = X_vif_combined_class_0.columns
vif_data_combined_class_0["VIF"] = [variance_inflation_factor(X_vif_combined_class_0.values, i)
                                   for i in range(X_vif_combined_class_0.shape[1])]
# Remove the constant from the VIF table
vif_data_combined_class_0 = vif_data_combined_class_0[vif_data_combined_class_0["feature"] != "const"]
print("\nVariance Inflation Factor (VIF) for features in New Class 0:")
display(vif_data_combined_class_0)

# Class 1
# Select features for VIF calculation for new class 1
X_vif_combined_class_1 = combined_data_for_analysis[combined_data_for_analysis['new_class'] == 1].drop(columns=['new_class'])
# Add an intercept column named "const" to the design matrix
X_vif_combined_class_1 = sm.add_constant(X_vif_combined_class_1)

# Calculate VIF for each column in new Class 1
vif_data_combined_class_1 = pd.DataFrame()
vif_data_combined_class_1["feature"] = X_vif_combined_class_1.columns
vif_data_combined_class_1["VIF"] = [variance_inflation_factor(X_vif_combined_class_1.values, i)
                                   for i in range(X_vif_combined_class_1.shape[1])]
# Remove the constant from the VIF table
vif_data_combined_class_1 = vif_data_combined_class_1[vif_data_combined_class_1["feature"] != "const"]
print("\nVariance Inflation Factor (VIF) for features in New Class 1:")
display(vif_data_combined_class_1)


# The string below is the cell's last expression, so it is also shown as the cell output.
'''VIF Value Interpretation
1       No correlation with other variables
1–5     Moderate correlation, generally acceptable
> 5     Concerning multicollinearity
> 10    Very high – should remove or combine variables'''

===ATTENTION: Run the first part of the notebook (data augmentation, labeling, and data balancing) up to this cell===

In [None]:
from google.colab import files

# Export final DataFrame to a CSV file in the current working directory (row index not written)
combined_df.to_csv('combined_dataset.csv', index=False)

print("DataFrame successfully exported to 'combined_dataset.csv'")

# Download the file through the Colab files helper
files.download('combined_dataset.csv')

===ATTENTION: Execute the second procedure (separability analysis) in the code starting from the cell below.===

In [None]:
import pandas as pd
import numpy as np

# Load the separability-evaluation CSV file
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive — confirm.
file_path = '/content/drive/MyDrive/floodProjectAssets/dataSeparabilityEvaluation.csv'
data_evaluation = pd.read_csv(file_path)

# Display first rows of file to see its structure
display(data_evaluation.head())

# Display number of rows in file
print(f"\nNumber of rows in file: {data_evaluation.shape[0]}")

# Display value counts in 'cluster' column
print("\nValue counts in 'cluster' column:")
print(data_evaluation['cluster'].value_counts())
# Display value counts in 'classification_hand' column
print("\nValue counts in 'classification_hand' column:")
print(data_evaluation['classification_hand'].value_counts())
# Display value counts in 'classification' column
print("\nValue counts in 'classification' column:")
print(data_evaluation['classification'].value_counts())

In [None]:
# Drop the identifier and geometry columns
data_evaluation_dropped = data_evaluation.drop(columns=['system:index', '.geo'])

# Shared feature table: everything except the three label columns
attributes = data_evaluation_dropped.drop(columns=['classification', 'classification_hand', 'cluster'])

# Build three dataframes, each the feature table plus exactly one label column
data_with_classification = attributes.copy()
data_with_classification['classification'] = data_evaluation_dropped['classification']

data_with_classification_hand = attributes.copy()
data_with_classification_hand['classification_hand'] = data_evaluation_dropped['classification_hand']

data_with_cluster = attributes.copy()
data_with_cluster['cluster'] = data_evaluation_dropped['cluster']

# Display the head of each new dataframe to verify
print("Data with 'classification':")
display(data_with_classification.head())

print("\nData with 'classification_hand':")
display(data_with_classification_hand.head())

print("\nData with 'cluster':")
display(data_with_cluster.head())

In [None]:
import pandas as pd

def sample_dataframe(dataframe, target_column, max_samples_per_class=10000):
    """
    Samples a DataFrame to a maximum number of samples per class.

    Classes with more than ``max_samples_per_class`` rows are randomly
    down-sampled (fixed seed 42); smaller classes are kept in full. The
    per-class pieces are concatenated once, shuffled (fixed seed 42) and
    returned with a fresh 0..n-1 index.

    Args:
        dataframe (pd.DataFrame): The input DataFrame.
        target_column (str): The name of the target column (classes).
        max_samples_per_class (int): The maximum number of samples desired for each class.

    Returns:
        pd.DataFrame: The sampled DataFrame. For an input without rows, an
        empty DataFrame that keeps the input's columns.
    """
    class_subsets = []
    for cls in dataframe[target_column].unique():
        class_subset = dataframe[dataframe[target_column] == cls]
        if len(class_subset) > max_samples_per_class:
            # Randomly sample if the class has more than max_samples_per_class
            class_subset = class_subset.sample(n=max_samples_per_class, random_state=42)
        # Collect the pieces and concatenate once below, instead of growing a
        # DataFrame inside the loop (repeated copying, empty-frame concat).
        class_subsets.append(class_subset)

    if not class_subsets:
        # No rows at all: return an empty frame that still has the input columns
        return dataframe.iloc[0:0].reset_index(drop=True)

    sampled_df = pd.concat(class_subsets)

    # Shuffle the resulting DataFrame
    return sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Sample each of the dataframes (default cap of 10000 rows per class)
sampled_data_with_classification = sample_dataframe(data_with_classification, 'classification')
sampled_data_with_classification_hand = sample_dataframe(data_with_classification_hand, 'classification_hand')
sampled_data_with_cluster = sample_dataframe(data_with_cluster, 'cluster')

# Display the value counts for the new sampled dataframes to verify
print("Value counts for sampled_data_with_classification:")
print(sampled_data_with_classification['classification'].value_counts())

print("\nValue counts for sampled_data_with_classification_hand:")
print(sampled_data_with_classification_hand['classification_hand'].value_counts())

print("\nValue counts for sampled_data_with_cluster:")
print(sampled_data_with_cluster['cluster'].value_counts())

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns # Import seaborn for potentially nicer plots

def plot_pca_separability_sampled(dataframe, target_column, title_suffix=""):
    """
    Applies PCA and plots class separability for a sampled dataframe.

    The feature columns (everything except ``target_column``) are projected
    onto two principal components and drawn as a scatter plot coloured by class.
    The features are passed to PCA as they are; no scaling is applied here.

    Args:
        dataframe (pd.DataFrame): The sampled DataFrame containing attributes and target column.
        target_column (str): The name of the target column (classes).
        title_suffix (str): Suffix to add to plot title.
    """
    attributes = dataframe.drop(columns=[target_column])
    classes = dataframe[target_column]

    # Apply PCA with 2 components
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(attributes)

    # Use the fixed two-colour mapping only when every label present is 0 or 1.
    # A dict palette that lacks a label makes seaborn raise, so for any other
    # label set fall back to seaborn's default palette.
    binary_palette = {0: 'skyblue', 1: 'red'}
    labels_present = set(classes.dropna().unique())
    palette = binary_palette if labels_present <= set(binary_palette) else None

    # Plot scatter plot
    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        x=principal_components[:, 0],
        y=principal_components[:, 1],
        hue=classes,
        palette=palette,
        alpha=0.6,
    )

    plt.title(f'PCA of 2 Components – Class Separation {title_suffix}')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(title=target_column)
    plt.grid(True)
    plt.show()

# Guard: only plot when the three sampled dataframes exist in this namespace
# (i.e. the sampling cell has been run); otherwise print an error message.
if 'sampled_data_with_classification' in locals() and 'sampled_data_with_classification_hand' in locals() and 'sampled_data_with_cluster' in locals():

    # Plot for sampled_data_with_classification
    plot_pca_separability_sampled(sampled_data_with_classification, 'classification', " (Sampled - semi-supervised classification)")

    # Plot for sampled_data_with_classification_hand
    plot_pca_separability_sampled(sampled_data_with_classification_hand, 'classification_hand', " (Sampled - slicing classification by HAND)")

    # Plot for sampled_data_with_cluster
    plot_pca_separability_sampled(sampled_data_with_cluster, 'cluster', " (Sampled - unsupervised classification)")

else:
    print("Error: Sampled DataFrames not found. Please run the cell that creates the sampled dataframes.")

In [None]:
import numpy as np
from scipy.spatial.distance import mahalanobis
from scipy.stats import multivariate_normal
import pandas as pd

def calculate_fdr(X1, X2):
    """Calculates Fisher's Discriminant Ratio, summed over all features.

    Per feature the ratio is (mean1 - mean2)^2 / (var1 + var2); a small
    constant in the denominator guards against division by zero.
    """
    eps = 1e-8
    squared_mean_gap = (np.mean(X1, axis=0) - np.mean(X2, axis=0)) ** 2
    total_variance = np.var(X1, axis=0) + np.var(X2, axis=0) + eps
    return np.sum(squared_mean_gap / total_variance)

def calculate_bhattacharyya_distance(X1, X2):
    """Calculates Bhattacharyya Distance assuming multivariate normal distributions.

    D_B = 1/8 (m1 - m2)^T S^-1 (m1 - m2) + 1/2 ln(det S / sqrt(det S1 * det S2)),
    where S1, S2 are the class covariance matrices and S = (S1 + S2) / 2.

    Args:
        X1, X2 (np.ndarray): Sample matrices of shape (n_samples, n_features),
            one per class. Each needs at least 2 rows for the covariance.

    Returns:
        float: The distance, or np.nan if the linear algebra fails.
    """
    mean1 = np.mean(X1, axis=0)
    mean2 = np.mean(X2, axis=0)
    # np.cov returns a 0-d array for a single feature; atleast_2d turns it
    # into a 1x1 matrix so the matrix operations below also work in that case.
    cov1 = np.atleast_2d(np.cov(X1.T))
    cov2 = np.atleast_2d(np.cov(X2.T))

    # Add a small value to the diagonal for numerical stability
    ridge = np.eye(cov1.shape[0]) * 1e-6
    cov1 = cov1 + ridge
    cov2 = cov2 + ridge

    try:
        cov_pooled = (cov1 + cov2) / 2
        mean_diff = mean1 - mean2
        term1 = 0.125 * np.dot(mean_diff, np.linalg.solve(cov_pooled, mean_diff))
        # Work with log-determinants: the raw determinants can overflow or
        # underflow for many features or large variances. The matrices are
        # covariance matrices plus a positive ridge, so the sign is ignored.
        _, logdet_pooled = np.linalg.slogdet(cov_pooled)
        _, logdet1 = np.linalg.slogdet(cov1)
        _, logdet2 = np.linalg.slogdet(cov2)
        term2 = 0.5 * (logdet_pooled - 0.5 * (logdet1 + logdet2))
        b_distance = term1 + term2
        return b_distance
    except np.linalg.LinAlgError:
        print("Warning: Could not calculate Bhattacharyya Distance due to singular covariance matrix.")
        return np.nan


def calculate_mahalanobis_distance(X1, X2):
    """Calculates the Mahalanobis distance between the class means of X1 and X2.

    The distance is measured between the two mean vectors (not averaged over
    individual points), using the inverse of the mean of the two class
    covariance matrices.

    Args:
        X1, X2 (np.ndarray): Sample matrices of shape (n_samples, n_features),
            one per class. Each needs at least 2 rows for the covariance.

    Returns:
        float: The distance, or np.nan if the covariance cannot be inverted.
    """
    mean1 = np.mean(X1, axis=0)
    mean2 = np.mean(X2, axis=0)
    # np.cov returns a 0-d array for a single feature; atleast_2d turns it
    # into a 1x1 matrix so the inversion below also works in that case.
    cov_pooled = (np.atleast_2d(np.cov(X1.T)) + np.atleast_2d(np.cov(X2.T))) / 2

    # Add a small value to the diagonal for numerical stability
    cov_pooled = cov_pooled + np.eye(cov_pooled.shape[0]) * 1e-6

    try:
        inv_cov_pooled = np.linalg.inv(cov_pooled)
        return mahalanobis(mean1, mean2, inv_cov_pooled)
    except np.linalg.LinAlgError:
        print("Warning: Could not calculate Mahalanobis Distance due to singular covariance matrix.")
        return np.nan


def evaluate_separability_indices(dataframe, target_column):
    """Calculates separability indices for a two-class dataframe.

    Returns a (Bhattacharyya distance, Mahalanobis distance) tuple computed
    from the feature columns of the two classes found in ``target_column``.
    Returns (nan, nan), after printing a warning, when the column does not
    hold exactly two distinct values or a class has fewer than 2 rows.
    """
    labels = dataframe[target_column].unique()
    if len(labels) != 2:
        print(f"Warning: Separability indices are typically for binary classification. Found {len(labels)} classes.")
        return np.nan, np.nan

    # Feature matrices of the two classes, in order of first appearance of the label
    features = dataframe.drop(columns=[target_column])
    first_class, second_class = (
        features[dataframe[target_column] == label].values for label in labels
    )

    if min(len(first_class), len(second_class)) < 2:
        print("Warning: Need at least 2 samples in each class to calculate covariance. Skipping index calculation.")
        return np.nan, np.nan

    return (
        calculate_bhattacharyya_distance(first_class, second_class),
        calculate_mahalanobis_distance(first_class, second_class),
    )

# Guard: only evaluate when the three sampled dataframes exist in this namespace
# (i.e. the sampling cell has been run); otherwise print an error message.
if 'sampled_data_with_classification' in locals() and 'sampled_data_with_classification_hand' in locals() and 'sampled_data_with_cluster' in locals():

    # Each call returns (Bhattacharyya distance, Mahalanobis distance)
    print("--- Evaluating sampled_data_with_classification ---")
    b_dist_clf, m_dist_clf = evaluate_separability_indices(sampled_data_with_classification, 'classification')
    print(f"Bhattacharyya Distance: {b_dist_clf:.2f}")
    print(f"Mahalanobis Distance: {m_dist_clf:.2f}")

    print("\n--- Evaluating sampled_data_with_classification_hand ---")
    b_dist_hand, m_dist_hand = evaluate_separability_indices(sampled_data_with_classification_hand, 'classification_hand')
    print(f"Bhattacharyya Distance: {b_dist_hand:.2f}")
    print(f"Mahalanobis Distance: {m_dist_hand:.2f}")


    print("\n--- Evaluating sampled_data_with_cluster ---")
    b_dist_cluster, m_dist_cluster = evaluate_separability_indices(sampled_data_with_cluster, 'cluster')
    print(f"Bhattacharyya Distance: {b_dist_cluster:.2f}")
    print(f"Mahalanobis Distance: {m_dist_cluster:.2f}")

else:
    print("Error: Sampled DataFrames not found. Please run the cell that creates the sampled dataframes.")

In [None]:
from sklearn.metrics import silhouette_score
import numpy as np
import pandas as pd

# Guard: only evaluate when the three sampled dataframes exist in this namespace
# (i.e. the sampling cell has been run); otherwise print an error message.
if 'sampled_data_with_classification' in locals() and 'sampled_data_with_classification_hand' in locals() and 'sampled_data_with_cluster' in locals():

    # For each labeling: features as a matrix, labels as a vector, then the
    # silhouette score; a ValueError from silhouette_score is reported, not raised.
    print("--- Evaluating sampled_data_with_classification ---")
    X_clf = sampled_data_with_classification.drop(columns=['classification']).values
    labels_clf = sampled_data_with_classification['classification'].values
    try:
        silhouette_avg_clf = silhouette_score(X_clf, labels_clf)
        print(f"Silhouette Score: {silhouette_avg_clf:.2f}")
    except ValueError as e:
         print(f"Could not calculate Silhouette Score: {e}")


    print("\n--- Evaluating sampled_data_with_classification_hand ---")
    X_hand = sampled_data_with_classification_hand.drop(columns=['classification_hand']).values
    labels_hand = sampled_data_with_classification_hand['classification_hand'].values
    try:
        silhouette_avg_hand = silhouette_score(X_hand, labels_hand)
        print(f"Silhouette Score: {silhouette_avg_hand:.2f}")
    except ValueError as e:
         print(f"Could not calculate Silhouette Score: {e}")


    print("\n--- Evaluating sampled_data_with_cluster ---")
    X_cluster = sampled_data_with_cluster.drop(columns=['cluster']).values
    labels_cluster = sampled_data_with_cluster['cluster'].values
    try:
        silhouette_avg_cluster = silhouette_score(X_cluster, labels_cluster)
        print(f"Silhouette Score: {silhouette_avg_cluster:.2f}")
    except ValueError as e:
         print(f"Could not calculate Silhouette Score: {e}")

else:
    print("Error: Sampled DataFrames not found. Please run the cell that creates the sampled dataframes.")