In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [None]:
# Load the training cubes and their labels.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only use it on files that come from a trusted source.
train_cubes_path = '<path_to_file>/Blueberry_train_cubes.npy'
train_labels_path = '<path_to_file>/Blueberry_train_cubes_labels.npy'
data = np.load(train_cubes_path, allow_pickle=True)
labels = np.load(train_labels_path, allow_pickle=True)

data.shape, labels.shape

In [None]:
# Mean and standard deviation over the first three axes; keepdims=True keeps the
# reduced axes as size-1 dimensions so the results broadcast against `data`.
mean = data.mean(axis=(0, 1, 2), keepdims=True)
std_dev = data.std(axis=(0, 1, 2), keepdims=True)

# A constant slice has a standard deviation of exactly 0, and dividing by it
# would fill that slice with NaN/inf. Its centered values are all 0 anyway,
# so dividing by 1 instead leaves them at 0.
std_dev[std_dev == 0] = 1.0

In [None]:
# Persist the training normalization statistics to disk.
for stats_file, stats_array in (
    ('Blueberry_train_cubes_mean.npy', mean),
    ('Blueberry_train_cubes_std.npy', std_dev),
):
    np.save(stats_file, stats_array)

In [None]:

# Z-score normalization: subtract the mean, then divide by the standard deviation.
# NOTE: this overwrites `data`, so running the cell a second time normalizes the
# already-normalized array again.
data = np.divide(np.subtract(data, mean), std_dev)

print(f"Dataset shape: {data.shape}")

In [None]:
# PCA START
hsi_image_train = data

# Flatten every leading axis into one, producing a 2-D matrix with one row per
# position and one column per entry of the last axis. The column count is read
# from the array itself rather than hard-coded (it was 462), so the cell keeps
# working if the size of the last axis changes.
hsi_reshaped_train = hsi_image_train.reshape(-1, hsi_image_train.shape[-1])

In [None]:
# Fit a PCA with the default number of components and project the flattened
# training data onto it.
pca = PCA()
hsi_train_pca = pca.fit_transform(hsi_reshaped_train)

# Fraction of the total variance explained by each component, and its running sum.
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()

In [None]:
# Cumulative explained variance against the number of components kept.
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(
    range(1, len(cumulative_explained_variance) + 1),
    cumulative_explained_variance,
    marker='o',
    linestyle='--',
)
ax.set_xlabel('Number of Dimensions')
ax.set_ylabel('Total Variance Retained')
ax.set_title('Variance Retained vs No. of Dimensions')
ax.grid()
plt.show()

In [None]:
# Smallest number of components whose cumulative explained variance reaches the
# target fraction. The threshold lives in one variable so the printed percentage
# always matches the value that is actually tested.
variance_threshold = 0.999
reaches_threshold = cumulative_explained_variance >= variance_threshold
if reaches_threshold.any():
    # argmax returns the index of the first True; +1 turns the index into a count.
    optimal_components = int(np.argmax(reaches_threshold)) + 1
else:
    # On an all-False mask argmax returns 0, which would silently select a
    # single component; keep every component instead.
    optimal_components = len(cumulative_explained_variance)
print(f"Number of Dimensions to retain {variance_threshold:.1%} variance: {optimal_components}")

# Refit PCA keeping only the selected number of components
pca = PCA(n_components=optimal_components)
X_train_PCA = pca.fit_transform(hsi_reshaped_train)

X_train_PCA.shape

In [None]:
# Write the fitted PCA's component matrix and mean vector to disk.
pca_exports = {
    'Blueberry_train_cubes_PCA_components.npy': pca.components_,
    'Blueberry_train_cubes_PCA_mean.npy': pca.mean_,
}
for export_file, export_array in pca_exports.items():
    np.save(export_file, export_array)

In [None]:
# Reshape back to the spatial dimensions. The leading axes are taken from the
# un-flattened training array instead of being hard-coded (2938, 32, 32), so the
# cell keeps working if the number or size of the cubes changes; only the last
# axis differs, now holding the retained components.
data_train = X_train_PCA.reshape(*hsi_image_train.shape[:-1], optimal_components)

data_train.shape

In [None]:
# Save the PCA-reduced training set
np.save(file='Blueberry_train_cubes_PCA.npy', arr=data_train)

In [None]:
# Reload the PCA-reduced training cubes together with the training labels.
# NOTE(review): the labels were first loaded from '<path_to_file>/' with
# allow_pickle=True; here they are read from the working directory with the
# default loader settings — confirm both refer to the same file.
data_train = np.load(file='Blueberry_train_cubes_PCA.npy')
train_labels = np.load(file='Blueberry_train_cubes_labels.npy')

print(f"{data_train.shape} {train_labels.shape}")

APPLY THE TRAINING-SET PCA TO THE TEST DATA

In [None]:
# Normalization statistics that were saved from the training set
train_mean = np.load('Blueberry_train_cubes_mean.npy')
train_std = np.load('Blueberry_train_cubes_std.npy')

# Test cubes.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only use it on files that come from a trusted source.
test_data = np.load('Blueberry_test_cubes.npy', allow_pickle=True)

# Standardize the test data with the TRAINING mean and std, not its own statistics
test_data_normalized = np.divide(np.subtract(test_data, train_mean), train_std)

In [None]:
# Reshape test data to 2D: every leading axis is flattened into rows and the last
# axis becomes the columns. Sizes are read from the array instead of the
# hard-coded 32 * 32 and 462, so other cube sizes are handled too.
num_test_samples = test_data.shape[0]
hsi_reshaped_test = test_data_normalized.reshape(-1, test_data_normalized.shape[-1])

In [None]:
# Read back the PCA component matrix and mean vector saved from the training fit
pca_components = np.load(file='Blueberry_train_cubes_PCA_components.npy')
pca_mean = np.load(file='Blueberry_train_cubes_PCA_mean.npy')

In [None]:
# Apply the PCA transformation with the saved parameters: center with the PCA
# mean, then multiply by the transposed component matrix.
hsi_test_pca = (hsi_reshaped_test - pca_mean) @ pca_components.T

# Reshape back to 4D. The leading axes come from the loaded test array instead
# of a hard-coded 32 x 32, and the last axis is the number of PCA components
# (one row of `pca_components` per component).
optimal_components = pca_components.shape[0]
test_data_pca = hsi_test_pca.reshape(*test_data.shape[:-1], optimal_components)

# Save transformed test data
np.save('Blueberry_test_cubes_PCA.npy', test_data_pca)

In [None]:
# Display the shape of the PCA-reduced test array
np.shape(test_data_pca)

In [None]:
# Load the class labels for the training and test cubes
train_labels = np.load(file='Blueberry_train_cubes_labels.npy')
test_labels = np.load(file='Blueberry_test_cubes_labels.npy')

print(f"{train_labels.shape} {test_labels.shape}")

In [None]:
# Check unique labels and counts
def check_class_distribution(labels, dataset_name):
    """Print and return the class distribution of a label array.

    Args:
        labels: Array-like of class labels. ``np.unique`` counts every element,
            whatever the array's shape.
        dataset_name: Name shown in the printed header (e.g. "Training").

    Returns:
        Tuple ``(unique, counts)``: the sorted distinct labels and the number
        of occurrences of each.
    """
    unique, counts = np.unique(labels, return_counts=True)
    # Use the total number of counted elements as the denominator. len(labels)
    # is only the size of the first axis, which gives wrong percentages for
    # multi-dimensional label arrays.
    total = counts.sum()
    print(f"\n{dataset_name} Set Class Distribution:")
    for cls, count in zip(unique, counts):
        print(f"Class {cls}: {count} samples ({count / total * 100:.2f}%)")
    return unique, counts

# Print the class distribution of each split and keep the labels and counts
train_unique, train_counts = check_class_distribution(train_labels, "Training")
test_unique, test_counts = check_class_distribution(test_labels, "Testing")

# Warn when the two splits do not contain exactly the same set of classes
if set(train_unique).symmetric_difference(test_unique):
    print("\nWarning: Mismatched classes between train and test sets!")
    print(f"Train classes: {train_unique}")
    print(f"Test classes: {test_unique}")

# Visualize class distribution: one bar chart per split, side by side.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(12, 5))

ax_train.bar(train_unique, train_counts)
ax_train.set_title('Training Set Class Distribution')
ax_train.set_xlabel('Class')
ax_train.set_ylabel('Count')

ax_test.bar(test_unique, test_counts)
ax_test.set_title('Test Set Class Distribution')
ax_test.set_xlabel('Class')
# The test panel previously had no y-axis label; label it like the training panel
# so each panel can be read on its own.
ax_test.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Calculate imbalance ratio
def calculate_imbalance_ratio(counts):
    """Return the largest value in ``counts`` divided by the smallest one."""
    largest, smallest = max(counts), min(counts)
    return largest / smallest

# Majority-to-minority count ratio for each split
train_ratio = calculate_imbalance_ratio(train_counts)
test_ratio = calculate_imbalance_ratio(test_counts)

print(f"\nTraining set imbalance ratio: {train_ratio:.2f}:1")
print(f"Test set imbalance ratio: {test_ratio:.2f}:1")

# Class imbalance thresholds: above 4 is reported as severe, above 2 as moderate
imbalance_verdict = (
    "\nSevere class imbalance detected in training data!"
    if train_ratio > 4
    else "\nModerate class imbalance detected in training data."
    if train_ratio > 2
    else "\nClasses are relatively balanced in training data."
)
print(imbalance_verdict)