In [1]:
import sys
# Make packages under the extra directory importable.
# The original appended the same directory twice (with and without a trailing
# slash); one entry is enough, and the guard keeps re-runs of this cell from
# growing sys.path.
EXTRA_PACKAGE_DIR = '/scratch/ne2213/projects/tmp_packages'
if EXTRA_PACKAGE_DIR not in sys.path:
    sys.path.append(EXTRA_PACKAGE_DIR)

In [2]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Classification Results on original dataset

In [3]:
# Step 1: Load the dataset
# Each pickle lives at <DATA_ROOT>/extracted_data_all_<tag>/<tag>_data.pkl.
DATA_ROOT = "/scratch/ne2213/projects/CV-Project/NWBelegans"
DATASET_TAGS = ["EY", "NP", "HL", "SK2", "SK1", "KK", "SF"]
pkl_files = [
    f"{DATA_ROOT}/extracted_data_all_{tag}/{tag}_data.pkl" for tag in DATASET_TAGS
]

# Gather the DataFrames stored under each pickle's "structured_data" key,
# which may hold either a single DataFrame or a list of entries.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
combined_structured_data = []
for pkl_path in pkl_files:
    with open(pkl_path, "rb") as f:
        payload = pickle.load(f)
    structured = payload["structured_data"]
    if isinstance(structured, list):
        combined_structured_data.extend(structured)
    elif isinstance(structured, pd.DataFrame):
        combined_structured_data.append(structured)

# Concatenate into one DataFrame; list entries that are not DataFrames are skipped.
data = pd.concat(
    [entry for entry in combined_structured_data if isinstance(entry, pd.DataFrame)],
    ignore_index=True,
)

# Step 2: Normalize the entire `data` DataFrame
# Colour channels are divided by 65535 (assumes 16-bit channel values — TODO confirm).
for channel in ('R', 'G', 'B'):
    data[channel] = data[channel] / 65535.0

# Coordinates are divided by their column maximum, computed over ALL rows
# (so the scale also depends on rows that later end up in the test split).
for axis in ('xr', 'yr', 'zr'):
    data[axis] = data[axis] / data[axis].max()

# Step 3: Extract feature matrices and labels
# Grayscale features: spatial coordinates and weight.
# Colorized features: the grayscale columns followed by R, G, B.
grayscale_columns = ['xr', 'yr', 'zr', 'weight']
colorized_columns = grayscale_columns + ['R', 'G', 'B']
grayscale_features = data[grayscale_columns].to_numpy()
colorized_features = data[colorized_columns].to_numpy()
labels = np.array(data['ID'].tolist())

# Step 4: Encode the string IDs as integer class indices
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)


In [4]:
# Step 3: Hold out 30% of the rows for testing, once per feature set.
# Both calls receive the same options (including the seed) and the same label
# array, so the two splits are expected to pick the same rows.
split_options = {"test_size": 0.3, "random_state": 42}

X_train_gray, X_test_gray, y_train_gray, y_test_gray = train_test_split(
    grayscale_features, encoded_labels, **split_options
)
X_train_color, X_test_color, y_train_color, y_test_color = train_test_split(
    colorized_features, encoded_labels, **split_options
)

# Report the size of each training partition.
for feature_set, train_matrix in (
    ("Grayscale", X_train_gray),
    ("Colorized", X_train_color),
):
    print(f"{feature_set} Training Samples: {len(train_matrix)}")

Grayscale Training Samples: 10900
Colorized Training Samples: 10900


In [6]:
# Classification Results on Grayscale images

In [5]:
# Step 4: Train a random forest on the grayscale features and evaluate it
model_gray = RandomForestClassifier(random_state=42)
model_gray.fit(X_train_gray, y_train_gray)

# Predict on the held-out grayscale test set
y_pred_gray = model_gray.predict(X_test_gray)

# Only report classes that actually occur in y_test
test_labels_gray = np.unique(y_test_gray)

# Map the encoded labels back to the original class names.
# Rows whose ID is the empty string (treated as background later in this
# notebook) would otherwise show up as a blank, unlabeled row in the report,
# so they get an explicit display name.
BACKGROUND_DISPLAY_NAME = "(background)"
test_target_names_gray = [
    BACKGROUND_DISPLAY_NAME if name == "" else name
    for name in label_encoder.classes_[test_labels_gray]
]

# Calculate and print overall accuracy
accuracy_gray = accuracy_score(y_test_gray, y_pred_gray)
print(f"Overall Accuracy (Grayscale): {accuracy_gray:.4f}")

# Per-class report; zero_division=0 scores classes with no predictions as 0
# instead of emitting a warning.
print("Grayscale Feature Performance:")
print(classification_report(
    y_test_gray,
    y_pred_gray,
    labels=test_labels_gray,
    target_names=test_target_names_gray,
    zero_division=0,
))

Overall Accuracy (Grayscale): 0.2324
Grayscale Feature Performance:
              precision    recall  f1-score   support

                   0.47      0.80      0.59      1196
        ADAL       0.10      0.04      0.06        26
        ADAR       0.25      0.05      0.09        19
       ADAR?       0.00      0.00      0.00         1
        ADEL       0.11      0.10      0.10        20
        ADER       0.00      0.00      0.00        15
        ADFL       0.00      0.00      0.00         8
        ADFR       0.00      0.00      0.00        10
        ADLL       0.17      0.05      0.07        22
        ADLR       0.00      0.00      0.00        14
        AFDL       0.00      0.00      0.00        16
        AFDR       0.00      0.00      0.00        21
        AIAL       0.00      0.00      0.00         9
        AIAR       0.00      0.00      0.00        12
        AIB?       0.00      0.00      0.00         1
        AIBL       0.00      0.00      0.00        15
        AIBR 

In [8]:
# Classification Results on Colored Images

In [6]:
# Step 5: Train a random forest on the colorized features and evaluate it
model_color = RandomForestClassifier(random_state=42)
model_color.fit(X_train_color, y_train_color)

# Predict on the held-out colorized test set
y_pred_color = model_color.predict(X_test_color)

# Only report classes that actually occur in y_test
test_labels_color = np.unique(y_test_color)

# Map the encoded labels back to the original class names.
# Rows whose ID is the empty string (treated as background later in this
# notebook) would otherwise show up as a blank, unlabeled row in the report,
# so they get an explicit display name.
BACKGROUND_DISPLAY_NAME = "(background)"
test_target_names_color = [
    BACKGROUND_DISPLAY_NAME if name == "" else name
    for name in label_encoder.classes_[test_labels_color]
]

# Calculate and print overall accuracy
accuracy_color = accuracy_score(y_test_color, y_pred_color)
print(f"Overall Accuracy (Colorized): {accuracy_color:.4f}")

# Per-class report; zero_division=0 scores classes with no predictions as 0
# instead of emitting a warning.
print("Colorized Feature Performance:")
print(classification_report(
    y_test_color,
    y_pred_color,
    labels=test_labels_color,
    target_names=test_target_names_color,
    zero_division=0,
))

Overall Accuracy (Colorized): 0.4060
Colorized Feature Performance:
              precision    recall  f1-score   support

                   0.52      0.91      0.67      1196
        ADAL       0.58      0.27      0.37        26
        ADAR       0.50      0.21      0.30        19
       ADAR?       0.00      0.00      0.00         1
        ADEL       0.53      0.40      0.46        20
        ADER       0.43      0.40      0.41        15
        ADFL       0.00      0.00      0.00         8
        ADFR       0.00      0.00      0.00        10
        ADLL       0.14      0.05      0.07        22
        ADLR       0.00      0.00      0.00        14
        AFDL       0.24      0.25      0.24        16
        AFDR       0.40      0.10      0.15        21
        AIAL       0.00      0.00      0.00         9
        AIAR       0.00      0.00      0.00        12
        AIB?       0.00      0.00      0.00         1
        AIBL       0.21      0.33      0.26        15
        AIBR 

# Classification Results on dataset after removing background labels

In [3]:
# Step 1: Load the dataset
# Each pickle lives at <DATA_ROOT>/extracted_data_all_<tag>/<tag>_data.pkl.
DATA_ROOT = "/scratch/ne2213/projects/CV-Project/NWBelegans"
DATASET_TAGS = ["EY", "NP", "HL", "SK2", "SK1", "KK", "SF"]
pkl_files = [
    f"{DATA_ROOT}/extracted_data_all_{tag}/{tag}_data.pkl" for tag in DATASET_TAGS
]

# Gather the DataFrames stored under each pickle's "structured_data" key,
# which may hold either a single DataFrame or a list of entries.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
combined_structured_data = []
for pkl_path in pkl_files:
    with open(pkl_path, "rb") as f:
        payload = pickle.load(f)
    structured = payload["structured_data"]
    if isinstance(structured, list):
        combined_structured_data.extend(structured)
    elif isinstance(structured, pd.DataFrame):
        combined_structured_data.append(structured)

# Concatenate into one DataFrame; list entries that are not DataFrames are skipped.
data = pd.concat(
    [entry for entry in combined_structured_data if isinstance(entry, pd.DataFrame)],
    ignore_index=True,
)

# Step 2: Normalize the entire `data` DataFrame
# Colour channels are divided by 65535 (assumes 16-bit channel values — TODO confirm).
for channel in ('R', 'G', 'B'):
    data[channel] = data[channel] / 65535.0

# Coordinates are divided by their column maximum. This happens BEFORE the
# background rows are dropped, so the maxima are taken over background rows too.
for axis in ('xr', 'yr', 'zr'):
    data[axis] = data[axis] / data[axis].max()

# Row count before excluding the background
total_length_before = len(data)
print(f"Total number of rows before excluding background: {total_length_before}")

# Step 3: Remove background rows (ID equal to the empty string) and reset the index
background_label = ""
data = data[data['ID'] != background_label].reset_index(drop=True)

# Row count after excluding the background
total_length_after = len(data)
print(f"Total number of rows after excluding background: {total_length_after}")

# Step 4: Extract feature matrices and labels
# Grayscale features: spatial coordinates and weight.
# Colorized features: the grayscale columns followed by R, G, B.
grayscale_columns = ['xr', 'yr', 'zr', 'weight']
colorized_columns = grayscale_columns + ['R', 'G', 'B']
grayscale_features = data[grayscale_columns].to_numpy()
colorized_features = data[colorized_columns].to_numpy()
labels = np.array(data['ID'].tolist())

# Step 5: Encode the string IDs as integer class indices
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)


Total number of rows before excluding background: 15572
Total number of rows after excluding background: 11511


In [4]:
# Step 3: Hold out 30% of the rows for testing, once per feature set.
# Both calls receive the same options (including the seed) and the same label
# array, so the two splits are expected to pick the same rows.
split_options = {"test_size": 0.3, "random_state": 42}

X_train_gray, X_test_gray, y_train_gray, y_test_gray = train_test_split(
    grayscale_features, encoded_labels, **split_options
)
X_train_color, X_test_color, y_train_color, y_test_color = train_test_split(
    colorized_features, encoded_labels, **split_options
)

# Report the size of each training partition.
for feature_set, train_matrix in (
    ("Grayscale", X_train_gray),
    ("Colorized", X_train_color),
):
    print(f"{feature_set} Training Samples: {len(train_matrix)}")

Grayscale Training Samples: 8057
Colorized Training Samples: 8057


In [6]:
# Classification Results on Grayscale images

In [5]:
# Step 4: Fit a random forest on the grayscale features, then score it on the test split
model_gray = RandomForestClassifier(random_state=42).fit(X_train_gray, y_train_gray)
y_pred_gray = model_gray.predict(X_test_gray)

# Restrict the report to the encoded classes that occur in y_test and look up
# their original names through the label encoder.
test_labels_gray = np.unique(y_test_gray)
test_target_names_gray = list(label_encoder.classes_[test_labels_gray])

# Overall accuracy
accuracy_gray = accuracy_score(y_test_gray, y_pred_gray)
print(f"Overall Accuracy (Grayscale): {accuracy_gray:.4f}")

# Per-class precision / recall / F1; undefined ratios are reported as 0
print("Grayscale Feature Performance:")
report_gray = classification_report(
    y_test_gray,
    y_pred_gray,
    labels=test_labels_gray,
    target_names=test_target_names_gray,
    zero_division=0,
)
print(report_gray)

Overall Accuracy (Grayscale): 0.0556
Grayscale Feature Performance:
              precision    recall  f1-score   support

        ADAL       0.03      0.06      0.04        18
        ADAR       0.09      0.05      0.06        20
        ADEL       0.00      0.00      0.00        27
        ADER       0.06      0.05      0.05        19
        ADFL       0.00      0.00      0.00        12
        ADFR       0.00      0.00      0.00         9
        ADLL       0.00      0.00      0.00        15
        ADLR       0.00      0.00      0.00        12
        AFDL       0.05      0.06      0.05        18
        AFDR       0.07      0.07      0.07        15
        AIAL       0.00      0.00      0.00        10
        AIAR       0.00      0.00      0.00         8
        AIBL       0.00      0.00      0.00        23
        AIBR       0.08      0.09      0.09        23
        AIM?       0.00      0.00      0.00         1
        AIML       0.00      0.00      0.00        19
        AIMR 

In [7]:
# Classification Results on Colored Images

In [8]:
# Step 5: Fit a random forest on the colorized features, then score it on the test split
model_color = RandomForestClassifier(random_state=42).fit(X_train_color, y_train_color)
y_pred_color = model_color.predict(X_test_color)

# Restrict the report to the encoded classes that occur in y_test and look up
# their original names through the label encoder.
test_labels_color = np.unique(y_test_color)
test_target_names_color = list(label_encoder.classes_[test_labels_color])

# Overall accuracy
accuracy_color = accuracy_score(y_test_color, y_pred_color)
print(f"Overall Accuracy (Colorized): {accuracy_color:.4f}")

# Per-class precision / recall / F1; undefined ratios are reported as 0
print("Colorized Feature Performance:")
report_color = classification_report(
    y_test_color,
    y_pred_color,
    labels=test_labels_color,
    target_names=test_target_names_color,
    zero_division=0,
)
print(report_color)

Overall Accuracy (Colorized): 0.2791
Colorized Feature Performance:
              precision    recall  f1-score   support

        ADAL       0.31      0.50      0.38        18
        ADAR       0.29      0.10      0.15        20
        ADEL       0.34      0.41      0.37        27
        ADER       0.30      0.32      0.31        19
        ADFL       0.00      0.00      0.00        12
        ADFR       0.09      0.11      0.10         9
        ADLL       0.11      0.13      0.12        15
        ADLR       0.06      0.08      0.07        12
        AFDL       0.17      0.11      0.13        18
        AFDR       0.22      0.27      0.24        15
        AIAL       0.00      0.00      0.00        10
        AIAR       0.00      0.00      0.00         8
        AIBL       0.23      0.26      0.24        23
        AIBR       0.24      0.39      0.30        23
        AIM?       0.00      0.00      0.00         1
        AIML       0.31      0.26      0.29        19
        AIMR 

# Classification Results on dataset after removing background labels and applying SMOTE

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
import pickle
from collections import Counter

# Step 1: Load the dataset
# Each pickle lives at <DATA_ROOT>/extracted_data_all_<tag>/<tag>_data.pkl.
DATA_ROOT = "/scratch/ne2213/projects/CV-Project/NWBelegans"
DATASET_TAGS = ["EY", "NP", "HL", "SK2", "SK1", "KK", "SF"]
pkl_files = [
    f"{DATA_ROOT}/extracted_data_all_{tag}/{tag}_data.pkl" for tag in DATASET_TAGS
]

# Gather the DataFrames stored under each pickle's "structured_data" key,
# which may hold either a single DataFrame or a list of entries.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
combined_structured_data = []
for pkl_path in pkl_files:
    with open(pkl_path, "rb") as f:
        payload = pickle.load(f)
    structured = payload["structured_data"]
    if isinstance(structured, list):
        combined_structured_data.extend(structured)
    elif isinstance(structured, pd.DataFrame):
        combined_structured_data.append(structured)

# Concatenate into one DataFrame; list entries that are not DataFrames are skipped.
data = pd.concat(
    [entry for entry in combined_structured_data if isinstance(entry, pd.DataFrame)],
    ignore_index=True,
)

# Step 2: Normalize the entire `data` DataFrame
# Colour channels are divided by 65535 (assumes 16-bit channel values — TODO confirm).
for channel in ('R', 'G', 'B'):
    data[channel] = data[channel] / 65535.0

# Coordinates are divided by their column maximum. This happens BEFORE the
# background rows are dropped, so the maxima are taken over background rows too.
for axis in ('xr', 'yr', 'zr'):
    data[axis] = data[axis] / data[axis].max()

# Step 3: Remove background rows (ID equal to the empty string) and reset the index
background_label = ""
data = data[data['ID'] != background_label].reset_index(drop=True)

# Step 4: Extract feature matrices and labels
# Grayscale features: spatial coordinates and weight.
# Colorized features: the grayscale columns followed by R, G, B.
grayscale_columns = ['xr', 'yr', 'zr', 'weight']
colorized_columns = grayscale_columns + ['R', 'G', 'B']
grayscale_features = data[grayscale_columns].to_numpy()
colorized_features = data[colorized_columns].to_numpy()
labels = np.array(data['ID'].tolist())

# Step 5: Encode the string IDs as integer class indices
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Step 6: Hold out 30% of the rows for testing, once per feature set.
# Both calls receive the same options (including the seed) and the same label
# array, so the two splits are expected to pick the same rows.
split_options = {"test_size": 0.3, "random_state": 42}

X_train_gray, X_test_gray, y_train_gray, y_test_gray = train_test_split(
    grayscale_features, encoded_labels, **split_options
)
X_train_color, X_test_color, y_train_color, y_test_color = train_test_split(
    colorized_features, encoded_labels, **split_options
)

# Report the size of each training partition.
for feature_set, train_matrix in (
    ("Grayscale", X_train_gray),
    ("Colorized", X_train_color),
):
    print(f"{feature_set} Training Samples: {len(train_matrix)}")

# Step 7: Drop classes that have fewer than two training samples.
# SMOTE is configured with k_neighbors=1 below, i.e. each synthetic point is
# built from a sample and one same-class neighbour, so a class needs at least
# two training rows to be resampled.
class_counts = Counter(y_train_gray)
sufficient_classes = [cls for cls, count in class_counts.items() if count > 1]

# The same row indices are applied to both feature matrices, which is only
# valid if the grayscale and colorized splits picked identical rows.
assert np.array_equal(y_train_gray, y_train_color), \
    "grayscale and colorized training splits are not aligned"

# Vectorized membership test (the original scanned a Python list per row).
filtered_indices = np.flatnonzero(np.isin(y_train_gray, sufficient_classes)).tolist()

# These three arrays were referenced but never defined in the original cell,
# which made it fail with NameError on a fresh kernel.
X_train_gray_filtered = X_train_gray[filtered_indices]
X_train_color_filtered = X_train_color[filtered_indices]
y_train_filtered = y_train_gray[filtered_indices]

# Step 8: Apply SMOTE once to both feature sets.
# Stacking the two matrices side by side means every synthetic row has a
# grayscale part and a colorized part generated from the same source samples.
# NOTE(review): the four grayscale columns also appear inside the colorized
# matrix, so they are present twice in the stacked matrix that SMOTE searches
# for neighbours — confirm this weighting is intended.
X_train_combined_filtered = np.hstack([X_train_gray_filtered, X_train_color_filtered])

smote = SMOTE(k_neighbors=1, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_combined_filtered, y_train_filtered)

# Split the resampled matrix back into its grayscale and colorized column groups
n_gray_columns = X_train_gray.shape[1]
X_train_gray_smote = X_train_smote[:, :n_gray_columns]
X_train_color_smote = X_train_smote[:, n_gray_columns:]

# Verify the shape of SMOTE results
print(f"New grayscale training samples after SMOTE: {X_train_gray_smote.shape}")
print(f"New colorized training samples after SMOTE: {X_train_color_smote.shape}")


Grayscale Training Samples: 8057
Colorized Training Samples: 8057
New grayscale training samples after SMOTE: (18270, 4)
New colorized training samples after SMOTE: (18270, 7)


In [None]:
# Classification Results on Grayscale images

In [7]:
# Step 5: Fit a random forest on the SMOTE-resampled grayscale features.
# X_train_gray_smote and y_train_smote are used together so that the feature
# rows and labels have matching sample counts.
model_gray = RandomForestClassifier(random_state=42).fit(X_train_gray_smote, y_train_smote)

# Score on the (un-resampled) grayscale test split
y_pred_gray = model_gray.predict(X_test_gray)

# Restrict the report to the encoded classes that occur in y_test and look up
# their original names through the label encoder.
test_labels_gray = np.unique(y_test_gray)
test_target_names_gray = list(label_encoder.classes_[test_labels_gray])

# Overall accuracy
accuracy_gray = accuracy_score(y_test_gray, y_pred_gray)
print(f"Overall Accuracy (Grayscale): {accuracy_gray:.4f}")

# Per-class precision / recall / F1; undefined ratios are reported as 0
print("Grayscale Feature Performance:")
report_gray = classification_report(
    y_test_gray,
    y_pred_gray,
    labels=test_labels_gray,
    target_names=test_target_names_gray,
    zero_division=0,
)
print(report_gray)


Overall Accuracy (Grayscale): 0.0559
Grayscale Feature Performance:
              precision    recall  f1-score   support

        ADAL       0.07      0.11      0.08        18
        ADAR       0.08      0.05      0.06        20
        ADEL       0.08      0.07      0.08        27
        ADER       0.00      0.00      0.00        19
        ADFL       0.00      0.00      0.00        12
        ADFR       0.00      0.00      0.00         9
        ADLL       0.00      0.00      0.00        15
        ADLR       0.00      0.00      0.00        12
        AFDL       0.05      0.06      0.05        18
        AFDR       0.08      0.07      0.07        15
        AIAL       0.00      0.00      0.00        10
        AIAR       0.00      0.00      0.00         8
        AIBL       0.07      0.04      0.05        23
        AIBR       0.15      0.09      0.11        23
        AIM?       0.00      0.00      0.00         1
        AIML       0.00      0.00      0.00        19
        AIMR 

In [None]:
# Classification Results on Colored images

In [9]:
# Step 5: Fit a random forest on the SMOTE-resampled colorized features.
# X_train_color_smote and y_train_smote are used together so that the feature
# rows and labels have matching sample counts.
model_color = RandomForestClassifier(random_state=42).fit(X_train_color_smote, y_train_smote)

# Score on the (un-resampled) colorized test split
y_pred_color = model_color.predict(X_test_color)

# Restrict the report to the encoded classes that occur in y_test and look up
# their original names through the label encoder.
test_labels_color = np.unique(y_test_color)
test_target_names_color = list(label_encoder.classes_[test_labels_color])

# Overall accuracy
accuracy_color = accuracy_score(y_test_color, y_pred_color)
print(f"Overall Accuracy (Colorized): {accuracy_color:.4f}")

# Per-class precision / recall / F1; undefined ratios are reported as 0
print("Colorized Feature Performance:")
report_color = classification_report(
    y_test_color,
    y_pred_color,
    labels=test_labels_color,
    target_names=test_target_names_color,
    zero_division=0,
)
print(report_color)


Overall Accuracy (Colorized): 0.2817
Colorized Feature Performance:
              precision    recall  f1-score   support

        ADAL       0.57      0.44      0.50        18
        ADAR       0.20      0.20      0.20        20
        ADEL       0.45      0.37      0.41        27
        ADER       0.29      0.21      0.24        19
        ADFL       0.07      0.08      0.08        12
        ADFR       0.00      0.00      0.00         9
        ADLL       0.14      0.20      0.16        15
        ADLR       0.05      0.08      0.06        12
        AFDL       0.12      0.11      0.11        18
        AFDR       0.22      0.27      0.24        15
        AIAL       0.00      0.00      0.00        10
        AIAR       0.09      0.12      0.11         8
        AIBL       0.36      0.35      0.36        23
        AIBR       0.24      0.35      0.29        23
        AIM?       0.00      0.00      0.00         1
        AIML       0.24      0.21      0.22        19
        AIMR 