In [None]:
# ✅ Install Required Libraries
!pip install kagglehub torch torchvision matplotlib scikit-learn

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, silhouette_score
import kagglehub

# ✅ Reproducibility: seed the RNGs before the DataLoader and the model are created,
# so batch shuffling and weight initialisation repeat across "Restart & Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

#  Download Malimg Dataset using Kaggle Hub
dataset_path = kagglehub.dataset_download("manmandes/malimg")
# Point at the training split inside the downloaded dataset
dataset_path = os.path.join(dataset_path, "malimg_dataset", "train")
print(f"Dataset loaded from: {dataset_path}")

# ✅ Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ✅ Define Data Preprocessing
transform = transforms.Compose([
    transforms.Resize((64, 64)),  # Resize to speed up training
    transforms.ToTensor(),
    # Maps each channel from [0, 1] to [-1, 1], matching the decoder's Tanh output range
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# ✅ Load Dataset (ImageFolder labels each image by its sub-folder)
dataset = datasets.ImageFolder(root=dataset_path, transform=transform)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# ✅ Define Deep Autoencoder
class Autoencoder(nn.Module):
    """Convolutional autoencoder for 3-channel images.

    The encoder applies three stride-2 3x3 convolutions (3 -> 16 -> 32 -> 64
    channels), each followed by ReLU. The decoder mirrors it with three
    stride-2 transposed convolutions (64 -> 32 -> 16 -> 3 channels) and ends
    with Tanh.
    """

    def __init__(self):
        super().__init__()

        # Encoder: (in_channels, out_channels) of each down-sampling stage
        down_stages = [(3, 16), (16, 32), (32, 64)]
        encoder_layers = []
        for c_in, c_out in down_stages:
            encoder_layers.append(
                nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)
            )
            encoder_layers.append(nn.ReLU())
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: mirrored up-sampling stages; the last one uses Tanh instead of ReLU
        up_stages = [(64, 32), (32, 16), (16, 3)]
        decoder_layers = []
        for stage, (c_in, c_out) in enumerate(up_stages):
            decoder_layers.append(
                nn.ConvTranspose2d(
                    c_in, c_out, kernel_size=3, stride=2, padding=1, output_padding=1
                )
            )
            is_last = stage == len(up_stages) - 1
            decoder_layers.append(nn.Tanh() if is_last else nn.ReLU())
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(latent, reconstruction)`` for the input batch ``x``."""
        latent = self.encoder(x)
        return latent, self.decoder(latent)

# ✅ Initialize Model, reconstruction loss and optimizer
model = Autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# ✅ Train Autoencoder to reconstruct its own input (labels are ignored)
epochs = 10
for epoch in range(epochs):
    batch_losses = []
    for images, _ in dataloader:
        images = images.to(device)

        # Clear old gradients, then forward / backward / update
        optimizer.zero_grad()
        encoded, decoded = model(images)
        loss = criterion(decoded, images)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch
    epoch_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(dataloader):.4f}")

# ✅ Extract Features for Clustering
# Each image's encoder output is flattened to one row; labels are collected in the
# same iteration so they stay aligned with the rows.
model.eval()
feature_batches = []
labels = []

with torch.no_grad():
    for images, lbls in dataloader:
        images = images.to(device)
        encoded, _ = model(images)
        flat = encoded.reshape(images.size(0), -1)
        feature_batches.append(flat.cpu().numpy())
        labels.extend(lbls.numpy())

latent_features = np.vstack(feature_batches)

# ✅ Apply K-Means Clustering (one cluster per entry in dataset.classes)
num_clusters = len(dataset.classes)
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(latent_features)
clusters = kmeans.labels_

# ✅ Evaluate Clustering
# ARI compares the clusters against the folder labels; silhouette uses the features only.
ari = adjusted_rand_score(labels, clusters)
silhouette = silhouette_score(latent_features, clusters)

for metric_name, metric_value in (("ARI", ari), ("Silhouette", silhouette)):
    print(f"{metric_name} Score: {metric_value:.4f}")

# ✅ Visualize Clusters
# Project the latent features onto their first two principal components
pca = PCA(n_components=2)
latent_2d = pca.fit_transform(latent_features)

fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    latent_2d[:, 0],
    latent_2d[:, 1],
    c=clusters,
    cmap="viridis",
    alpha=0.6,
)
ax.set_title("Malimg Dataset Clustering with DAC")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
fig.colorbar(points, ax=ax, label="Cluster")
plt.show()


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np
import os
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from tqdm import tqdm
import kagglehub
import os

# Step 1: Download Malimg Dataset using Kaggle Hub (a single call; the original
# cell requested the same dataset twice)
dataset_root = kagglehub.dataset_download("manmandes/malimg")

# Print the path where the dataset is downloaded
print(f"Dataset downloaded to: {dataset_root}")

# Split directories inside the downloaded dataset.
# `test_path` is defined for convenience; the code visible in this cell does not use it.
train_path = os.path.join(dataset_root, "malimg_dataset", "train")
test_path = os.path.join(dataset_root, "malimg_dataset", "test")

# The rest of this cell reads images from the training split
dataset_path = train_path
print(f"Dataset loaded from: {dataset_path}")


class MalimgDataset(Dataset):
    """Unlabelled image dataset built from every file found under ``root_dir``.

    The directory tree is walked recursively, so images stored in sub-folders
    (e.g. ``root_dir/<class>/<image>``) are picked up as well as files placed
    directly in ``root_dir``. Listing only the top level finds no files when
    all images live in sub-folders.

    Args:
        root_dir: Directory to scan for image files.
        transform: Optional callable applied to each loaded RGB PIL image.

    Attributes:
        image_paths: Sorted list of file paths, giving a deterministic sample order.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform

        collected = []
        for folder, _subdirs, file_names in os.walk(root_dir):
            for file_name in file_names:
                candidate = os.path.join(folder, file_name)
                if os.path.isfile(candidate):  # Only include files (skips broken links)
                    collected.append(candidate)
        # os.walk order depends on the filesystem; sort so indices are reproducible
        self.image_paths = sorted(collected)

        # Debug: Check how many images were loaded
        print(f"Loaded {len(self.image_paths)} images from {root_dir}")

    def __len__(self):
        """Number of image files discovered under ``root_dir``."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return it, transformed if a transform was given."""
        img_path = self.image_paths[idx]
        # The context manager closes the file handle; convert() returns an
        # independent in-memory copy, so the result stays usable afterwards.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image

# ---------------------- Load Malimg images for CNN feature extraction ----------------------

# Image Transformations
# `transform` has to be defined in this cell: nothing above defines it here, so on a
# fresh kernel the dataset construction below would raise NameError, and after
# running the first cell it would silently reuse that cell's 64x64 preprocessing.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # Per-channel statistics documented by torchvision for its ImageNet-pretrained models
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load Dataset (shuffle=False keeps extracted features in a stable order)
dataset = MalimgDataset(root_dir=dataset_path, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Debug: Check number of batches and images
print(f"Number of batches: {len(dataloader)}")
print(f"Number of images in dataset: {len(dataset)}")

# Fail fast with an actionable message instead of an opaque error further down
if len(dataset) == 0:
    raise RuntimeError(
        f"No images found under {dataset_path}. "
        "Check the dataset path and that the dataset class looks inside sub-folders."
    )

# Check a few sample images to ensure they are being loaded
for i, images in enumerate(dataloader):
    print(f"Batch {i + 1} - Number of images: {len(images)}")
    if i == 1:  # Show only a few batches for debugging
        break


# Load Pretrained CNN (ResNet as Example)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the
# checkpoint that flag used to load. Can replace with GoogleNet, Xception, etc.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
model.fc = nn.Identity()  # Remove final classification layer
model = model.to(device)
model.eval()

# Extract Features from Penultimate Layer
features_list = []

with torch.no_grad():
    for images in tqdm(dataloader, desc="Extracting Features"):
        images = images.to(device)
        features = model(images)
        features_list.append(features.cpu().numpy())

# np.vstack([]) fails with "need at least one array to concatenate"; report the
# actual cause (an empty dataloader) instead.
if not features_list:
    raise ValueError(
        "No features were extracted because the dataloader yielded no batches; "
        "check that the dataset actually contains images."
    )

features_array = np.vstack(features_list)

print(f"Feature extraction completed! Shape: {features_array.shape}")  # (Num_samples, Feature_dim)

# ---------------------- STEP 2: Train Deep Autoencoder (DAC) ----------------------

class Autoencoder(nn.Module):
    """Fully connected autoencoder over flat feature vectors.

    Encoder widths: ``input_dim -> 512 -> 256 -> 128``.
    Decoder widths: ``128 -> 256 -> 512 -> input_dim``.
    ReLU follows every linear layer except the last one of each half.

    Args:
        input_dim: Length of each input feature vector.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.encoder = self._mlp([input_dim, 512, 256, 128])
        self.decoder = self._mlp([128, 256, 512, input_dim])

    @staticmethod
    def _mlp(widths):
        """Stack Linear layers for consecutive ``widths`` with ReLU between them."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # No activation after the final Linear layer
        return nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Return ``(code, reconstruction)`` for the input batch ``x``."""
        code = self.encoder(x)
        return code, self.decoder(code)

# Initialize Autoencoder (input width = CNN feature dimension)
input_dim = features_array.shape[1]
autoencoder = Autoencoder(input_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

# Convert Features to Torch Tensor
features_tensor = torch.tensor(features_array, dtype=torch.float32).to(device)

# Train Autoencoder
num_epochs = 50
batch_size = 32
dataset_size = len(features_tensor)
# Mini-batches per epoch (ceil division); at least 1 so the mean below is always defined
num_batches = max(1, (dataset_size + batch_size - 1) // batch_size)

print("Training Deep Autoencoder...")
autoencoder.train()
for epoch in range(num_epochs):
    # Fresh random sample order every epoch
    perm = torch.randperm(dataset_size)
    epoch_loss = 0.0

    for i in range(0, dataset_size, batch_size):
        batch_idx = perm[i:i+batch_size]
        batch = features_tensor[batch_idx]

        optimizer.zero_grad()
        encoded, decoded = autoencoder(batch)
        loss = criterion(decoded, batch)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean per-batch loss (as the first cell does) rather than the sum,
    # which grows with the number of batches and is not comparable across datasets.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / num_batches:.4f}")

print("Autoencoder training completed!")

# Extract Compressed Features from Encoder
autoencoder.eval()
with torch.no_grad():
    compressed_features = autoencoder.encoder(features_tensor).cpu().numpy()

print(f"Compressed feature shape: {compressed_features.shape}")  # (Num_samples, 128)

# ---------------------- STEP 3: Apply K-Means & DAC Clustering ----------------------

# Shared K-Means settings so both runs are directly comparable.
# NOTE(review): the first cell uses len(dataset.classes) clusters; confirm that a
# fixed 10 is intended here.
N_CLUSTERS = 10
# Set n_init explicitly: the scikit-learn default differs between versions, and the
# first cell already uses 10.
KMEANS_N_INIT = 10

# Apply K-Means on Raw CNN Features
kmeans_raw = KMeans(n_clusters=N_CLUSTERS, n_init=KMEANS_N_INIT, random_state=42)
kmeans_raw_labels = kmeans_raw.fit_predict(features_array)

# Apply K-Means on Compressed Autoencoder Features (DAC)
kmeans_dac = KMeans(n_clusters=N_CLUSTERS, n_init=KMEANS_N_INIT, random_state=42)
kmeans_dac_labels = kmeans_dac.fit_predict(compressed_features)

# ---------------------- STEP 4: Visualize Clustering with t-SNE ----------------------

def plot_clusters(data, labels, title, perplexity=30, random_state=42):
    """Embed ``data`` in 2-D with t-SNE and scatter-plot it coloured by ``labels``.

    Args:
        data: Array-like of shape (n_samples, n_features) to embed.
        labels: One value per sample, used as the point colour.
        title: Figure title.
        perplexity: Requested t-SNE perplexity. It is lowered to
            ``n_samples - 1`` when the data set is too small, because t-SNE
            requires the perplexity to be below the number of samples.
        random_state: Seed passed to t-SNE so the embedding is repeatable.

    Raises:
        ValueError: If ``data`` has fewer than two samples.
    """
    n_samples = len(data)
    if n_samples < 2:
        raise ValueError("plot_clusters needs at least two samples to run t-SNE")

    # Clamp so small data sets do not trip t-SNE's perplexity < n_samples check
    effective_perplexity = min(perplexity, n_samples - 1)
    tsne = TSNE(n_components=2, perplexity=effective_perplexity, random_state=random_state)
    reduced_data = tsne.fit_transform(data)

    fig, ax = plt.subplots(figsize=(8, 6))
    points = ax.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='tab10', alpha=0.7)
    fig.colorbar(points, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("t-SNE 1")
    ax.set_ylabel("t-SNE 2")
    plt.show()

# Plot both clusterings: raw CNN features first, then the autoencoder-compressed (DAC) features
cluster_views = (
    (features_array, kmeans_raw_labels, "K-Means Clustering on Raw CNN Features"),
    (compressed_features, kmeans_dac_labels, "Deep Autoencoder-Based Clustering (DAC)"),
)
for view_data, view_labels, view_title in cluster_views:
    plot_clusters(view_data, view_labels, view_title)

print("Clustering & Visualization Completed!")


Dataset loaded from: /root/.cache/kagglehub/datasets/manmandes/malimg/versions/1
Dataset downloaded to: /root/.cache/kagglehub/datasets/manmandes/malimg/versions/1
Dataset loaded from: /root/.cache/kagglehub/datasets/manmandes/malimg/versions/1/malimg_dataset/train
Loaded 0 images from /root/.cache/kagglehub/datasets/manmandes/malimg/versions/1/malimg_dataset/train
Number of batches: 0
Number of images in dataset: 0


Extracting Features: 0it [00:00, ?it/s]


ValueError: need at least one array to concatenate