In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from torchvision.datasets import ImageFolder

In [2]:
output_dir = '/home/ec2-user/SageMaker/prediction_July4/Dataset'

In [3]:
def get_file_paths(data_dir: str) -> list:
    """Take a directory as input and return a list of paths to all files inside."""
    file_paths = []
    for root_path, _, files in os.walk(data_dir):
        for file in files:
            if file.endswith('png'):
                file_paths.append(os.path.join(root_path, file))
    return file_paths

In [4]:
def load_images(file_paths, dim=(128, 128), verbose=False):
    print(f"Processing {len(file_paths)} image paths...") if verbose else None
    images = []
    
    transform = transforms.Compose([
        transforms.Resize(dim),  # Resize the image
        transforms.ToTensor()  # Convert PIL image to PyTorch tensor
    ])

    for i, file_path in enumerate(file_paths):
        f_path = os.path.join(file_path)
        if "checkpoint" in f_path:
            print(f"[{i+1}] Skipping '{f_path}' (contains 'checkpoint')...") if verbose else None
            continue
        
        # print(f"[{i+1}] Loading '{f_path}' . . .") if verbose else None
        image = Image.open(f_path)
        
        # Convert to RGB if the image has an alpha channel
        if image.mode == 'RGBA':
            image = image.convert('RGB')
        
        image = transform(image)
        images.append(image)
    
    # Resize images to a consistent size
    resized_images = []
    for image in images:
        if image.shape != (3, dim[0], dim[1]):
            resize_transform = transforms.Resize((dim[1], dim[0]))
            image = resize_transform(image)
        resized_images.append(image)
    
    return torch.stack(resized_images)

In [5]:
file_paths = get_file_paths(output_dir)
print(f"Total {len(file_paths)} images in directory.")

Total 501 images in directory.


In [6]:
# Take a sample of images
percent = 100
N = int(len(file_paths)*percent)
indexes = np.random.randint(low=0, high=len(file_paths), size=N)
sample_files = [path for (i, path) in enumerate(file_paths) if i in indexes]
X = load_images(sample_files, verbose=True, dim=(128, 128))

Processing 501 image paths...
[501] Skipping '/home/ec2-user/SageMaker/prediction_July4/Dataset/.ipynb_checkpoints/soloAUV 20220909-022428-001-checkpoint.png' (contains 'checkpoint')...


In [7]:
def show_images(X, n=10, h=28, w=28, latent_vector=False, title=None, plot_type="grid", figsize=(10, 4)):
    if plot_type == "grid":
        num_rows = int(np.ceil(n / 10))
        fig, axes = plt.subplots(num_rows, 10, figsize=figsize)
        fig.suptitle(title, fontsize=14)
        axes = axes.flatten()
        for i in range(n):
            ax = axes[i]
            if latent_vector:
                img = X[i].reshape(h, w)
            else:
                img = np.transpose(X[i], (1, 2, 0))
            ax.imshow(img, cmap='gray')
            ax.axis('off')
    elif plot_type == "flat":
        fig, axes = plt.subplots(1, n, figsize=figsize)
        fig.suptitle(title, fontsize=14)
        for i in range(n):
            ax = axes[i]
            if latent_vector:
                img = X[i].reshape(h, w)
            else:
                img = np.transpose(X[i], (1, 2, 0))
            ax.imshow(img, cmap='gray')
            ax.axis('off')
    plt.show()


# Building Autoencoder

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(8, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Code (latent vector)
        self.code = nn.MaxPool2d(kernel_size=2)

        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(8, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2, mode='nearest'),
            nn.ConvTranspose2d(8, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2, mode='nearest'),
            nn.ConvTranspose2d(8, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Upsample(scale_factor=2, mode='nearest'),
            nn.Conv2d(16, 3, kernel_size=3, padding=1)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        code = self.code(encoded)
        decoded = self.decoder(code)
        return decoded

In [10]:


# Define the batch size
batch_size = 32

# Create data loaders
loader = DataLoader(X.to(device), batch_size=batch_size, shuffle=True)

# Define the number of epochs
epochs = 50

# PredictionLoop

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt

# Assuming you have defined Autoencoder class as mentioned in your last message

# Set device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assuming you have already defined train_loader and test_loader

# Load the model checkpoint and state dictionary
checkpoint = torch.load('/home/ec2-user/SageMaker/autoencoder_weights.pt', map_location=device)

autoencoder = Autoencoder().to(device)

# Load the encoder's state dictionary
encoder_state_dict = {
    k.replace('encoder.', ''): v for k, v in checkpoint.items() if k.startswith('encoder.')
}
autoencoder.encoder.load_state_dict(encoder_state_dict)

# Load the decoder's state dictionary
decoder_state_dict = {
    k.replace('decoder.', ''): v for k, v in checkpoint.items() if k.startswith('decoder.')
}
autoencoder.decoder.load_state_dict(decoder_state_dict)

# Set the model to evaluation mode
autoencoder.eval()


# Initialize lists to store latent vectors and targets
latent_vectors = []
targets = []

# Generate latent vectors using the data from the loader
with torch.no_grad():
    for batch_data in loader:
        # Move data to CUDA device
        batch_data = batch_data.to(device)
        
        # Forward pass to get the latent vectors
        latent_vector = autoencoder.code(autoencoder.encoder(batch_data))
        
        # Move the latent vector and target back to the CPU
        latent_vectors.extend(latent_vector.cpu().numpy())
        targets.extend(batch_data.cpu().numpy())


# Convert lists to numpy arrays
latent_vectors = np.array(latent_vectors)

# Flatten the latent_vectors array
latent_vectors = latent_vectors.reshape(latent_vectors.shape[0], -1)

print(latent_vectors.shape)
gmm.fit_predict(latent_vectors)

import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Apply Gaussian Mixture Model for clustering
k_value = 3
gmm = GaussianMixture(n_components=k_value)
cluster_labels = gmm.fit_predict(latent_vectors)

import os

output_dir = "./cluster analysis"
os.makedirs(output_dir, exist_ok=True)

# Create subfolders for each cluster
for i in range(k_value):
    cluster_dir = os.path.join(output_dir, f"cluster_{i}")
    os.makedirs(cluster_dir, exist_ok=True)

from PIL import Image

# Loop through the images and save them to the corresponding cluster folders
for idx, (image, cluster_label) in enumerate(zip(targets, cluster_labels)):
    cluster_dir = os.path.join(output_dir, f"cluster_{cluster_label}")
    image_filename = f"image_{idx}.png"
    image_path = os.path.join(cluster_dir, image_filename)

    # Convert the numpy array to an image using PIL
    pil_image = Image.fromarray((image * 255).astype(np.uint8), mode='L')
    
    # Save the image
    pil_image.save(image_path)

    
# Continue with the code to create the t-SNE plot using the `clusters` dictionary

# Concatenate the clusters into a single array
concatenated_clusters = np.concatenate(list(clusters.values()))

# Create a DataFrame for the concatenated clusters
df_concatenated = pd.DataFrame(concatenated_clusters)

# Add a column for cluster labels
df_concatenated['clusterid'] = np.concatenate([np.full(len(cluster), cid) for cid, cluster in clusters.items()])

# Perform t-SNE on the concatenated data
tsne = TSNE(n_components=2, random_state=42)
tsne_representation = tsne.fit_transform(df_concatenated.drop('clusterid', axis=1))

# Create a scatter plot of the t-SNE representation with cluster labels
plt.figure(figsize=(8, 6))
sns.scatterplot(x=tsne_representation[:, 0], y=tsne_representation[:, 1], hue=df_concatenated['clusterid'], palette='Set1', legend='full')
plt.title('t-SNE Plot of All Clusters')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()


(500, 512)


ValueError: Too many dimensions: 3 > 2.

In [13]:
print(checkpoint.keys())

odict_keys(['encoder.0.weight', 'encoder.0.bias', 'encoder.3.weight', 'encoder.3.bias', 'encoder.6.weight', 'encoder.6.bias', 'decoder.0.weight', 'decoder.0.bias', 'decoder.3.weight', 'decoder.3.bias', 'decoder.6.weight', 'decoder.6.bias', 'decoder.9.weight', 'decoder.9.bias'])


In [None]:
import torch
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt

# Load the model checkpoint and state dictionary
checkpoint = torch.load('/home/ec2-user/SageMaker/Zooplankon_latent_AE.pt')
autoencoder = Autoencoder().to(device)
autoencoder.load_state_dict(checkpoint['model_state_dict'])
autoencoder.eval()

# Initialize lists to store latent vectors and targets
latent_vectors = []
targets = []

# Generate latent vectors using the data from the loader
with torch.no_grad():
    for batch_data, _ in loader:
        # Resize the target tensors to match the output size
        target = torch.nn.functional.interpolate(batch_data, size=(128, 128), mode='bilinear', align_corners=False)
            
        # Move data to CUDA device
        batch_data = batch_data.to(device)
        target = target.to(device)
            
        # Forward pass to get the latent vectors
        latent_vector = autoencoder.encode(batch_data)
        
        # Move the latent vector and target back to the CPU
        latent_vectors.extend(latent_vector.cpu().numpy())
        targets.extend(target.cpu().numpy())

# Convert lists to numpy arrays
latent_vectors = np.array(latent_vectors)
targets = np.array(targets)

# Apply Gaussian Mixture Model for clustering
k_value = 2
gmm = GaussianMixture(n_components=k_value)
cluster_labels = gmm.fit_predict(latent_vectors)

# Visualization of the clusters
plt.scatter(latent_vectors[cluster_labels == 0, 0], latent_vectors[cluster_labels == 0, 1], c='red', label='Cluster 1')
plt.scatter(latent_vectors[cluster_labels == 1, 0], latent_vectors[cluster_labels == 1, 1], c='blue', label='Cluster 2')
plt.title('Latent Space Clustering')
plt.xlabel('Latent Dimension 1')
plt.ylabel('Latent Dimension 2')
plt.legend()
plt.show()


# EM Clustering

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture

# Define the number of clusters and random state
k = 7
random_state = np.random.randint(0, 500)

# Fit Gaussian Mixture Model
gmm = GaussianMixture(n_components=k, random_state=random_state)
gmm.fit(latent_vector.cpu())
cluster_id = gmm.predict(latent_vector.cpu())

# Create DataFrame with cluster labels
df = pd.DataFrame(X_test.cpu().reshape(X_test.shape[0], X_test.shape[1]*X_test.shape[2]*X_test.shape[3]))
df["clusterid"] = cluster_id

# Store clusters in a dictionary
clusters = dict()
for cid in df.clusterid.unique():
    clusters[cid] = df[df.clusterid == cid].drop("clusterid", axis="columns").to_numpy()

# Function to visualize random samples
def show_random_samples(X, n=12, h=400, w=400, latent_vector=False, title="", figsize=(16, 16), plot_type="grid"):
    # Reshape if needed
    if X.ndim == 2 and latent_vector:
        X = X.reshape(X.shape[0], 3, h, w)
    if X.ndim == 2:
        X = X.reshape(X.shape[0], 3, h, w)
    
    # Take random sample
    idxs = np.random.randint(len(X), size=n)
    
    # Create a list to store images from each cluster
    cluster_images = [[] for _ in range(n)]
    
    # Collect images from each cluster
    for i, idx in enumerate(idxs):
        cluster_images[i % n].append(X[idx])
    
    # Calculate optimal figsize for horizontal display
    image_height = cluster_images[0][0].shape[1]
    image_width = cluster_images[0][0].shape[2]
    figsize = (figsize[0], (figsize[0] / n) * (image_height / image_width))
    
    # Plot images horizontally for each cluster
    fig, ax = plt.subplots(nrows=1, ncols=n, figsize=figsize)
    for i, images in enumerate(cluster_images):
        combined_image = np.concatenate(images, axis=2)
        ax[i].imshow(combined_image.transpose(1, 2, 0))
        ax[i].axis('off')
        ax[i].set_aspect('auto')
    
    # Show plot
    plt.suptitle(title, fontsize=20)
    plt.tight_layout()
    plt.show()

# Visualize clusters
for cid in sorted(clusters.keys()):
    show_random_samples(clusters[cid], h=128, w=128, n=8, title=f"cluster {cid}", plot_type="grid")


In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Assuming you have the `clusters` dictionary containing the cluster data

# Concatenate the clusters into a single array
concatenated_clusters = np.concatenate(list(clusters.values()))

# Create a DataFrame for the concatenated clusters
df_concatenated = pd.DataFrame(concatenated_clusters)

# Add a column for cluster labels
df_concatenated['clusterid'] = np.concatenate([np.full(len(cluster), cid) for cid, cluster in clusters.items()])

# Perform t-SNE on the concatenated data
tsne = TSNE(n_components=2, random_state=42)
tsne_representation = tsne.fit_transform(df_concatenated.drop('clusterid', axis=1))

# Create a scatter plot of the t-SNE representation with cluster labels
plt.figure(figsize=(8, 6))
sns.scatterplot(x=tsne_representation[:, 0], y=tsne_representation[:, 1], hue=df_concatenated['clusterid'], palette='Set1', legend='full')
plt.title('t-SNE Plot of All Clusters')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()
