Welcome to assignment 1.                                                       

We are using pathology images for our first assignment please download data from this link https://drive.google.com/drive/folders/10dUOzcPR-PQwfFYcHk5gsLjIjSorQ32Q?usp=sharing



# Task 1: Feature Generation (15%)
# Use and run the following code (a deep network) to generate features from a set of training images. For this assignment, you do not need to know how the deep network is working here to extract features.
# This code extracts the features of image T4.tif (in the T folder of dataset). Modify the code so that it iterates over all images of the dataset and extracts their features.
# Allocate 10% of the data for validation.

# Insert your code here for Task 1





In [64]:
import os
import numpy as np
import random
import torch
import torchvision.transforms as transforms
from torchvision.models import densenet121
from torch.autograd import Variable
from PIL import Image
import matplotlib.pyplot as plt



# Load pre-trained DenseNet model
model = densenet121(weights='DenseNet121_Weights.IMAGENET1K_V1')

# Remove the classification layer (last fully connected layer)
model = torch.nn.Sequential(*list(model.children())[:-1])

# Add a global average pooling layer
model.add_module('global_avg_pool', torch.nn.AdaptiveAvgPool2d(1))

# Set the model to evaluation mode
model.eval()

# Define the image preprocessing pipeline
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load dataset folder
imageNames = []
extractedFeatures = []

#shouldn't need to change the root directory if you pulled correctly
for root, _, files in os.walk('.\\train-20240206T024149Z-001\\train'):
    for file in files:
            imageNames.append(os.path.join(root, file))
            # Load an image
            image_path = os.path.join(root, file)

            # get_image_files(image_path)

            image = Image.open(image_path)

            # Preprocess the image
            input_tensor = preprocess(image)
            input_batch = input_tensor.unsqueeze(0)  # Add a batch dimension

            # Wrap the input tensor in a Variable
            input_var = Variable(input_batch)

            # Forward pass through the model
            features = model(input_var)

            # Extract the feature tensor
            feature_vector = features.squeeze().detach().numpy()

            # Now 'feature_vector' contains the feature from the last fully connected layer of DenseNet
            #print("Feature vector shape:", feature_vector.shape)
            # for feature in feature_vector:
            #     print(feature)
            extractedFeatures.append(feature_vector)

#check to see if all images were parsed through
print(len(extractedFeatures))

# Needed to convert to np array. was having issues with just normal list
extractedFeatures = np.array(extractedFeatures)

# Splitting the data
tenPercentLength = int(len(extractedFeatures) * 0.1)
randomIndices = np.random.choice(len(extractedFeatures), size=tenPercentLength, replace=False)

# Extract the elements corresponding to the random indices for test set
validationFeatures = extractedFeatures[randomIndices]
validationImageNames = [imageNames[i] for i in randomIndices]

# Extract the remaining elements for training set
trainFeatures = np.delete(extractedFeatures, randomIndices, axis=0)
trainImageNames = [imageNames[i] for i in range(len(imageNames)) if i not in randomIndices]

# I (Jared) needed this for KMeans (if anyone else needs this I (Jared) can explain how it works)
# Get the letter labels for validation and training sets
validationLabels = [label.split('\\')[-2][-1] for label in validationImageNames]
trainLabels = [label.split('\\')[-2][-1] for label in trainImageNames]

# Create a mapping between letter labels and numeric labels
unique_labels = np.unique(trainLabels)
label_mapping = {label: i for i, label in enumerate(unique_labels)}

# Convert letter labels to numeric labels
validationLabelsNum = [label_mapping[label] for label in validationLabels]
trainLabelsNum = [label_mapping[label] for label in trainLabels]

print('Test Set Length:', len(validationFeatures))
print('Train Set Length:', len(trainFeatures))

780
Test Set Length: 78
Train Set Length: 702


# Task 2: High Bias Classification Method (5%)
# Choose a classification method and let is have a high bias.
# Train it on the generated features and discuss why it is underfitting.

# Insert your code here for Task 2




# Task 3: High Variance Classification Method (5%)
# Use the chosen classification method and let it have a high variance.
# Train it on the generated features and discuss why it is overfitting.

# Insert your code here for Task 3




# Task 4: Balanced Classification Method (15%)
# Use the chosen classification method and let it balance the bias and variance.
# Train it on the generated features, possibly adjusting parameters.
# Discuss insights into achieving balance.

# Insert your code here for Task 4




# Task 5: K-Means Clustering (20%)
# Apply K-Means clustering on the generated features.
# Test with available labels and report accuracy.
# Experiment with automated K and compare with manually set 20 clusters.

# Insert your code here for Task 5




In [97]:
from sklearn.cluster import KMeans
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

##### HELPER FUNCTIONS #####

# helper function to map the labels from the KMeans data to the training data
def map_labels(train_labels, cluster_labels, predict_labels):

    # Compute confusion matrix
    conf_matrix = confusion_matrix(train_labels, cluster_labels)

    # Apply the Hungarian algorithm to find the best matching
    row_ind, col_ind = linear_sum_assignment(-conf_matrix)

    # Map KMeans labels to train labels based on the best matching
    label_mapping = {k: v for k, v in zip(col_ind, row_ind)}

    # Map prediction (KMean) labels to original labels
    prediction_mapped = [label_mapping[label] for label in predict_labels]

    return prediction_mapped

# helper function to auto select the value of K (clusters) using the elbow method
def autoK(features, max_clusters):
    wcss = []
    
    # iterate through K = 2, 3, ..., max_clusters + 1 and keep track of WCSS value for each
    for i in range(2, max_clusters + 1):
        kmeans = KMeans(n_clusters = i, init='k-means++', random_state = 420)
        kmeans.fit(features)
        wcss.append(kmeans.inertia_)

    # Calculate the first derivative of the WCSS and use it to find the elbow point
    differences = np.diff(wcss)
    elbow_point = np.argmax(differences) + 1
        
    return elbow_point

##### MAIN CODE #####

# Apply KMeans clustering to data using K clusters = 20
kmeans_man = KMeans(n_clusters = 20, random_state = 420)
kmeans_man.fit(trainFeatures)

# predict test data and map the predicted clusters numbers to the corresponding trained cluster numbers
prediction_man = kmeans_man.predict(validationFeatures)
prediction_man_mapped = map_labels(trainLabelsNum, kmeans_man.labels_, prediction_man)

# check accuracy
accuracy = accuracy_score(validationLabelsNum, prediction_man_mapped)
print("Accuracy K (20) = ", accuracy)

# set K (auto)
K = autoK(trainFeatures, max_clusters = 40)
print("Automated K Value = ", K)

# Apply KMeans clustering to data using K clusters = auto
kmeans_auto = KMeans(n_clusters = K, random_state = 420)
kmeans_auto.fit(trainFeatures)

# predict test data and map the predicted clusters numbers to the corresponding trained cluster numbers
prediction_auto = kmeans_auto.predict(validationFeatures)
prediction_auto_mapped = map_labels(trainLabelsNum, kmeans_auto.labels_, prediction_auto)

# check accuracy
accuracy = accuracy_score(validationLabelsNum, prediction_auto_mapped)
print("Accuracy K (Auto) = ", accuracy)

# plot results
#fig = plt.figure(2, figsize = (11,4))

# K = 20
#ax = fig.add_subplot(121, projection='3d')
#ax.scatter(trainFeatures[:,0],trainFeatures[:,1], trainFeatures[:,2], c=labels_man, cmap='Set2')
#plt.title('KMeans using sklearn, K = 20')

# auto K
#ax = fig.add_subplot(122,projection='3d')
#ax.scatter(trainFeatures[:,0],trainFeatures[:,1], trainFeatures[:,2], c=labels_auto, cmap='Set2')
#plt.title('KMeans using sklearn, K = auto')
#plt.show()


Accuracy K (20) =  0.7948717948717948
Automated K Value =  17
Accuracy K (Auto) =  0.782051282051282


# Task 6: Additional Clustering Algorithm (10%)
# Choose another clustering algorithm and apply it on the features.
# Test accuracy with available labels.

# Insert your code here for Task 6




# Task 7: PCA for Classification Improvement (20%)
# Apply PCA on the features and then feed them to the best classification method in the above tasks.
# Assess if PCA improves outcomes and discuss the results.

# Insert your code here for Task 7




# Task 8: Visualization and Analysis (10%)
# Plot the features in a lower dimension using dimentinality reduction techniques.
# Analyze the visual representation, identifying patterns or insights.

# Insert your code here for Task 8