<a href="https://colab.research.google.com/github/nannthd/project_AIEngineer/blob/main/Fine_Tune_ResNet50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip '/content/drive/MyDrive/drug/data 50 class add_augment3.zip'

In [None]:
!pip install torch

In [None]:
!pip install tensorflow

#Fine-Tune ResNet50 (224)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import time

# ตั้งค่าพารามิเตอร์
batch_size = 16
learning_rate = 0.001
num_epochs = 10
image_size = (224, 224)

# ตรวจสอบว่า GPU สามารถใช้ได้หรือไม่
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# โหลดชุดข้อมูล
transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.ToTensor(),
])

# กำหนดเส้นทางไปยังชุดข้อมูล
train_data_path = '/content/data 50 class add_augment'

# สร้าง ImageFolder สำหรับชุดข้อมูลฝึก
train_dataset = ImageFolder(root=train_data_path, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# ตรวจสอบจำนวนคลาส
num_classes = len(train_dataset.classes)
print(f'Number of classes: {num_classes}')

# โหลดโมเดล ResNet-50 ที่ผ่านการฝึกมาแล้ว
model = models.resnet50(pretrained=True)

# ตรึงเลเยอร์ทั้งหมด
for param in model.parameters():
    param.requires_grad = False

# ปรับเปลี่ยนเลเยอร์สุดท้าย
model.fc = nn.Linear(model.fc.in_features, num_classes)

# ย้ายโมเดลไปที่อุปกรณ์ (GPU หรือ CPU)
model.to(device)

# เลือก optimizer และ loss function
optimizer = optim.Adam(model.fc.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# ฟังก์ชันเพื่อแปลงเวลาเป็นชั่วโมง, นาที, และวินาที
def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    return f'{hours}h {minutes}m {secs}s'

# ฝึกโมเดล
model.train()
start_time = time.time()  # เริ่มจับเวลา

for epoch in range(num_epochs):
    epoch_start_time = time.time()  # เริ่มจับเวลาแต่ละ epoch
    for inputs, labels in train_loader:
        # ย้ายข้อมูลไปที่อุปกรณ์ (GPU หรือ CPU)
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    epoch_time = time.time() - epoch_start_time  # เวลาที่ใช้ในแต่ละ epoch
    total_time = time.time() - start_time  # เวลาทั้งหมดที่ใช้ในการฝึก
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Epoch Time: {format_time(epoch_time)}, Total Time: {format_time(total_time)}')

print(f'Finished Training. Total Time: {format_time(total_time)}')

Number of classes: 50
Epoch 1/10, Loss: 0.721778154373169, Epoch Time: 0h 3m 18s, Total Time: 0h 3m 18s
Epoch 2/10, Loss: 0.33759504556655884, Epoch Time: 0h 3m 14s, Total Time: 0h 6m 32s
Epoch 3/10, Loss: 0.07416685670614243, Epoch Time: 0h 3m 15s, Total Time: 0h 9m 48s
Epoch 4/10, Loss: 0.055027905851602554, Epoch Time: 0h 3m 13s, Total Time: 0h 13m 1s
Epoch 5/10, Loss: 0.03758874535560608, Epoch Time: 0h 3m 13s, Total Time: 0h 16m 15s
Epoch 6/10, Loss: 0.22012344002723694, Epoch Time: 0h 3m 12s, Total Time: 0h 19m 27s
Epoch 7/10, Loss: 0.02919219620525837, Epoch Time: 0h 3m 13s, Total Time: 0h 22m 41s
Epoch 8/10, Loss: 0.03523397445678711, Epoch Time: 0h 3m 14s, Total Time: 0h 25m 55s
Epoch 9/10, Loss: 0.01805408112704754, Epoch Time: 0h 3m 12s, Total Time: 0h 29m 8s
Epoch 10/10, Loss: 0.05058996379375458, Epoch Time: 0h 3m 17s, Total Time: 0h 32m 26s
Finished Training. Total Time: 0h 32m 26s


In [None]:
# เซฟโมเดล
model_save_path = './model_resnet50_224.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model saved to {model_save_path}')

Model saved to ./model_resnet50_224.pth


In [None]:
model_save_path = model_save_path = '/content/drive/MyDrive/drug/FineTune/model_resnet50_224.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model saved to {model_save_path}')

Model saved to /content/drive/MyDrive/drug/newtrain/model_resnet50_224.pth


#Fine-Tune ResNet50 (640)

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
import time

# ตั้งค่าพารามิเตอร์
batch_size = 16
learning_rate = 0.001
num_epochs = 10
image_size = (640, 640)

# ตรวจสอบว่า GPU สามารถใช้ได้หรือไม่
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# โหลดชุดข้อมูล
transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.ToTensor(),
])

# กำหนดเส้นทางไปยังชุดข้อมูล
train_data_path = '/content/data 50 class add_augment'

# สร้าง ImageFolder สำหรับชุดข้อมูลฝึก
train_dataset = ImageFolder(root=train_data_path, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# ตรวจสอบจำนวนคลาส
num_classes = len(train_dataset.classes)
print(f'Number of classes: {num_classes}')

# โหลดโมเดล ResNet-50 ที่ผ่านการฝึกมาแล้ว
model = models.resnet50(pretrained=True)

# ตรึงเลเยอร์ทั้งหมด
for param in model.parameters():
    param.requires_grad = False

# ปรับเปลี่ยนเลเยอร์สุดท้าย
model.fc = nn.Linear(model.fc.in_features, num_classes)

# ย้ายโมเดลไปที่อุปกรณ์ (GPU หรือ CPU)
model.to(device)

# เลือก optimizer และ loss function
optimizer = optim.Adam(model.fc.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# ฟังก์ชันเพื่อแปลงเวลาเป็นชั่วโมง, นาที, และวินาที
def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    return f'{hours}h {minutes}m {secs}s'

# ฝึกโมเดล
model.train()
start_time = time.time()  # เริ่มจับเวลา

for epoch in range(num_epochs):
    epoch_start_time = time.time()  # เริ่มจับเวลาแต่ละ epoch
    for inputs, labels in train_loader:
        # ย้ายข้อมูลไปที่อุปกรณ์ (GPU หรือ CPU)
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    epoch_time = time.time() - epoch_start_time  # เวลาที่ใช้ในแต่ละ epoch
    total_time = time.time() - start_time  # เวลาทั้งหมดที่ใช้ในการฝึก
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Epoch Time: {format_time(epoch_time)}, Total Time: {format_time(total_time)}')

print(f'Finished Training. Total Time: {format_time(total_time)}')

Number of classes: 50


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 156MB/s]


Epoch 1/10, Loss: 1.1659988164901733, Epoch Time: 0h 4m 32s, Total Time: 0h 4m 32s
Epoch 2/10, Loss: 0.6981589198112488, Epoch Time: 0h 4m 23s, Total Time: 0h 8m 55s
Epoch 3/10, Loss: 0.133559450507164, Epoch Time: 0h 4m 26s, Total Time: 0h 13m 22s
Epoch 4/10, Loss: 0.6324554681777954, Epoch Time: 0h 4m 22s, Total Time: 0h 17m 45s
Epoch 5/10, Loss: 0.13220003247261047, Epoch Time: 0h 4m 26s, Total Time: 0h 22m 11s
Epoch 6/10, Loss: 0.11460687965154648, Epoch Time: 0h 4m 23s, Total Time: 0h 26m 35s
Epoch 7/10, Loss: 0.11895160377025604, Epoch Time: 0h 4m 23s, Total Time: 0h 30m 59s
Epoch 8/10, Loss: 0.20608116686344147, Epoch Time: 0h 4m 21s, Total Time: 0h 35m 22s
Epoch 9/10, Loss: 0.08346973359584808, Epoch Time: 0h 4m 22s, Total Time: 0h 39m 44s
Epoch 10/10, Loss: 0.2149277627468109, Epoch Time: 0h 4m 21s, Total Time: 0h 44m 6s
Finished Training. Total Time: 0h 44m 6s


In [6]:
# เซฟโมเดล
model_save_path = './model_resnet50_640.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model saved to {model_save_path}')

Model saved to ./model_resnet50_640.pth


In [7]:
model_save_path = model_save_path = '/content/drive/MyDrive/drug/FineTune/model_resnet50_640.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model saved to {model_save_path}')

Model saved to /content/drive/MyDrive/drug/FineTune/model_resnet50_640.pth


#image2vector

##224*224

###50cls

In [None]:
import pandas as pd
import numpy as np
import os
from PIL import Image
import torch
from torchvision import models, transforms
from tqdm import tqdm

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a feature extractor model
class FeatureExtractor(nn.Module):
    def __init__(self, base_model):
        super(FeatureExtractor, self).__init__()
        self.base_model = base_model
        self.base_model.fc = nn.Identity()  # Remove the final classification layer

    def forward(self, x):
        return self.base_model(x)

# Initialize ResNet-50 model and feature extractor
resnet50_model = models.resnet50(pretrained=False)
num_classes = 50  # Replace with the actual number of classes from your training
resnet50_model.fc = nn.Linear(resnet50_model.fc.in_features, num_classes)  # Replace the final layer with your trained layer
model_path = '/content/drive/MyDrive/drug/FineTune/model_resnet50_224.pth'
resnet50_model.load_state_dict(torch.load(model_path))
feature_extractor = FeatureExtractor(resnet50_model).to(device)
feature_extractor.eval()  # Set the model to evaluation mode

# Define the image transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def image_embedding(path):
    img = Image.open(path).convert('RGB')  # Convert image to RGB
    img = transform(img).unsqueeze(0).to(device)  # Apply transformation and add batch dimension

    with torch.no_grad():
        # Extract features from the penultimate layer
        features = feature_extractor(img)
        avg_embedding = features.squeeze().cpu().numpy()  # Convert to numpy array and remove batch dimension

    curr_df = pd.DataFrame(avg_embedding).T
    return curr_df

def process_images_in_folder(folder_path, output_csv_path):
    pdEmbedded = pd.DataFrame()
    image_files = [f for f in os.listdir(folder_path) if f.endswith(('.png', '.jpg', '.jpeg'))]

    print(f"Processing images in folder: {folder_path}")

    for idx, image_file in enumerate(tqdm(image_files, desc="Processing images", unit="image")):
        image_path = os.path.join(folder_path, image_file)
        embedded = image_embedding(image_path)
        embedded['ID'] = image_file  # Add ID column to the DataFrame
        pdEmbedded = pd.concat([pdEmbedded, embedded], ignore_index=True)

    # Ensure output directory exists
    output_folder = os.path.dirname(output_csv_path)
    os.makedirs(output_folder, exist_ok=True)

    # Save DataFrame to CSV
    pdEmbedded.to_csv(output_csv_path, index=False)
    print(f"Saved embedding data for {folder_path} to {output_csv_path}")

def process_all_folders(base_folder_path, output_base_folder):
    subfolders = [f.path for f in os.scandir(base_folder_path) if f.is_dir()]

    print(f"Processing folders in base directory: {base_folder_path}")

    for subfolder in subfolders:
        subfolder_name = os.path.basename(subfolder)
        output_csv_path = os.path.join(output_base_folder, f"{subfolder_name}.csv")
        process_images_in_folder(subfolder, output_csv_path)
        print(f"Completed processing for folder: {subfolder_name}")

# Example usage
base_folder_path = '/content/data 50 class add_augment'
output_base_folder = '/content/Vector_ResNet50_FT224'
os.makedirs(output_base_folder, exist_ok=True)

process_all_folders(base_folder_path, output_base_folder)

In [12]:
# Export Data drug50cls
# Define the folder you want to zip and download
import shutil

folder_to_download = '/content/Vector_ResNet50_FT224'

# Zip the folder
shutil.make_archive('/content/drive/MyDrive/drug/FineTune/Vector_ResNet50_FT224', 'zip', folder_to_download)

# # Download the zip file
# from google.colab import files
# files.download('/content/drive/MyDrive/drug/FineTune/content/Vector_ResNet50_FT224.zip')

'/content/drive/MyDrive/drug/FineTune/Vector_ResNet50_FT224.zip'

###Cosine similarity

In [None]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a feature extractor model
class FeatureExtractor(nn.Module):
    def __init__(self, base_model):
        super(FeatureExtractor, self).__init__()
        self.base_model = base_model
        self.base_model.fc = nn.Identity()  # Remove the final classification layer

    def forward(self, x):
        return self.base_model(x)

# Initialize ResNet-50 model and feature extractor
resnet50_model = models.resnet50(pretrained=False)
num_classes = 50  # Replace with the actual number of classes from your training
resnet50_model.fc = nn.Linear(resnet50_model.fc.in_features, num_classes)  # Replace the final layer with your trained layer
model_path = '/content/model_resnet50.pth'
resnet50_model.load_state_dict(torch.load(model_path))
feature_extractor = FeatureExtractor(resnet50_model).to(device)
feature_extractor.eval()  # Set the model to evaluation mode

# Define the image transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def image_embedding(path):
    img = Image.open(path).convert('RGB')  # Convert image to RGB
    img = transform(img).unsqueeze(0).to(device)  # Apply transformation and add batch dimension

    with torch.no_grad():
        # Extract features from the penultimate layer
        features = feature_extractor(img)
        avg_embedding = features.squeeze().cpu().numpy()  # Convert to numpy array and remove batch dimension

    curr_df = pd.DataFrame(avg_embedding).T
    return curr_df

def load_embeddings_from_csv(csv_folder):
    all_embeddings = {}
    for csv_file in os.listdir(csv_folder):
        if csv_file.endswith('.csv'):
            class_name = os.path.splitext(csv_file)[0]  # Use CSV file name as class name
            csv_path = os.path.join(csv_folder, csv_file)
            df = pd.read_csv(csv_path)
            # Drop ID column and keep only class embeddings
            embeddings_no_id = df.drop(['ID'], axis=1)
            all_embeddings[class_name] = embeddings_no_id
    return all_embeddings

def find_most_similar_classes(new_image_path, all_embeddings):
    new_embedding = image_embedding(new_image_path)
    similarity_scores = {}

    for class_name, embeddings_df in all_embeddings.items():
        # Compute similarity between new image embedding and class embeddings
        similarity_score = cosine_similarity(new_embedding, embeddings_df)
        max_similarity_score = similarity_score.max()  # Get highest similarity score in class
        similarity_scores[class_name] = max_similarity_score

    sorted_similarity = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)
    top_5_similar_classes = sorted_similarity[:5]

    return top_5_similar_classes

# Load embeddings from CSV folder
csv_folder_path = '/content/ResNet50_FT_model_v1'
all_embeddings = load_embeddings_from_csv(csv_folder_path)

# Image to test
new_image_path = '/content/cropped_11_0_0.jpg'

# Find top 5 most similar classes
similar_classes = find_most_similar_classes(new_image_path, all_embeddings)

# Print results
print("Top 5 most similar classes:")
for rank, (class_name, similarity_score) in enumerate(similar_classes, start=1):
    print(f"{rank}. Class: {class_name}, Similarity Score: {similarity_score}")



Top 5 most similar classes:
1. Class: Novonorm2mg, Similarity Score: 0.8101994092840938
2. Class: Novonorm1mg, Similarity Score: 0.7914134416210582
3. Class: BlopressPlus8mg, Similarity Score: 0.7524376232783746
4. Class: Prenolol50mg, Similarity Score: 0.7453936909688319
5. Class: Amlopine10mg, Similarity Score: 0.7168482991407736


##640*640

###50cls

In [None]:
import pandas as pd
import numpy as np
import os
from PIL import Image
import torch
from torchvision import models, transforms
from tqdm import tqdm
import torch.nn as nn

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a feature extractor model
class FeatureExtractor(nn.Module):
    def __init__(self, base_model):
        super(FeatureExtractor, self).__init__()
        self.base_model = base_model
        self.base_model.fc = nn.Identity()  # Remove the final classification layer

    def forward(self, x):
        return self.base_model(x)

# Initialize ResNet-50 model and feature extractor
resnet50_model = models.resnet50(pretrained=False)
num_classes = 50  # Replace with the actual number of classes from your training
resnet50_model.fc = nn.Linear(resnet50_model.fc.in_features, num_classes)  # Replace the final layer with your trained layer
model_path = '/content/drive/MyDrive/drug/FineTune/model_resnet50_640.pth'
resnet50_model.load_state_dict(torch.load(model_path))
feature_extractor = FeatureExtractor(resnet50_model).to(device)
feature_extractor.eval()  # Set the model to evaluation mode

# Define the image transformation
transform = transforms.Compose([
    transforms.Resize((640, 640)),
    transforms.ToTensor(),
])

def image_embedding(path):
    img = Image.open(path).convert('RGB')  # Convert image to RGB
    img = transform(img).unsqueeze(0).to(device)  # Apply transformation and add batch dimension

    with torch.no_grad():
        # Extract features from the penultimate layer
        features = feature_extractor(img)
        avg_embedding = features.squeeze().cpu().numpy()  # Convert to numpy array and remove batch dimension

    curr_df = pd.DataFrame(avg_embedding).T
    return curr_df

def process_images_in_folder(folder_path, output_csv_path):
    pdEmbedded = pd.DataFrame()
    image_files = [f for f in os.listdir(folder_path) if f.endswith(('.png', '.jpg', '.jpeg'))]

    print(f"Processing images in folder: {folder_path}")

    for idx, image_file in enumerate(tqdm(image_files, desc="Processing images", unit="image")):
        image_path = os.path.join(folder_path, image_file)
        embedded = image_embedding(image_path)
        embedded['ID'] = image_file  # Add ID column to the DataFrame
        pdEmbedded = pd.concat([pdEmbedded, embedded], ignore_index=True)

    # Ensure output directory exists
    output_folder = os.path.dirname(output_csv_path)
    os.makedirs(output_folder, exist_ok=True)

    # Save DataFrame to CSV
    pdEmbedded.to_csv(output_csv_path, index=False)
    print(f"Saved embedding data for {folder_path} to {output_csv_path}")

def process_all_folders(base_folder_path, output_base_folder):
    subfolders = [f.path for f in os.scandir(base_folder_path) if f.is_dir()]

    print(f"Processing folders in base directory: {base_folder_path}")

    for subfolder in subfolders:
        subfolder_name = os.path.basename(subfolder)
        output_csv_path = os.path.join(output_base_folder, f"{subfolder_name}.csv")
        process_images_in_folder(subfolder, output_csv_path)
        print(f"Completed processing for folder: {subfolder_name}")

# Example usage
base_folder_path = '/content/data 50 class add_augment'
output_base_folder = '/content/Vector_ResNet50_FT640'
os.makedirs(output_base_folder, exist_ok=True)

process_all_folders(base_folder_path, output_base_folder)

In [14]:
# Export Data drug50cls
# Define the folder you want to zip and download
import shutil

folder_to_download = '/content/Vector_ResNet50_FT640'

# Zip the folder
shutil.make_archive('/content/drive/MyDrive/drug/FineTune/Vector_ResNet50_FT640', 'zip', folder_to_download)

# # Download the zip file
# from google.colab import files
# files.download('/content/drive/MyDrive/drug/drug50cls_ResNet50_FT_model_v1_640.zip')

'/content/drive/MyDrive/drug/FineTune/Vector_ResNet50_FT640.zip'

###Cosine similarity

In [None]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a feature extractor model
class FeatureExtractor(nn.Module):
    def __init__(self, base_model):
        super(FeatureExtractor, self).__init__()
        self.base_model = base_model
        self.base_model.fc = nn.Identity()  # Remove the final classification layer

    def forward(self, x):
        return self.base_model(x)

# Initialize ResNet-50 model and feature extractor
resnet50_model = models.resnet50(pretrained=False)
num_classes = 50  # Replace with the actual number of classes from your training
resnet50_model.fc = nn.Linear(resnet50_model.fc.in_features, num_classes)  # Replace the final layer with your trained layer
model_path = '/content/drive/MyDrive/drug/trained_resnet50_epoch_24.pth'
resnet50_model.load_state_dict(torch.load(model_path))
feature_extractor = FeatureExtractor(resnet50_model).to(device)
feature_extractor.eval()  # Set the model to evaluation mode

# Define the image transformation
transform = transforms.Compose([
    transforms.Resize((640, 640)),
    transforms.ToTensor(),
])

def image_embedding(path):
    img = Image.open(path).convert('RGB')  # Convert image to RGB
    img = transform(img).unsqueeze(0).to(device)  # Apply transformation and add batch dimension

    with torch.no_grad():
        # Extract features from the penultimate layer
        features = feature_extractor(img)
        avg_embedding = features.squeeze().cpu().numpy()  # Convert to numpy array and remove batch dimension

    curr_df = pd.DataFrame(avg_embedding).T
    return curr_df

def load_embeddings_from_csv(csv_folder):
    all_embeddings = {}
    for csv_file in os.listdir(csv_folder):
        if csv_file.endswith('.csv'):
            class_name = os.path.splitext(csv_file)[0]  # Use CSV file name as class name
            csv_path = os.path.join(csv_folder, csv_file)
            df = pd.read_csv(csv_path)
            # Drop ID column and keep only class embeddings
            embeddings_no_id = df.drop(['ID'], axis=1)
            all_embeddings[class_name] = embeddings_no_id
    return all_embeddings

def find_most_similar_classes(new_image_path, all_embeddings):
    new_embedding = image_embedding(new_image_path)
    similarity_scores = {}

    for class_name, embeddings_df in all_embeddings.items():
        # Compute similarity between new image embedding and class embeddings
        similarity_score = cosine_similarity(new_embedding, embeddings_df)
        max_similarity_score = similarity_score.max()  # Get highest similarity score in class
        similarity_scores[class_name] = max_similarity_score

    sorted_similarity = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)
    top_5_similar_classes = sorted_similarity[:5]

    return top_5_similar_classes

# Load embeddings from CSV folder
csv_folder_path = '/content/ResNet50_FT_model_v1_640'
all_embeddings = load_embeddings_from_csv(csv_folder_path)

# Image to test
new_image_path = '/content/cropped_11_0_0.jpg'

# Find top 5 most similar classes
similar_classes = find_most_similar_classes(new_image_path, all_embeddings)

# Print results
print("Top 5 most similar classes:")
for rank, (class_name, similarity_score) in enumerate(similar_classes, start=1):
    print(f"{rank}. Class: {class_name}, Similarity Score: {similarity_score}")



Top 5 most similar classes:
1. Class: Novonorm2mg, Similarity Score: 0.9064162646280978
2. Class: Novonorm1mg, Similarity Score: 0.8640252960489345
3. Class: BlopressPlus8mg, Similarity Score: 0.8330473335570844
4. Class: Forxiga10mg, Similarity Score: 0.8227136119137224
5. Class: MetoprololStada100mg, Similarity Score: 0.8203953731375324


In [None]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import matplotlib.pyplot as plt

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a feature extractor model
class FeatureExtractor(nn.Module):
    def __init__(self, base_model):
        super(FeatureExtractor, self).__init__()
        self.base_model = base_model
        self.base_model.fc = nn.Identity()  # Remove the final classification layer

    def forward(self, x):
        return self.base_model(x)

# Initialize ResNet-50 model and feature extractor
resnet50_model = models.resnet50(pretrained=False)
num_classes = 50  # Replace with the actual number of classes from your training
resnet50_model.fc = nn.Linear(resnet50_model.fc.in_features, num_classes)  # Replace the final layer with your trained layer
model_path = '/content/drive/MyDrive/drug/trained_resnet50_epoch_24.pth'
resnet50_model.load_state_dict(torch.load(model_path))
feature_extractor = FeatureExtractor(resnet50_model).to(device)
feature_extractor.eval()  # Set the model to evaluation mode

# Define the image transformation
transform = transforms.Compose([
    transforms.Resize((640, 640)),
    transforms.ToTensor(),
])

def image_embedding(path):
    img = Image.open(path).convert('RGB')  # Convert image to RGB
    img = transform(img).unsqueeze(0).to(device)  # Apply transformation and add batch dimension

    with torch.no_grad():
        # Extract features from the penultimate layer
        features = feature_extractor(img)
        avg_embedding = features.squeeze().cpu().numpy()  # Convert to numpy array and remove batch dimension

    curr_df = pd.DataFrame(avg_embedding).T
    return curr_df

def load_embeddings_from_csv(csv_folder):
    all_embeddings = {}
    for csv_file in os.listdir(csv_folder):
        if csv_file.endswith('.csv'):
            class_name = os.path.splitext(csv_file)[0]  # Use CSV file name as class name
            csv_path = os.path.join(csv_folder, csv_file)
            df = pd.read_csv(csv_path)
            # Drop ID column and keep only class embeddings
            embeddings_no_id = df.drop(['ID'], axis=1)
            all_embeddings[class_name] = embeddings_no_id
    return all_embeddings

def find_most_similar_classes(new_image_path, all_embeddings):
    new_embedding = image_embedding(new_image_path)
    similarity_scores = {}

    for class_name, embeddings_df in all_embeddings.items():
        # Compute similarity between new image embedding and class embeddings
        similarity_score = cosine_similarity(new_embedding, embeddings_df)
        max_similarity_score = similarity_score.max()  # Get highest similarity score in class
        similarity_scores[class_name] = max_similarity_score

    sorted_similarity = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)
    top_5_similar_classes = sorted_similarity[:5]

    return top_5_similar_classes

def display_image_with_similarities(image_path, similar_classes):
    img = Image.open(image_path)
    plt.imshow(img)
    plt.axis('off')
    plt.title("Top 5 most similar classes:")
    plt.show()

    for rank, (class_name, similarity_score) in enumerate(similar_classes, start=1):
        print(f"{rank}. Class: {class_name}, Similarity Score: {similarity_score}")
    print("\n")

# Load embeddings from CSV folder
csv_folder_path = '/content/ResNet50_FT_model_v1_640'
all_embeddings = load_embeddings_from_csv(csv_folder_path)

# Folder containing new images to test
new_images_folder = '/content/test'

# Loop through all images in the folder
for image_filename in os.listdir(new_images_folder):
    new_image_path = os.path.join(new_images_folder, image_filename)
    if new_image_path.endswith(('.jpg', '.jpeg', '.png')):
        similar_classes = find_most_similar_classes(new_image_path, all_embeddings)

        # Display image and results
        display_image_with_similarities(new_image_path, similar_classes)